In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import scipy.cluster.hierarchy as sch # to build the dendrogram and build the plotting
from sklearn.cluster import AgglomerativeClustering,KMeans,DBSCAN

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score


# Notebook-wide plotting defaults: 15x10 inch figures rendered at 300 dpi.
plt.rcParams['figure.figsize']=(15,10)
plt.rcParams['figure.dpi']=300
%matplotlib inline
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn/scikit-learn deprecation warnings.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset from the Excel workbook (path is relative to the working directory)
# and display it.
df=pd.read_excel("World_development_mesurement.xlsx")
df

# **Exploratory Data Analysis**

In [None]:
# Strip "$" and thousands separators as literal text ("$" is a regex anchor), then parse as float.
df["GDP"]=df["GDP"].str.replace("$","",regex=False).str.replace(",","",regex=False).astype(float)

In [None]:
# Strip "$" and thousands separators as literal text ("$" is a regex anchor), then parse as float.
df["Health Exp/Capita"]=df["Health Exp/Capita"].str.replace("$","",regex=False).str.replace(",","",regex=False).astype(float)

In [None]:
# Strip "$" and thousands separators as literal text ("$" is a regex anchor), then parse as float.
df["Tourism Inbound"]=df["Tourism Inbound"].str.replace("$","",regex=False).str.replace(",","",regex=False).astype(float)

In [None]:
# Strip "$" and thousands separators as literal text ("$" is a regex anchor), then parse as float.
df["Tourism Outbound"]=df["Tourism Outbound"].str.replace("$","",regex=False).str.replace(",","",regex=False).astype(float)

In [None]:
# Strip the "%" sign and thousands separators as literal text, then parse as float.
df['Business Tax Rate'] = df['Business Tax Rate'].str.replace('%', '', regex=False).str.replace(',', '', regex=False).astype(float)

In [None]:
df

In [None]:
df.describe()

In [None]:
df.describe(include=object)

In [None]:
# Replace each country name with the ordinal code produced by the encoder, so the
# whole frame becomes numeric. The fitted encoder stays available as `OR`.
# NOTE(review): the code column is later standardized and passed to PCA/clustering
# together with the indicators — confirm that using it as a feature is intended.
OR=OrdinalEncoder()
df["Country"]=OR.fit_transform(df[["Country"]])

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.dtypes


In [None]:
df["Business Tax Rate"].value_counts()

In [None]:
df["GDP"].value_counts()

In [None]:
df["GDP"].unique()

In [None]:
df["Tourism Inbound"].value_counts()

In [None]:
df["Tourism Outbound"].value_counts()

In [None]:
df["Health Exp % GDP"].value_counts()

In [None]:
df.duplicated().sum()

In [None]:
df.isnull().sum()

In [None]:
# Percentage of missing values per column, labelled with the column names.
df.isna().mean().mul(100)

In [None]:
# Remove the two columns that are not used in the analysis.
df=df.drop(columns=["Ease of Business","Number of Records"])

In [None]:
df

In [None]:
# Visual map of missing cells (True = missing) across the whole frame.
missing_mask=df.isna()
sns.heatmap(missing_mask,cmap="Blues")
plt.tight_layout()

In [None]:
df.isnull().sum()

In [None]:
# Percentage of missing values per column, labelled with the column names.
df.isna().mean().mul(100)

Before replacing the missing values, check each column for outliers:



*   For numerical columns that have outliers, replace missing values with the median.
*   For numerical columns that do not have outliers, replace missing values with the mean.
*   For categorical columns, replace missing values with the mode.






In [None]:
# One boxplot per numeric column, used to spot outliers before imputing.
numeric_columns=df.select_dtypes(include=np.number).columns
for column in numeric_columns:
  values=df[column]
  sns.boxplot(y=values, palette="rainbow")
  plt.title(f"Boxplot of {column}")
  plt.tight_layout()
  plt.show()

In [None]:
# One histogram per numeric column. "teal" is a colour, not a palette; `palette`
# is ignored without a `hue` variable, so pass it as `color`.
for column in df.select_dtypes(include=np.number).columns:
  sns.histplot(x=df[column], color="teal")
  plt.title(f"histogram of {column}")
  plt.tight_layout()
  plt.show()

In [None]:
# Grid of histograms for all numeric columns (20 bins each).
df.hist(bins=20,color="skyblue",figsize=(15,10))
plt.tight_layout()
plt.show()


In [None]:
df.columns


In [None]:
# Impute missing values column by column: the median for the columns listed in
# IMPUTE_WITH_MEDIAN, the mean for those in IMPUTE_WITH_MEAN.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas deprecates and which does not update `df`
# under Copy-on-Write.
IMPUTE_WITH_MEDIAN=[
  "Business Tax Rate",
  "CO2 Emissions",
  "Days to Start Business",
  "Energy Usage",
  "GDP",
  "Health Exp % GDP",
  "Health Exp/Capita",
  "Hours to do Tax",
  "Infant Mortality Rate",
  "Lending Interest",
  "Life Expectancy Female",
  "Life Expectancy Male",
  "Mobile Phone Usage",
  "Population 15-64",
  "Population 65+",
  "Tourism Inbound",
  "Tourism Outbound",
]
IMPUTE_WITH_MEAN=["Internet Usage","Population 0-14"]

for impute_col in IMPUTE_WITH_MEDIAN:
  df[impute_col]=df[impute_col].fillna(df[impute_col].median())

for impute_col in IMPUTE_WITH_MEAN:
  df[impute_col]=df[impute_col].fillna(df[impute_col].mean())


In [None]:
df["Country"].value_counts()

In [None]:
df.isna().sum()

In [None]:
# Drop the rows that still contain missing values and renumber the index 0..n-1,
# so frames built later from NumPy arrays (scaled data, cluster labels) line up
# with this frame when pandas aligns on index.
df=df.dropna().reset_index(drop=True)

In [None]:
df.isna().sum()

Treating the outliers

  Capping: replacing outlier values with a boundary value is called capping.

  In capping, every outlier value is replaced by the upper extreme or the lower extreme.

  Outlier detection: a user-defined function calculates the upper extreme and the lower extreme.

In [None]:
df.describe()

In [None]:
# Side-by-side boxplots of every column before capping.
sns.boxplot(data=df,palette="rainbow")
plt.tight_layout()

In [None]:
# One boxplot per numeric column after imputation, before capping.
numeric_columns=df.select_dtypes(include=np.number).columns
for column in numeric_columns:
  values=df[column]
  sns.boxplot(y=values)
  plt.title(f"Boxplot of {column}")
  plt.tight_layout()
  plt.show()

In [None]:
# Grid of histograms for all numeric columns after imputation (20 bins each).
df.hist(bins=20,color="red",figsize=(15,10))
plt.tight_layout()
plt.show()

In [None]:
def outlier_detection(data,colname):
  """Return the IQR outlier fences and quartiles of one column.

  Parameters
  ----------
  data : object indexable by `colname` that yields something with `.quantile`
      (a pandas DataFrame in this notebook).
  colname : label of the column to analyse.

  Returns
  -------
  tuple
      (lower fence, upper fence, first quartile, third quartile), where the
      fences lie 1.5 * IQR below the first and above the third quartile.
  """
  series=data[colname]
  first_quartile,third_quartile=(series.quantile(level) for level in (0.25,0.75))
  whisker=1.5*(third_quartile-first_quartile)
  return first_quartile-whisker,third_quartile+whisker,first_quartile,third_quartile


In [None]:
# Print the IQR fences and quartiles of every numeric column, labelled with the
# column name so each line can be matched to its column.
for column in df.select_dtypes(include=np.number).columns:
  fence_low,fence_high,quartile_1,quartile_3=outlier_detection(df,column)
  print(f"{column}: lower={fence_low}, upper={fence_high}, q1={quartile_1}, q3={quartile_3}")

In [None]:
df.columns


In [None]:
# Capping limits per column as (lower, upper); None leaves that side untouched.
# NOTE(review): the limits are hard-coded, presumably copied from the fences
# printed by outlier_detection — re-derive them if the input data changes.
CAP_LIMITS={
  "Business Tax Rate":(34.825,47.825),
  "CO2 Emissions":(None,113119.0),
  "Days to Start Business":(None,57.5),
  "Energy Usage":(None,52732.5),
  "GDP":(None,248573930319.5),
  "Health Exp % GDP":(None,0.122),
  "Health Exp/Capita":(None,1332.0),
  "Hours to do Tax":(164.0,324.0),
  "Infant Mortality Rate":(None,0.11649999999999999),
  "Lending Interest":(None,0.23725),
  "Life Expectancy Female":(41.5,None),
  "Life Expectancy Male":(41.75,None),
  "Mobile Phone Usage":(None,2.1),
  "Population 15-64":(None,0.846000000000000),
  "Population 65+":(None,0.22399999999999998),
  "Population Total":(None,51680541.25),
  "Tourism Inbound":(None,7183500000.0),
  "Tourism Outbound":(None,4514500000.0),
}

for cap_col,(cap_low,cap_high) in CAP_LIMITS.items():
  # Values beyond a limit are replaced by the limit itself.
  if cap_high is not None:
    df.loc[df[cap_col]>cap_high,cap_col]=cap_high
  if cap_low is not None:
    df.loc[df[cap_col]<cap_low,cap_col]=cap_low


In [None]:
# Keep an unscaled copy of the cleaned, capped data; the cluster labels are
# attached to copies of `Data` further down.
Data=df.copy()
df

In [None]:
df.describe()

In [None]:

# Side-by-side boxplots of every column after capping.
sns.boxplot(data=df,palette="coolwarm")
plt.tight_layout()

In [None]:
# One boxplot per numeric column after capping.
numeric_columns=df.select_dtypes(include=np.number).columns
for column in numeric_columns:
  values=df[column]
  sns.boxplot(y=values)
  plt.title(f"Boxplot of {column}")
  plt.tight_layout()
  plt.show()

In [None]:
# Grid of histograms for all numeric columns after capping.
df.hist(edgecolor="black",color="yellow")
plt.tight_layout()

In [None]:

# Distribution (histogram + KDE) of eleven indicators on a 3x4 grid.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# replacement, and the loop replaces eleven copy-pasted subplot stanzas.
DIST_COLUMNS=[
  "Birth Rate",
  "Business Tax Rate",
  "CO2 Emissions",
  "Days to Start Business",
  "Energy Usage",
  "GDP",
  "Health Exp % GDP",
  "Health Exp/Capita",
  "Hours to do Tax",
  "Infant Mortality Rate",
  "Internet Usage",
]
for panel_number,dist_col in enumerate(DIST_COLUMNS,start=1):
  plt.subplot(3, 4, panel_number)
  sns.histplot(df[dist_col],kde=True,stat="density",kde_kws=dict(cut=3))
# Lay the panels out once, after all of them exist.
plt.tight_layout()

In [None]:
# Number of rows per (ordinal-encoded) country.
country_axes=sns.countplot(data=df,x='Country',palette="coolwarm")
country_axes.set_title('Countries')
country_axes.set_xlabel('Country')
country_axes.set_ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Number of rows per female life-expectancy value (title spelling corrected).
sns.countplot(x='Life Expectancy Female', data=df,palette="viridis")
plt.title('Life Expectancy of Females')
plt.xlabel('AGE')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Number of rows per male life-expectancy value (title spelling corrected).
sns.countplot(x='Life Expectancy Male', data=df,palette="rainbow")
plt.title('Life Expectancy of Males')
plt.xlabel('AGE')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(df)
plt.tight_layout()

In [None]:
# Top-5 countries by summed value for each indicator, one horizontal bar chart
# per indicator. "Country" holds ordinal codes at this point, so the codes are
# decoded with the fitted encoder `OR` to label the bars with country names.
columns = [
    'GDP',
    'Birth Rate',
    'Business Tax Rate',
    'CO2 Emissions',
    'Energy Usage',
    'Internet Usage',
    'Lending Interest',
    'Mobile Phone Usage'
]
plt.figure(figsize=(15, 30))
for i, column in enumerate(columns):
    plt.subplot(len(columns), 1, i + 1)
    grouped_data = df.groupby('Country')[column].sum()
    top5 = grouped_data.nlargest(5)
    # inverse_transform expects a 2-D (n_samples, 1) array of codes.
    country_names = OR.inverse_transform(top5.index.to_numpy().reshape(-1, 1)).ravel()
    plt.barh(country_names, top5.values, color='skyblue', edgecolor='black')
    plt.title(f'Top 5 Countries by {column}', fontsize=16)
    plt.xlabel(column, fontsize=12)
    plt.ylabel('Country', fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
# Share of rows per distinct birth-rate value.
category_counts = df['Birth Rate'].value_counts()
slice_labels = category_counts.index
plt.pie(category_counts, autopct='%1.1f%%', labels=slice_labels)
plt.title('Birth Rate')
plt.tight_layout()
plt.show()

In [None]:
# Urban population by row position. The misspelled `palette="vividis"` is
# removed: a palette is ignored when no `hue` variable is given.
sns.lineplot(x=df.index,y=df["Population Urban"])
plt.tight_layout()

In [None]:
# Lending interest against total population, coloured by days to start a business.
plt.figure(facecolor="blue",edgecolor="red",figsize=(10,8))
sns.scatterplot(
  data=df,
  x="Population Total",
  y='Lending Interest',
  hue="Days to Start Business",
  palette="rainbow",
)
plt.tight_layout()

In [None]:
df["Infant Mortality Rate"].plot(kind="line")

In [None]:
# Energy usage against CO2 emissions, coloured by birth rate.
plt.figure(facecolor="teal",edgecolor="red",figsize=(10,8))
sns.scatterplot(
  data=df,
  x="CO2 Emissions",
  y='Energy Usage',
  hue="Birth Rate",
  palette="rainbow",
)
plt.tight_layout()

In [None]:
# Hours to do tax against health expenditure share, coloured by internet usage.
sns.swarmplot(
  data=df,
  x="Health Exp % GDP",
  y="Hours to do Tax",
  hue="Internet Usage",
  size=6,
)
plt.tight_layout()

In [None]:
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
correlation_matrix=df.corr(numeric_only=True)
sns.heatmap(correlation_matrix,cmap="viridis",annot=True)
plt.tight_layout()

#### **Positive Correlations:**
1. **Life Expectancy Female ↔ Life Expectancy Male (0.98):** These two are nearly perfectly correlated, meaning when life expectancy for females increases, it also increases for males.
2. **Population Total ↔ Population Urban (0.93):** A strong correlation shows that as total population increases, urban population also grows significantly.
3. **GDP ↔ Energy Usage (0.74):** A positive correlation suggests that countries with higher GDPs consume more energy.
4. **Health Exp/Capita ↔ GDP (0.61):** Wealthier countries (higher GDP) tend to spend more on health per capita, showcasing a direct relationship between economic performance and investment in health.
#### **Negative Correlations:**
1. **Birth Rate ↔ Life Expectancy Female (-0.87):** Countries with higher birth rates have lower female life expectancy.
2. **Infant Mortality Rate ↔ Life Expectancy Male (-0.82):** High infant mortality reflects poorer health outcomes.
3. **Population 0-14 ↔ Population 15-64 (-0.77):** A significant negative correlation means that a higher proportion of young people often corresponds to a smaller working-age population.
4. **Tourism Inbound ↔ Tourism Outbound (-0.54):** This suggests that countries with more inbound tourism (foreign visitors) tend to have fewer outbound tourists (local citizens traveling abroad), and vice versa.

#### **Moderate Correlations:**
1. **Mobile Phone Usage ↔ Internet Usage (0.53):** A moderate positive correlation shows that higher mobile phone usage generally coincides with increased internet penetration.
2. **Lending Interest ↔ GDP (-0.39):** As lending interest rates rise, GDP tends to drop slightly.

In [None]:
# Standardize every column and rebuild the frame with the original column names.
# The rebuilt frame gets a fresh 0..n-1 index.
Sc=StandardScaler()
df1=Sc.fit_transform(df)
scaled_column_names=df.columns
df=pd.DataFrame(data=df1,columns=scaled_column_names)
df


In [None]:
df.describe()

# **Feature Engineering**

## **PCA**

In [None]:
# Fit a PCA that keeps all components, to inspect how the variance is
# distributed before choosing how many components to keep.
pca=PCA()
pca_components=pca.fit_transform(df)

In [None]:
pca_components

In [None]:
pca.explained_variance_

In [None]:
var=pca.explained_variance_ratio_
var

In [None]:
var1=np.cumsum(np.round(var,4)*100)
var1

In [None]:
# Refit PCA keeping as many components as needed to explain 97% of the variance.
# `pca` now refers to this reduced model; `pca_components` from the earlier
# cell is left unchanged.
pca = PCA(n_components=0.97)  # Retain 97% of the variance
principal_components = pca.fit_transform(df)

In [None]:
# Show the projection from the 97%-variance PCA fitted in the previous cell
# (`pca_components` still holds the earlier full-rank projection).
principal_components

In [None]:
pca.explained_variance_

In [None]:
var=pca.explained_variance_ratio_
var

In [None]:
var1=np.cumsum(np.round(var,4)*100)
var1

In [None]:
# Cumulative explained variance against the number of components kept.
# The x values start at 1 so the axis really is a component count, not a
# 0-based array index.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--', color='b')
plt.title('Explained Variance by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# Wrap the retained principal components in a DataFrame with columns PC1..PCn.
component_total = principal_components.shape[1]
pc_labels = [f'PC{number}' for number in range(1, component_total + 1)]
pca_df = pd.DataFrame(principal_components, columns=pc_labels)


In [None]:
pca_df

In [None]:
# Share of the total variance explained by each retained component.
explained_variance = pca.explained_variance_ratio_
variance_report = f'Explained Variance Ratio by Component: {explained_variance}'
print(variance_report)

In [None]:
# Rows projected onto the first two principal components.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2')
plt.title('Data Visualization on First Two Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

In [None]:
sns.pairplot(pca_df,diag_kind="kde",palette="viridis")

## **t-SNE**

In [None]:
# 2-D t-SNE embedding of the standardized data.
#   n_components=2  : reduce the data to 2D.
#   perplexity=30   : balances local vs. global structure in the data.
#   random_state=42 : for reproducibility.
# The iteration budget is left at the library default of 1000. The original
# `n_iter=1000` keyword was renamed to `max_iter` in newer scikit-learn
# releases, so omitting it keeps the cell working across versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_results = tsne.fit_transform(df)
tsne_results

In [None]:
tsne_df = pd.DataFrame(data=tsne_results, columns=['Dim1', 'Dim2'])
tsne_df

In [None]:
# Rows in the two t-SNE dimensions.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=tsne_df, x='Dim1', y='Dim2', s=60, palette='viridis')
plt.title('t-SNE Visualization of Data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()

In [None]:
sns.pairplot(tsne_df,plot_kws={'color': 'teal'})

# **Model Building**

### **Hierarchical Clustering**

In [None]:
# Complete-linkage dendrogram of the principal components.
plt.figure(figsize=(12, 8))
linkage_matrix = sch.linkage(pca_df, method='complete')
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Data Points')
plt.ylabel('Euclidean Distance')
plt.grid(True)
plt.show()

In [None]:
# Build the agglomerative model with 4 clusters (the cluster count is taken as
# a given requirement here), Euclidean distance and complete linkage.
hc=AgglomerativeClustering(n_clusters=4,metric="euclidean",linkage="complete")
hc

In [None]:
y_hc=hc.fit_predict(pca_df)
y_hc

In [None]:
clusters1=pd.DataFrame(y_hc,columns=["H_Clusters"])
clusters1

In [None]:
# Hierarchical clusters on PC1/PC2, one scatter series per cluster.
plt.figure(figsize=(8, 6))
pca_df['Cluster'] = clusters1['H_Clusters']  # later cells colour by this column
for cluster, members in pca_df.groupby('Cluster'):
    plt.scatter(members['PC1'], members['PC2'], label=f'Cluster {cluster + 1}')
plt.title('Clusters Visualized Using First 2 Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

In [None]:


# Hierarchical clusters on PC3/PC4, one scatter series per cluster.
# The title now names the components actually plotted.
plt.figure(figsize=(8, 6))
pca_df['Cluster'] = clusters1['H_Clusters']
for cluster in np.unique(clusters1):
    plt.scatter(pca_df[pca_df['Cluster'] == cluster]['PC3'],
                pca_df[pca_df['Cluster'] == cluster]['PC4'],
                label=f'Cluster {cluster + 1}')
plt.title('Clusters Visualized Using Principal Components 3 and 4')
plt.xlabel('Principal Component 3')
plt.ylabel('Principal Component 4')
plt.legend()
plt.show()

In [None]:
sns.pairplot(pca_df,hue="Cluster",palette="rainbow")

In [None]:
# Work on a copy of the cleaned, unscaled data so `Data` stays untouched.
h_data=Data.copy()
h_data

In [None]:
# Attach the labels by position. `clusters1` carries a fresh 0..n-1 index while
# `h_data` keeps the row labels of the cleaned frame, so assigning the DataFrame
# directly would align on index and misplace (or blank out) labels.
h_data["H_Clusters"]=clusters1["H_Clusters"].to_numpy()
h_data

In [None]:
h_data[h_data["H_Clusters"]==0]

In [None]:
h_data[h_data["H_Clusters"]==1]

In [None]:
h_data[h_data["H_Clusters"]==2]

In [None]:
h_data[h_data["H_Clusters"]==3]

### **K-means Clustering**

In [None]:
# Within-cluster sum of squares (inertia) for K = 1..10, for the elbow plot.
# `pca_df` may already carry a 'Cluster' label column added by a plotting cell;
# it is excluded so only the principal components are clustered.
elbow_features=pca_df.drop(columns=["Cluster"],errors="ignore")
WCSS=[]
for i in range(1,11):
  kmeans=KMeans(n_clusters=i,init="k-means++",max_iter=300,random_state=0)  # max_iter=300 is the default
  kmeans.fit(elbow_features)
  WCSS.append(kmeans.inertia_)  # inertia_ is scikit-learn's name for WCSS

In [None]:
# Elbow curve: inertia against the number of clusters tried (1..10).
cluster_counts = range(1,11)
plt.figure(figsize=(8, 6))
plt.plot(cluster_counts, WCSS, color='r', linestyle='--', marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (Within-Cluster Sum of Squares)')
plt.grid(True)
plt.show()

In [None]:
# Final K-means model with 4 clusters, fitted on the principal components only.
# The 'Cluster' label column that plotting cells add to `pca_df` is excluded so
# earlier labels do not leak into the features.
kmeans_features=pca_df.drop(columns=["Cluster"],errors="ignore")
kmeans=KMeans(n_clusters=4,random_state=0)
y_K=kmeans.fit_predict(kmeans_features)
y_K

In [None]:
clusters2=pd.DataFrame(y_K,columns=["K_Clusters"])
clusters2

In [None]:
# K-means clusters on PC1/PC2, coloured by cluster label.
plt.figure(figsize=(8, 6))
pca_df['Cluster'] = clusters2['K_Clusters']  # replaces the hierarchical labels
sns.scatterplot(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    hue=pca_df['Cluster'],
    s=60,
    palette='viridis',
)
plt.title('Clusters Visualized Using First 2 Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.show()

In [None]:
sns.pairplot(pca_df,hue="Cluster",palette="viridis")

In [None]:
# Work on a copy of the cleaned, unscaled data so `Data` stays untouched.
K_data=Data.copy()
K_data



In [None]:
# Attach the labels by position. `clusters2` carries a fresh 0..n-1 index while
# `K_data` keeps the row labels of the cleaned frame, so assigning the DataFrame
# directly would align on index and misplace (or blank out) labels.
K_data["K_Clusters"]=clusters2["K_Clusters"].to_numpy()
K_data

In [None]:
K_data[K_data["K_Clusters"]==0]

In [None]:
K_data[K_data["K_Clusters"]==1]

In [None]:
K_data[K_data["K_Clusters"]==2]

In [None]:
K_data[K_data["K_Clusters"]==3]

### **DBSCAN Clustering**

In [None]:
# First DBSCAN attempt with the default radius (eps=0.5) and min_samples=3.
# The 'Cluster' label column that plotting cells add to `pca_df` is excluded so
# only the principal components are clustered.
dbscan_features=pca_df.drop(columns=["Cluster"],errors="ignore")
db=DBSCAN(eps=0.5,min_samples=3)
y_db=db.fit_predict(dbscan_features)
y_db

In [None]:
plt.scatter(pca_df.iloc[:,3],pca_df.iloc[:,4],c=y_db,cmap="rainbow")

In [None]:
# Distances to the k nearest neighbours of every row, used to pick eps.
# Because the query set is the training set, each row's nearest "neighbour" is
# itself at distance 0.
# NOTE(review): k is 5 here while the tuned DBSCAN below uses min_samples=10 —
# confirm which value is intended.
k = 5
# Exclude the 'Cluster' label column that plotting cells add to `pca_df`.
knn_features = pca_df.drop(columns=["Cluster"], errors="ignore")
neighbors = NearestNeighbors(n_neighbors=k)
neighbors_fit = neighbors.fit(knn_features)
distances, indices = neighbors_fit.kneighbors(knn_features)

In [None]:
# K-distance graph: the sorted distance to the k-th neighbour of each row.
# Only the last column (the k-th neighbour) is plotted, matching the y label;
# plotting the whole array would draw one curve per neighbour rank.
distances = np.sort(distances, axis=0)
plt.figure(figsize=(6, 4))
plt.plot(distances[:, -1])
plt.axhline(y=3,linestyle="--",color="red")  # candidate eps
plt.title('K-Distance Graph')
plt.xlabel('Data Points (sorted)')
plt.ylabel(f'{k}-th Nearest Neighbor Distance')
plt.show()

In [None]:
# DBSCAN with eps=3 and min_samples=10.
# The 'Cluster' label column that plotting cells add to `pca_df` is excluded so
# only the principal components are clustered.
dbscan_features=pca_df.drop(columns=["Cluster"],errors="ignore")
db=DBSCAN(eps=3,min_samples=10)
y_db=db.fit_predict(dbscan_features)
y_db

In [None]:
# DBSCAN labels on the 4th and 5th columns of pca_df (positional selection).
# NOTE(review): presumably PC4 and PC5 — if fewer than five components were
# retained, position 4 would be the 'Cluster' column; confirm.
plt.figure(figsize=(5,4))
plt.scatter(pca_df.iloc[:,3],pca_df.iloc[:,4],c=y_db,cmap="rainbow")
plt.tight_layout()

In [None]:
clusters3=pd.DataFrame(y_db,columns=["DB_Clusters"])
clusters3

In [None]:
# Work on a copy of the cleaned, unscaled data so `Data` stays untouched.
db_data=Data.copy()
db_data

In [None]:
# Attach the labels by position. `clusters3` carries a fresh 0..n-1 index while
# `db_data` keeps the row labels of the cleaned frame, so assigning the DataFrame
# directly would align on index and misplace (or blank out) labels.
db_data["DB_Clusters"]=clusters3["DB_Clusters"].to_numpy()
db_data

In [None]:
db_data[db_data["DB_Clusters"]==0]

In [None]:
db_data[db_data["DB_Clusters"]==-1]

## **Model Evaluation**

In [None]:

# Silhouette scores of the three clusterings, computed on the principal
# components only. `pca_df` carries a 'Cluster' label column by this point;
# scoring with it included would bias the result towards whichever labels it holds.
score_features = pca_df.drop(columns=["Cluster"], errors="ignore")

silhouette_avg_hc = silhouette_score(score_features, y_hc)
print(f"Silhouette Score (Hierarchical Clustering): {silhouette_avg_hc}")

silhouette_avg_kmeans = silhouette_score(score_features, y_K)
print(f"Silhouette Score (K-means Clustering): {silhouette_avg_kmeans}")

# silhouette_score raises when there are fewer than two distinct labels, which
# DBSCAN can produce. Noise points (label -1) are counted as a group of their own.
if np.unique(y_db).size >= 2:
  silhouette_avg_dbscan = silhouette_score(score_features, y_db)
  print(f"Silhouette Score (DBSCAN Clustering): {silhouette_avg_dbscan}")
else:
  silhouette_avg_dbscan = np.nan
  print("Silhouette Score (DBSCAN Clustering): undefined (fewer than two distinct labels)")

In [None]:
import pickle

In [None]:
# Persist the fitted K-means model; the context manager closes the file handle
# even if serialization fails.
with open("kmeans.pkl","wb") as model_file:
  pickle.dump(kmeans,model_file)