# City Development Index

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Importing the Data

In [None]:
# Load the City Development Index dataset and preview its first rows.
csv_path = '../input/city-development-index-cdi-and-components/City Development Index.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print each column's dtype and non-null count to spot missing data.
df.info()

In [None]:
# Discard every column that contains at least one missing value, then drop
# the first of the remaining columns by position.
df = df.dropna(axis='columns').iloc[:, 1:]

In [None]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()

In [None]:
# Preview the first rows after the column clean-up.
df.head()

In [None]:
# Pairwise correlations between the numeric columns, ordered by their
# correlation with CDI. Text columns are filtered out explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
df.select_dtypes(include='number').corr().sort_values('CDI')

## Data Visualization

In [None]:
# Annotated correlation heatmap. Only numeric columns are correlated:
# DataFrame.corr() raises on text columns in pandas >= 2.0.
plt.figure(figsize=[10, 5])
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# Pie chart of how many rows fall into each Region, labelled in whole percent.
region_counts = df['Region'].value_counts()
plt.figure(figsize=(7, 7))
region_counts.plot(kind='pie', autopct='%.f%%');

In [None]:
# Box-and-whisker plot of df's columns, drawn on the axes of a wide figure.
canvas = plt.figure(figsize=(12, 5))
df.boxplot(ax=canvas.gca());

In [None]:
# Grid of pairwise relationships between the variables in df.
sns.pairplot(df);

In [None]:
# Scatter every numeric indicator against CDI, one subplot per indicator.
# Columns are chosen by dtype instead of by a positional offset, so the grid
# has no blank row and does not depend on column order. CDI itself is left
# out because the title promises "other variables".
predictors = df.select_dtypes(include='number').columns.drop('CDI')
grid_cols = 3
grid_rows = -(-len(predictors) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(predictors, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.scatterplot(x=column, y='CDI', data=df)
    plt.xticks(size=12)
    plt.ylabel('CDI', fontsize=12)
    plt.title('Explanation of CDI by other variables', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

### Values by Region

In [None]:
# Bar plot of every numeric indicator per Region, one subplot per indicator.
# Columns are chosen by dtype instead of by a positional offset, so the grid
# has no blank row and does not depend on column order.
numeric_cols = df.select_dtypes(include='number').columns
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.barplot(x='Region', y=column, data=df)
    plt.xticks(rotation=45, size=12)
    plt.ylabel(column, fontsize=12)
    plt.title('Values by Region', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

In [None]:
# Box plot of every numeric indicator per Region, one subplot per indicator.
# Columns are chosen by dtype instead of by a positional offset, so the grid
# has no blank row and does not depend on column order.
numeric_cols = df.select_dtypes(include='number').columns
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.boxplot(x='Region', y=column, data=df)
    plt.xticks(rotation=45, size=12)
    plt.ylabel(column, fontsize=12)
    plt.title('Distribution of Values by Regions', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

### Ranking of Countries

In [None]:
# For every numeric indicator, plot the ten countries with the highest
# per-country mean. Columns are chosen by dtype instead of by a positional
# offset, so the grid has no blank row and does not depend on column order.
numeric_cols = df.select_dtypes(include='number').columns
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    country_means = df.groupby('Country')[column].mean()
    country_means.sort_values(ascending=False).head(10).plot(kind='bar')
    plt.xticks(rotation=45, size=10)
    plt.ylabel(column, fontsize=12)
    plt.title('Countries with the highest values', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

In [None]:
# For every numeric indicator, plot the ten countries with the lowest
# per-country mean (still shown in descending order). Columns are chosen by
# dtype instead of by a positional offset, so the grid has no blank row and
# does not depend on column order.
numeric_cols = df.select_dtypes(include='number').columns
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    country_means = df.groupby('Country')[column].mean()
    country_means.sort_values(ascending=False).tail(10).plot(kind='bar')
    plt.xticks(rotation=45, size=10)
    plt.ylabel(column, fontsize=12)
    plt.title('Countries with the lowest values', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

### Ranking of Cities

In [None]:
# For every numeric indicator, plot the ten cities with the highest
# per-city mean. Columns are chosen by dtype instead of by a positional
# offset, so the grid has no blank row and does not depend on column order.
numeric_cols = df.select_dtypes(include='number').columns
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    city_means = df.groupby('City')[column].mean()
    city_means.sort_values(ascending=False).head(10).plot(kind='bar')
    plt.xticks(rotation=45, size=10)
    plt.ylabel(column, fontsize=12)
    plt.title('Cities with the highest values', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

In [None]:
# For every numeric indicator, plot the ten cities with the lowest per-city
# mean (still shown in descending order). Columns are chosen by dtype
# instead of by a positional offset, so the grid has no blank row and does
# not depend on column order.
numeric_cols = df.select_dtypes(include='number').columns
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division

plt.figure(figsize=[16, 12])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    city_means = df.groupby('City')[column].mean()
    city_means.sort_values(ascending=False).tail(10).plot(kind='bar')
    plt.xticks(rotation=45, size=10)
    plt.ylabel(column, fontsize=12)
    plt.title('Cities with the lowest values', size=14)
# Adjust the spacing once, after all subplots exist.
plt.tight_layout()

## Model Selection

In [None]:
# Keep only the numeric columns for modelling. select_dtypes is the public
# pandas API for this; _get_numeric_data is a private helper that may change
# without notice.
df = df.select_dtypes(include='number')

In [None]:
# CDI is the regression target; every remaining column is a feature.
y = df['CDI']
X = df.drop(columns='CDI')

In [None]:
# Install XGBoost, which provides the XGBRegressor imported below.
!pip install xgboost

In [None]:
# Install LightGBM, which provides the LGBMRegressor imported below.
!pip install lightgbm

In [None]:
# Install CatBoost, which provides the CatBoostRegressor imported below.
!pip install catboost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [None]:
# Results collected by ML(): one entry per evaluated algorithm, in call order.
RMSE = []
ModelName = []


def ML(Algorithm, random_state=42):
    """Fit ``Algorithm`` with default settings and record its hold-out RMSE.

    The notebook-global feature matrix ``X`` and target ``y`` are split 80/20,
    the estimator is trained on the larger part and scored on the smaller
    one. The score and the class name are appended to the global ``RMSE``
    and ``ModelName`` lists and also printed.

    Args:
        Algorithm: Estimator class (not an instance) that can be constructed
            without arguments and offers ``fit`` and ``predict``.
        random_state: Seed for the train/test split and, when the estimator
            exposes a ``random_state`` parameter, for the estimator itself.

    Returns:
        The root mean squared error on the held-out data.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    estimator = Algorithm()
    # The split is seeded, so seed stochastic learners as well; otherwise the
    # model ranking changes from one run to the next.
    get_params = getattr(estimator, 'get_params', None)
    if callable(get_params) and 'random_state' in get_params():
        estimator.set_params(random_state=random_state)

    model = estimator.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    RMSE.append(rmse)
    ModelName.append(Algorithm.__name__)
    print(Algorithm.__name__, 'RMSE score:', rmse)
    return rmse

In [None]:
# Regressor classes (not instances) to benchmark, in evaluation order.
models = [
    LinearRegression,
    MLPRegressor,
    DecisionTreeRegressor,
    KNeighborsRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    XGBRegressor,
    LGBMRegressor,
    CatBoostRegressor,
    SVR,
]

In [None]:
# Evaluate every candidate regressor on the unscaled features.
for algorithm in models:
    ML(algorithm)

In [None]:
# Display the names of the evaluated models, in evaluation order.
ModelName

In [None]:
# Display the RMSE scores, in the same order as ModelName.
RMSE

In [None]:
# Pair each model name with its RMSE in one table for plotting.
model_visual = pd.DataFrame(dict(ModelName=ModelName, RMSE=RMSE))

In [None]:
# Horizontal bar chart of the models, ordered from lowest to highest RMSE.
ranked = model_visual.sort_values('RMSE')
plt.figure(figsize=(10, 5))
sns.barplot(data=ranked, x='RMSE', y='ModelName');

## Normalising the Data

In [None]:
# Keep only the numeric columns for modelling. select_dtypes is the public
# pandas API for this; _get_numeric_data is a private helper that may change
# without notice.
df = df.select_dtypes(include='number')

In [None]:
# Rebuild the unscaled feature matrix and the CDI target from df.
y = df['CDI']
X = df.drop(columns='CDI')

In [None]:
# Standardise every feature with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before ML() splits them, so
# statistics of the test rows influence training. Consider fitting it on the
# training split only.
sc = StandardScaler()
# Reuse the frame's own column labels and index instead of a hard-coded name
# list, so the cell keeps working if the feature set changes and X stays
# aligned with y.
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)
X.head()

In [None]:
# Start from empty result lists so the scaled run is recorded separately.
RMSE = []
ModelName = []


def ML(Algorithm, random_state=42):
    """Fit ``Algorithm`` with default settings and record its hold-out RMSE.

    The notebook-global feature matrix ``X`` and target ``y`` are split 80/20,
    the estimator is trained on the larger part and scored on the smaller
    one. The score and the class name are appended to the global ``RMSE``
    and ``ModelName`` lists and also printed.

    Args:
        Algorithm: Estimator class (not an instance) that can be constructed
            without arguments and offers ``fit`` and ``predict``.
        random_state: Seed for the train/test split and, when the estimator
            exposes a ``random_state`` parameter, for the estimator itself.

    Returns:
        The root mean squared error on the held-out data.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    estimator = Algorithm()
    # The split is seeded, so seed stochastic learners as well; otherwise the
    # model ranking changes from one run to the next.
    get_params = getattr(estimator, 'get_params', None)
    if callable(get_params) and 'random_state' in get_params():
        estimator.set_params(random_state=random_state)

    model = estimator.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    RMSE.append(rmse)
    ModelName.append(Algorithm.__name__)
    print(Algorithm.__name__, 'RMSE score:', rmse)
    return rmse

In [None]:
# Evaluate every candidate regressor again, now on the standardised features.
for algorithm in models:
    ML(algorithm)

In [None]:
# Display the RMSE scores of the run on standardised features.
RMSE

In [None]:
# Pair each model name with its RMSE in one table for plotting.
model_visual = pd.DataFrame(dict(ModelName=ModelName, RMSE=RMSE))

In [None]:
# Horizontal bar chart of the models, ordered from lowest to highest RMSE.
ranked = model_visual.sort_values('RMSE')
plt.figure(figsize=(10, 5))
sns.barplot(data=ranked, x='RMSE', y='ModelName');