# Importing libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set_style('darkgrid')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
pd.set_option('display.max_rows', 100)

# Helper methods 


In [9]:
from pycaret.clustering import *
def cluster_impute(df, num_clusters=5):
    """Fill missing values with the mean of the row's hierarchical cluster.

    Zeros (the number ``0`` and the string ``'0'``) are treated as missing
    values. The frame is clustered with PyCaret's ``hclust`` model and every
    missing entry is replaced by the mean of that column within the row's
    cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all passed to PyCaret as numeric features.
    num_clusters : int, default 5
        Number of clusters requested from the ``hclust`` model.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``assign_model`` without its last column.
        An entry stays NaN when its column has no observed value inside the
        row's cluster (the group mean is NaN in that case).
    """
    feature_names = list(df.columns)
    df = df.replace({'0': np.nan, 0: np.nan})
    # setup() is called for its side effect of initialising the PyCaret
    # experiment; its return value is not needed here.
    setup(data=df, numeric_features=feature_names, normalize=True)
    hclust_model = create_model('hclust', num_clusters=num_clusters)
    clustered = assign_model(hclust_model)
    for name in feature_names:
        cluster_means = clustered.groupby('Cluster')[name].transform('mean')
        clustered[name] = clustered[name].fillna(cluster_means)
    # Drop the trailing column (presumably the 'Cluster' label appended by
    # assign_model -- TODO confirm for the PyCaret version in use).
    return clustered.iloc[:, :-1]

In [10]:

def mean_impute(df):
    """Replace every NaN in ``df`` with the mean of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that may contain NaN values.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the same index and column labels as ``df``.
        The index is preserved so that callers do not have to restore the
        row labels by hand afterwards.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_values = imputer.fit_transform(df)
    return pd.DataFrame(imputed_values, index=df.index, columns=df.columns)

In [11]:
def top_3(pca , df):
    """Return the (up to) three features that dominate the first component.

    Parameters
    ----------
    pca : object
        Fitted decomposition exposing ``components_`` with one row per
        component and one column per feature of ``df``.
    df : pandas.DataFrame
        Frame the decomposition was fitted on; only its column labels are
        used.

    Returns
    -------
    list
        Column labels ordered from the largest to the third-largest absolute
        loading on the first component. Fewer than three labels are returned
        when ``df`` has fewer than three columns.
    """
    # Use the ``pca`` argument (not a notebook-level global) and translate
    # feature positions to labels through ``df.columns``.
    loadings = np.abs(np.asarray(pca.components_))
    strongest_first = np.argsort(loadings[0])[::-1][:3]
    return [df.columns[position] for position in strongest_first]

In [12]:

def minmax_scaler(df):
    """Scale every column of ``df`` with a default ``MinMaxScaler``.

    Returns a new DataFrame holding the scaled values under the same index
    and column labels as the input.
    """
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [13]:

def reduce_to_three(df):
    """Project ``df`` onto three principal components.

    Each output column holds the scores of one component and is named after
    the input feature with the largest absolute loading on that component
    (converted with ``str``). The row index of ``df`` is kept.

    NOTE(review): two components can share the same dominant feature, which
    would give the result duplicate column names.
    """
    pca_model = PCA(n_components=3)
    scores = pca_model.fit_transform(df)

    # argsort is ascending, so the last position of each row is the feature
    # with the largest absolute loading on that component.
    ranking = np.argsort(np.abs(pca_model.components_))
    labels = [str(df.columns[ranking[component, -1]]) for component in range(3)]

    return pd.DataFrame(scores, index=df.index, columns=labels)

# Loading data

In [14]:
# One CSV per domain; every frame is indexed by the `code_city` column.
df_tech = pd.read_csv("Copy of ECIU data team E - archi_tech.csv", index_col="code_city")
df_citizen = pd.read_csv("Copy of ECIU data team E - citizen.csv", index_col="code_city")
# The economy and environment sheets have their last seven columns dropped.
df_economy = pd.read_csv("ECIU data team E - economy.csv", index_col="code_city").iloc[:, :-7]
df_envieronment = pd.read_csv("ECIU data team E - environment.csv", index_col="code_city").iloc[:, :-7]
df_government = pd.read_csv("Copy of ECIU data team E - government.csv", index_col="code_city")
df_living = pd.read_csv("Copy of ECIU data team E - living.csv", index_col="code_city")
df_mobility = pd.read_csv("Copy of ECIU data team E - mobility.csv", index_col="code_city")
df_pop = pd.read_csv("Copy of ECIU data team E - Sheet1.csv", index_col="code_city")

In [8]:
# Cluster-based imputation for every domain. Citizen, environment and
# mobility additionally get a column-mean pass, after which their index is
# set to df_tech's index.
# NOTE(review): copying df_tech.index assumes all frames list the cities in
# the same order -- confirm against the CSV files.
df_tech = cluster_impute(df_tech)

df_citizen = mean_impute(cluster_impute(df_citizen))
df_citizen.index = df_tech.index

df_economy = cluster_impute(df_economy)

df_envieronment = mean_impute(cluster_impute(df_envieronment))
df_envieronment.index = df_tech.index

df_government = cluster_impute(df_government)
df_living = cluster_impute(df_living)

df_mobility = mean_impute(cluster_impute(df_mobility))
df_mobility.index = df_tech.index

df_pop = cluster_impute(df_pop)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2853,16.7591,0.9883,0,0,0


In [11]:
# Min-max scale every imputed domain frame, in the same order as before.
(
    df_tech_pca,
    df_citizen_pca,
    df_economy_pca,
    df_envieronment_pca,
    df_government_pca,
    df_living_pca,
    df_mobility_pca,
    df_pop_pca,
) = (
    minmax_scaler(frame)
    for frame in (
        df_tech,
        df_citizen,
        df_economy,
        df_envieronment,
        df_government,
        df_living,
        df_mobility,
        df_pop,
    )
)

In [12]:
# Replace each scaled frame by its three-component PCA projection.
# NOTE(review): this cell overwrites its own inputs, so re-running it without
# re-running the scaling cell would apply PCA to already reduced data.
(
    df_tech_pca,
    df_citizen_pca,
    df_economy_pca,
    df_envieronment_pca,
    df_government_pca,
    df_living_pca,
    df_mobility_pca,
    df_pop_pca,
) = (
    reduce_to_three(scaled)
    for scaled in (
        df_tech_pca,
        df_citizen_pca,
        df_economy_pca,
        df_envieronment_pca,
        df_government_pca,
        df_living_pca,
        df_mobility_pca,
        df_pop_pca,
    )
)
# Only the first two components of the population frame are kept.
df_pop_pca = df_pop_pca.iloc[:, :-1]

In [13]:
# All imputed features side by side (no dimensionality reduction).
full_df_nopca = pd.concat(
    [df_tech, df_citizen, df_economy, df_envieronment,
     df_government, df_living, df_mobility, df_pop],
    axis=1,
)
# The per-domain PCA projections side by side.
full_df_pca = pd.concat(
    [df_tech_pca, df_citizen_pca, df_economy_pca, df_envieronment_pca,
     df_government_pca, df_living_pca, df_mobility_pca, df_pop_pca],
    axis=1,
)
# Independent frame with the same content as full_df_nopca; it receives a
# 'Cluster' column later on.
full_df = full_df_nopca.copy()

# Correlation analysis 

In [14]:
# Boolean matrix: True where the pairwise correlation exceeds 0.8.
true_and_false = full_df_nopca.corr().gt(0.8)

In [15]:
# Show the strong-correlation flags for four selected features.
true_and_false.loc[:, ['online_purchase', 'access', 'consult_voting', 'submit_forms']]


Unnamed: 0,online_purchase,access,consult_voting,submit_forms
access,False,True,False,False
ent_broadband,False,False,False,False
power,False,False,False,False
communication,False,False,False,False
electronics,False,False,False,False
...,...,...,...,...
life_expectancy,False,False,False,False
gdp_city,False,False,False,False
pop_total,False,False,False,False
urban_growth,False,False,False,False


# Clustering 

In [17]:
# Re-import the clustering API: a later cell star-imports
# pycaret.classification, which also provides `setup`/`create_model`, so this
# cell must restore the clustering versions when it is re-run.
from pycaret.clustering import *
# A fixed session_id makes the k-means cluster labels reproducible between
# runs; the downstream cells depend on those labels.
exp_name = setup(data = full_df_pca, session_id = 42)
# NOTE: despite the `knn_` prefix this is a k-means model with 3 clusters.
knn_pca = create_model('kmeans' , num_clusters =3,round = 15 )
knn_df_pca = assign_model(knn_pca)
evaluate_model(knn_pca)


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.243696,12.980034,1.39953,0,0,0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [19]:
# Original (unreduced) values of the features whose names label the PCA
# columns. The explicit copy makes pca_df an independent frame, so adding the
# 'Cluster' column below does not write into a slice of full_df_nopca.
pca_df = full_df_nopca[list(full_df_pca.columns)].copy()
pca_df['Cluster'] = knn_df_pca.Cluster
full_df['Cluster'] = knn_df_pca.Cluster

In [21]:
# Cluster label(s) of the row(s) indexed 'kau'.
kun_clus = pca_df.loc[full_df_nopca.index == 'kau', 'Cluster']
class_df_pca = pca_df.copy()
# Collapse every cluster other than the 'kau' one into a single label. A
# single .loc assignment replaces the chained `df.Cluster[mask] = ...`, which
# triggers SettingWithCopyWarning and is not guaranteed to modify the frame.
# NOTE(review): if the 'kau' cluster is itself called 'Cluster 1', every row
# ends up with the same label -- confirm the label is safe.
other_clusters = class_df_pca['Cluster'] != kun_clus.values[0]
class_df_pca.loc[other_clusters, 'Cluster'] = 'Cluster 1'

In [23]:
# Mean of every selected feature within each cluster.
mean_all_pca = pca_df.groupby('Cluster').agg('mean')


# Supervised learning 

In [32]:
# This star import replaces the clustering versions of `setup`,
# `create_model` and `evaluate_model` with the classification ones.
from pycaret.classification import *
# Every column except the target is a numeric feature. Selecting by name does
# not depend on 'Cluster' being the last column.
list_of_headers = [col for col in class_df_pca.columns if col != 'Cluster']
# A fixed session_id makes the train/test split and the model reproducible.
exp_name = setup(
    data = class_df_pca,
    target = 'Cluster',
    train_size = 0.9,
    numeric_features = list_of_headers,
    normalize = True,
    normalize_method = 'minmax',
    feature_selection = True,
    session_id = 42,
)
# NOTE(review): the target was derived by clustering PCA projections of these
# same data, so very high scores are expected and are not independent
# evidence of predictive power -- confirm the intended interpretation.
model = create_model('rf')
evaluate_model(model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [106]:
# Keep only the 'City' column (still as a DataFrame).
# NOTE(review): `countries_labeled` is not created in any cell visible here;
# it must already exist in the kernel for this cell to run.
countries_labeled = countries_labeled.loc[:, ['City']]

In [107]:
# Attach the cluster labels positionally (`.values` ignores the index).
# `.assign` builds a new frame instead of writing into the column-subset made
# in the previous cell, which avoids SettingWithCopyWarning.
# NOTE(review): this assumes countries_labeled and knn_df_pca list the cities
# in the same order -- confirm.
countries_labeled = countries_labeled.assign(Cluster=knn_df_pca['Cluster'].values)

In [111]:
# Hand-picked features to compare between clusters.
most_important_features = ['online_purchase' , 'access', 'consult_voting' ,'submit_forms' , 'ent_onlineorders','cost_living_local' ,'pop_total','disused','public_transport' ,'metro_length']
# Select the features together with 'Cluster' and take an explicit copy, so
# no column is assigned onto a slice of pca_df afterwards.
df_most_important = pca_df[most_important_features + ['Cluster']].copy()
pca_df_means = pca_df.groupby('Cluster').mean()
df_important_mean = df_most_important.groupby('Cluster').mean()

In [33]:
# Per-cluster means of the hand-picked features. The cell that defines
# df_most_important must have been run first, otherwise this raises the
# NameError recorded below.
df_most_important.groupby('Cluster').agg('mean')

NameError: name 'df_most_important' is not defined

In [124]:
# Per-cluster means of three headline indicators.
full_df1 = (
    full_df
    .loc[:, ['gdp_city', 'pop_city', 'density', 'Cluster']]
    .groupby('Cluster')
    .mean()
)

# Core features of every cluster 

In [113]:
# Assign every feature to the cluster whose mean for it is the largest.
# Row positions 0 and 1 of pca_df_means map to cluster0 and cluster1; any
# other position goes to cluster2.
cluster0, cluster1, cluster2 = [], [], []
for feature in pca_df_means.columns:
    top_row = pca_df_means[feature].values.argmax()
    if top_row == 0:
        bucket = cluster0
    elif top_row == 1:
        bucket = cluster1
    else:
        bucket = cluster2
    bucket.append(feature)