# Clustering and Fitting


### Student Name: Priya Reddy Vadde


#### Student ID: 21031697


# Exploring the data from the World Development Indicators

In [None]:
## Load the whole climate dataset into a pandas dataframe and preview its last 7 rows

import pandas as pd

data_path = 'climate_data.csv'
climate_df = pd.read_csv(data_path)
climate_df.iloc[-7:]

In [None]:
## Replace the long source column headers with short labels for each variable

short_labels = {
    "Country Name": "name",
    "Country Code": "code",
    "Energy use (kg of oil equivalent per capita) 2015": "energy_pc",
    "GDP per capita, PPP (current international $) 2015": "gdp_pc",
    "CO2 per capita (ton CO2/cap) 2015": "co2_pc",
}
climate_df = climate_df.rename(columns=short_labels)

# Summary statistics of the relabelled frame.
climate_df.describe()

In [None]:
## Visualising pairwise relationships between the variables

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Keep a handle on each per-capita column under a short name.
energy, gdp, co2 = (climate_df[column] for column in ('energy_pc', 'gdp_pc', 'co2_pc'))

# Grid of pairwise plots over the dataframe.
sns.pairplot(climate_df)

In [None]:
# r-squared matrix: the element-wise square of the pairwise correlation matrix.
# Only numeric columns are correlated; the frame also carries the text columns
# 'name' and 'code', which cannot take part in a correlation.
# The correlation matrix is computed once and squared, rather than computed twice.
correlations = climate_df.select_dtypes(include='number').corr()
correlations ** 2

In [None]:
## Now we visualize the strongest relationship, between energy and co2, on the
## two rescaled versions of the data named in the legend.

energy_raw = climate_df['energy_pc']
co2_raw = climate_df['co2_pc']

# Standardised: subtract the mean and divide by the population standard deviation (ddof=0).
energy_standardised = (energy_raw - energy_raw.mean()) / energy_raw.std(ddof=0)
co2_standardised = (co2_raw - co2_raw.mean()) / co2_raw.std(ddof=0)

# Normalised: linear min-max rescaling of each variable onto the interval [0, 1].
energy_normalised = (energy_raw - energy_raw.min()) / (energy_raw.max() - energy_raw.min())
co2_normalised = (co2_raw - co2_raw.min()) / (co2_raw.max() - co2_raw.min())

plt.scatter(energy_standardised, co2_standardised, color='red', label='Standardized', alpha=0.3)
plt.scatter(energy_normalised, co2_normalised, color='blue', label='Normalised', alpha=0.3)
plt.xlabel('energy_pc (rescaled)')
plt.ylabel('co2_pc (rescaled)')
plt.title('Energy use vs CO2 emissions per capita')
# The labels passed to scatter are only shown once a legend is drawn.
plt.legend()
plt.show()

# Clustering by dendrogram

In [None]:
## Hierarchical clustering with the ward method, drawn as a dendrogram labelled by country name

import scipy.cluster.hierarchy as shc

# Country labels alongside the three variables used for the clustering.
dendro_df = pd.DataFrame({
    'labels': climate_df['name'],
    'energy': climate_df['energy_pc'],
    'gdp': climate_df['gdp_pc'],
    'co2': climate_df['co2_pc'],
})

# Wide figure so that the leaf labels have room along the x axis.
plt.figure(figsize=(50, 15))
plt.yticks(fontsize=15)
plt.title(
    "Dendrogram visualising hierarchical clustering countries with respect to per capita carbon emissions, energy use and GDP",
    fontsize=30,
)

# Build the linkage matrix first, then draw it with one leaf per country.
ward_links = shc.linkage(dendro_df[['energy', 'gdp', 'co2']], method='ward')
country_labels = dendro_df['labels'].tolist()
dend = shc.dendrogram(ward_links, labels=country_labels, leaf_font_size=15)

In [None]:
## Visualising the distribution of samples for GDP and energy use per capita.
## Each variable gets its own violin: passing a continuous column as x would make
## seaborn treat every distinct value as a separate category.

import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, column in zip(axes, ("gdp_pc", "energy_pc")):
    sns.violinplot(y=climate_df[column], ax=ax)
    ax.set_title("Distribution of " + column)
fig.tight_layout()

In [None]:
## Visualising the distribution of samples for GDP and CO2 emissions per capita.
## Each variable gets its own violin: passing a continuous column as x would make
## seaborn treat every distinct value as a separate category.

import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, column in zip(axes, ("gdp_pc", "co2_pc")):
    sns.violinplot(y=climate_df[column], ax=ax)
    ax.set_title("Distribution of " + column)
fig.tight_layout()

In [None]:
## Visualising the distribution of samples for energy use and CO2 emissions per capita.
## Each variable gets its own violin: passing a continuous column as x would make
## seaborn treat every distinct value as a separate category.

import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, column in zip(axes, ("energy_pc", "co2_pc")):
    sns.violinplot(y=climate_df[column], ax=ax)
    ax.set_title("Distribution of " + column)
fig.tight_layout()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

In [None]:
# Take the observations themselves, not just the column labels, as the
# x and y data: energy use per capita against CO2 emissions per capita.
xdata = climate_df['energy_pc']
ydata = climate_df['co2_pc']

In [None]:
# Convert both inputs to numpy arrays, then draw them as unconnected point markers.
xdata, ydata = (np.asarray(values) for values in (xdata, ydata))
plt.plot(xdata, ydata, 'o')

### Now, to fit a curve, we need to standardize the data

In [None]:

from sklearn import preprocessing

# The three per-capita variables, selected once and fed to both scalers.
feature_cols = ['energy_pc', 'gdp_pc', 'co2_pc']
features = climate_df[feature_cols]

# Fit the standard scaler on the features, transform them, and wrap the result in a dataframe.
std_scale = preprocessing.StandardScaler()
std_scale.fit(features)
std = std_scale.transform(features)
climate_df_std = pd.DataFrame(std)

# Same steps with the min-max scaler.
minmax_scale = preprocessing.MinMaxScaler()
minmax_scale.fit(features)
minmax = minmax_scale.transform(features)
climate_df_minmax = pd.DataFrame(minmax)

#### Now we print the standardised (z-score) data

In [None]:
# Print the standardised frame, followed by its Python type.
for item in (climate_df_std, type(climate_df_std)):
    print(item)

In [None]:
## Give both preprocessed dataframes the same column names, for consistency

# One shared mapping from positional column labels to variable names.
scaled_column_names = {0: "energy_pc", 1: "gdp_pc", 2: "co2_pc"}

climate_df_std = climate_df_std.rename(columns=scaled_column_names)
climate_df_minmax = climate_df_minmax.rename(columns=scaled_column_names)

# Preview the first five rows of the standardised frame.
climate_df_std.head()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering of the standardised data into 3 clusters with ward linkage.
# The distance argument is omitted: ward linkage only accepts Euclidean distance,
# which is the estimator's default, and the `affinity` keyword was deprecated and
# later removed in scikit-learn (replaced by `metric`), so leaving it out keeps the
# cell working on both old and new releases.
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
cluster.fit_predict(climate_df_std)

##### Partitioning using KMeans with 3 clusters and plot

In [None]:
from sklearn.cluster import KMeans

# Fit KMeans with 3 clusters on the standardised data.
# random_state fixes the random initialisation so that re-running the cell
# reproduces the same centroids and labels.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=100, random_state=42).fit(climate_df_std)
centroids = kmeans.cluster_centers_
print(centroids)

# Centroid columns follow the column order of the fitted frame, so look up the
# positions of the two plotted variables rather than assuming columns 0 and 1
# (column 1 is gdp_pc, which is not the variable on the y axis).
feature_names = list(climate_df_std.columns)
x_idx = feature_names.index('energy_pc')
y_idx = feature_names.index('co2_pc')

plt.scatter(climate_df_std['energy_pc'], climate_df_std['co2_pc'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, x_idx], centroids[:, y_idx], c='red', s=50)
plt.xlabel('energy_pc (standardised)')
plt.ylabel('co2_pc (standardised)')
plt.title('KMeans clusters (k=3) with centroids in red')
plt.show()

#### Create simple model(s) fitting data sets with curve_fit

In [None]:
## Elbow method to optimise number of clusters
import warnings

inertias = []
ks = range(1, 8)

# Suppress warnings only while the models are being fitted; the previous filters
# are restored when the block exits, so warnings raised by later cells stay visible.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for k in ks:
        # random_state fixes the initialisation so the inertia curve is reproducible.
        model = KMeans(n_clusters=k, init='k-means++', n_init=50, random_state=42).fit(climate_df_std)
        inertias.append(model.inertia_)

# Inertia against k; the "elbow" of this curve suggests the number of clusters.
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters')
plt.ylabel('inertia value')
plt.title('Investigation of optimal n clusters with elbow method')
plt.show()