In [None]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats in DataFrame output with thousands separators and two decimals
pd.set_option('display.float_format', '{:,.2f}'.format)

# Echo every top-level expression of a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Load the height/weight table from the local Kaggle download
csv_path = 'Kaggle_Data/weight-height.csv'
ht_wt = pd.read_csv(csv_path)

# Preview the first rows
ht_wt.head()

In [None]:
# Count of missing values in each column
ht_wt.isnull().sum()

# Number of clusters requested from the k-means models fitted later
Kval = 5


In [None]:
# Import the visualisation libraries and switch Plotly to offline notebook rendering
# (no function is defined here; this cell only imports and initialises).
import plotly
# connected=True asks Plotly to load plotly.js from a CDN instead of embedding it
# in the notebook.
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the wildcard import puts every graph_objs name into the notebook
# namespace; the cells visible here only use the explicit `go` alias. Confirm
# nothing else depends on it before removing.
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Pairwise correlation between the numeric measurements.
# 'Gender' holds strings, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently dropped them), so restrict the frame to
# numeric dtypes first. select_dtypes works on old and new pandas alike.
correl = ht_wt.select_dtypes(include='number').corr()

# Heatmap convention: z[i][j] is drawn at (x[j], y[i]), so rows map to y (the
# index) and columns map to x.
trace = go.Heatmap(z=correl.values,
                   x=correl.columns.values,
                   y=correl.index.values)
data = [trace]
plotly.offline.iplot(data, filename='basic-heatmap')

In [None]:
# List the column labels to confirm the names ('Gender', 'Height', 'Weight') used by the later cells
ht_wt.columns

In [None]:
# Partition the table by the 'Gender' label using explicit boolean masks
is_female = ht_wt['Gender'].eq('Female')
is_male = ht_wt['Gender'].eq('Male')

female_data = ht_wt.loc[is_female]
male_data = ht_wt.loc[is_male]

In [None]:
from sklearn.preprocessing import StandardScaler

# Features fed to the clustering step
cols = ['Height', 'Weight']

# The same scaler object is re-fitted for each gender, so every subset is
# standardised with its own mean and variance. After this cell `sc` holds the
# statistics of the male subset (the last fit).
sc = StandardScaler()

subset_f_data = pd.DataFrame(
    sc.fit_transform(female_data[cols]),
    index=female_data.index,
    columns=cols,
)
subset_m_data = pd.DataFrame(
    sc.fit_transform(male_data[cols]),
    index=male_data.index,
    columns=cols,
)

In [None]:
# Preview the standardised features for each gender. Both expressions are
# rendered because ast_node_interactivity is set to "all" in the first cell.
subset_f_data.head()
subset_m_data.head()

In [None]:
from sklearn.cluster import KMeans

# One shared configuration so the female and male models cannot drift apart
# through copy-paste edits.
KMEANS_PARAMS = dict(
    n_clusters=Kval,
    init='random',     # initial centroids are drawn at random from the data
    n_init=10,         # run 10 initialisations and keep the best one
    max_iter=500,
    tol=1e-04,
    random_state=50,   # fixed seed so the assignment is reproducible
)


def _cluster_labels(model, features):
    """Fit `model` on `features` and return its labels as a one-column frame.

    The result keeps the index of `features` and uses the default column
    label 0, which the later rename step turns into 'cluster'.
    """
    return pd.DataFrame(model.fit_predict(features), index=features.index)


km_f = KMeans(**KMEANS_PARAMS)
data_km_f = _cluster_labels(km_f, subset_f_data)

km_m = KMeans(**KMEANS_PARAMS)
data_km_m = _cluster_labels(km_m, subset_m_data)

In [None]:
# Preview the raw cluster labels for each gender (a single column labelled 0 at this point)
data_km_f.head()
data_km_m.head()

In [None]:
# Attach the original (unscaled) Height/Weight to each row's cluster label.
#
# The frame is rebuilt from the label column only, so re-running this cell is
# safe. Merging onto a frame that already carries Height/Weight would produce
# Height_x/Height_y columns and break the plotting cell. rename() is a no-op
# when the column is already called 'cluster'.
data_km_f = (
    data_km_f
    .rename(columns={0: 'cluster'})[['cluster']]
    .merge(female_data[cols], left_index=True, right_index=True)
)
data_km_m = (
    data_km_m
    .rename(columns={0: 'cluster'})[['cluster']]
    .merge(male_data[cols], left_index=True, right_index=True)
)

data_km_f.head()
data_km_m.head()

In [None]:
# One Height-vs-Weight scatter per gender, with one trace per k-means cluster.
for gender_label, data_km in [('Female', data_km_f), ('Male', data_km_m)]:
    # Sorted labels keep the legend order stable across runs.
    plot_data = [
        go.Scatter(
            x=data_km.loc[data_km['cluster'] == clus, 'Height'],
            y=data_km.loc[data_km['cluster'] == clus, 'Weight'],
            name='cluster ' + str(clus),
            mode='markers',
        )
        for clus in sorted(data_km['cluster'].unique())
    ]

    # Only the title differs between the two figures.
    layout = go.Layout(
        xaxis=dict(title='Height'),
        yaxis=dict(title='Weight'),
        title='Clustering: ' + gender_label,
    )

    fig = go.Figure(data=plot_data, layout=layout)
    plotly.offline.iplot(fig)
    
    

# plt.plot(data_km['cluster'],data_km['rating'], 'ro', alpha = 0.5)
# for i in range(len(data_km)):
#     plt.text(data_km.loc[i]['cluster'], data_km.loc[i]['rating'], str(i)) # cereal_data['name']) # 

# plt.show()

In [None]:
# Elbow plot for the female subset: KMeans inertia (within-cluster sum of
# squared distances) for each candidate number of clusters.
k_values = range(2, 11)  # shared by the loop and the x-axis so they cannot diverge

distortions = []
for k in k_values:
    km = KMeans(
        n_clusters=k,
        # k-means++ spreads the initial centroids out: each new centroid is
        # sampled with probability proportional to its squared distance from
        # the centroids already chosen. It is not a uniform grid.
        init='k-means++',
        n_init=10,
        max_iter=500,
        random_state=50,  # tol is left at the scikit-learn default
    )
    km.fit(subset_f_data)
    distortions.append(km.inertia_)

plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of Female Clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()

In [None]:
# Elbow plot for the male subset: KMeans inertia (within-cluster sum of
# squared distances) for each candidate number of clusters.
k_values = range(2, 11)  # shared by the loop and the x-axis so they cannot diverge

distortions = []
for k in k_values:
    km = KMeans(
        n_clusters=k,
        # k-means++ spreads the initial centroids out: each new centroid is
        # sampled with probability proportional to its squared distance from
        # the centroids already chosen. It is not a uniform grid.
        init='k-means++',
        n_init=10,
        max_iter=500,
        random_state=50,  # tol is left at the scikit-learn default
    )
    km.fit(subset_m_data)
    distortions.append(km.inertia_)

plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of Male Clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()