In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Switch on seaborn's default plot styling for the figures drawn in this notebook.
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load the judicial-expenditure table; the first CSV column becomes the row index.
df = pd.read_csv(
    '/kaggle/input/judicial-expenditures-across-all-50-states/jeee16t08.csv',
    index_col=0,
)

In [None]:
# Preview the first five rows of the table as loaded.
df.head(n=5)

In [None]:
# Remove the row(s) carrying the first index label, modifying `df` in place.
# NOTE: this cell is not idempotent - every re-run strips one more leading row,
# so reload the CSV before executing it again.
df.drop(index=df.index[0], inplace=True)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Display the column labels exactly as they were read from the CSV.
df.columns

In [None]:
# Replace all ten column labels with identifier-friendly names.
# The list length must equal the number of columns, otherwise pandas raises.
df.columns = [
    'Population_2016',
    'Total_justice_system_PC',
    'Police_Protection_PC',
    'Judicial_and_legal_PC',
    'Corrections_PC',
    'Total_justice_system_Employment',
    'police_protection_Total_Employment',
    'police_protection_Sworn_only_Employment',
    'Judicial_and_legal_Employment',
    'Corrections_Employment',
]

In [None]:
# Preview the first five rows to confirm the new column names.
df.head(n=5)

In [None]:
# Population against total justice-system figure, one point per row of `df`.
sns.scatterplot(
    data=df,
    x='Total_justice_system_PC',
    y='Population_2016',
)

### Data Preprocessing

In [None]:
# Standardise every column (zero mean, unit variance) so that no feature
# dominates the PCA purely because of its scale.
scaler = StandardScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)

In [None]:
# Fit a PCA that keeps every component so the full variance profile can be inspected.
pca = PCA(n_components=None)
pca.fit(df_scaled)

In [None]:
# Share of total variance explained by each component, to three decimals.
np.round(pca.explained_variance_ratio_, 3)

In [None]:
# Cumulative explained variance as a function of the number of components kept.
# The x-axis is derived from the fitted PCA rather than hard-coded, so the plot
# keeps working if the number of input features changes (a fixed range(1, 11)
# raises a shape-mismatch error as soon as the feature count is not 10).
cumulative_variance = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(12,10))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.title("Explained Variance by components")
plt.xlabel("Number of components")
plt.ylabel("cumulative explained variance")
plt.show()

The graph shows the cumulative share of variance captured (on the y-axis) as a function of the number of components we include (on the x-axis). A common rule of thumb is to preserve around 80% of the variance. So, in this instance, we decide to keep 3 components.

For our data set, that means 3 principal components:

In [None]:
# Rebind `pca` to a new, not-yet-fitted model restricted to three components;
# the earlier full-component model is discarded.
pca = PCA(n_components=3)

In [None]:
# Fit the three-component PCA on the standardised features.
pca.fit(df_scaled)

In [None]:
# Display the projected data; the result is only shown here, not stored.
pca.transform(df_scaled)

In [None]:
# Keep the principal-component scores (one row per observation) for clustering.
scores_pca = pca.transform(df_scaled)

We’ll incorporate the newly obtained PCA scores in the K-means algorithm. That’s how we can perform segmentation based on principal components scores instead of the original features.

#### Determine the number of clusters for K-means

We run the algorithm with a different number of clusters each time. Then, we determine the Within-Cluster Sum of Squares (WCSS) for each solution. Based on the values of the WCSS and an approach known as the Elbow method, we make a decision about how many clusters we’d like to keep.

#### K Means Clustering using PCA

In [None]:
# Fit K-means for k = 1..20 on the PCA scores and record each solution's
# within-cluster sum of squares (inertia) for the elbow plot.
wcss = []
for i in range(1, 21):
    kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(scores_pca)
    wcss.append(kmeans_pca.inertia_)

In [None]:
# Elbow plot: WCSS against the number of clusters tried.
# The x-axis is derived from len(wcss) instead of repeating the loop bounds, so
# the plot cannot fall out of sync with the range used to build `wcss`.
cluster_counts = range(1, len(wcss) + 1)

plt.figure(figsize=(12,10))
plt.plot(cluster_counts, wcss, marker='o', linestyle='--')
plt.xlabel('no of clusters')
plt.ylabel('WCSS')
plt.title('Kmeans with PCA Clustering')
plt.show()

From this graph, we determine the number of clusters we’d like to keep. To that effect, we use the Elbow method. The approach consists of looking for a kink or elbow in the WCSS graph. Usually, the part of the graph before the elbow declines steeply, while the part after it is much smoother. In this instance, the kink comes at the 4-cluster mark. So, we’ll be keeping a four-cluster solution.

In [None]:
# Final model: four clusters, with a fixed seed for repeatable assignments.
kmeans_pca = KMeans(
    n_clusters=4,
    init='k-means++',
    random_state=42,
)

In [None]:
# Fit the four-cluster model on the PCA scores; labels become available as `kmeans_pca.labels_`.
kmeans_pca.fit(scores_pca)

### KMeans Clustering with PCA results

In [None]:
# Put the original features and the PCA scores side by side. The index of `df`
# is reset so both frames share the same 0..n-1 row labels and align row for row.
df_pca_seg_Kmeans = pd.concat(
    [df.reset_index(drop=True), pd.DataFrame(scores_pca)],
    axis='columns',
)

In [None]:
# Give the three PCA score columns readable names. They arrive labelled 0, 1, 2
# because they were built from a bare array. `rename` is used instead of writing
# into `columns.values`: an Index is meant to be immutable, and mutating its
# backing array can leave label lookups stale or fail on a read-only array.
# Renaming by label also keeps the cell safe to re-run, whereas a positional
# "last three columns" rule would hit the wrong columns once more are appended.
df_pca_seg_Kmeans = df_pca_seg_Kmeans.rename(
    columns={0: 'component_1', 1: 'component_2', 2: 'component_3'}
)

# Attach the cluster index assigned to each row by the fitted K-means model.
df_pca_seg_Kmeans['Segment_KMeans_PCA'] = kmeans_pca.labels_

In [None]:
# Translate the numeric cluster ids into human-readable segment names.
df_pca_seg_Kmeans['Segment'] = df_pca_seg_Kmeans['Segment_KMeans_PCA'].map(
    {0: 'First', 1: 'Second', 2: 'Third', 3: 'Fourth'}
)

### Visualizing the components

In [None]:
# Scatter of the first two principal components, coloured by segment.
x_axis = df_pca_seg_Kmeans['component_2']
y_axis = df_pca_seg_Kmeans['component_1']

plt.figure(figsize=(12,8))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments and raises a TypeError if they are given that way.
sns.scatterplot(
    x=x_axis,
    y=y_axis,
    hue=df_pca_seg_Kmeans['Segment'],
    palette=['g','r','c','m'],
)
plt.title('Clusters by PCA Components')
plt.show()

When we employ PCA prior to using K-means, we can visually separate almost the entire data set. That was one of the biggest goals of PCA – to reduce the number of variables by combining them into a smaller set of more meaningful features.


In [None]:
# New frame in which the row index of `df` becomes an ordinary leading column;
# `df` itself is left untouched.
df1 = df.reset_index()

In [None]:
# Bring the state names across, aligned on the shared 0..n-1 row labels.
# NOTE(review): relies on the original index being named 'State' (i.e. the
# header of the CSV's first column) - confirm against the data file.
df_pca_seg_Kmeans = df_pca_seg_Kmeans.assign(State=df1['State'])

In [None]:
# States assigned to the 'First' segment.
df_pca_seg_Kmeans.loc[df_pca_seg_Kmeans['Segment'] == 'First', 'State']

In [None]:
# States assigned to the 'Second' segment.
df_pca_seg_Kmeans.loc[df_pca_seg_Kmeans['Segment'] == 'Second', 'State']

In [None]:
# States assigned to the 'Third' segment.
df_pca_seg_Kmeans.loc[df_pca_seg_Kmeans['Segment'] == 'Third', 'State']

In [None]:
# States assigned to the 'Fourth' segment.
df_pca_seg_Kmeans.loc[df_pca_seg_Kmeans['Segment'] == 'Fourth', 'State']

In [None]:
# Same population scatter as earlier in the notebook, now coloured by segment.
sns.scatterplot(
    data=df_pca_seg_Kmeans,
    x='Total_justice_system_PC',
    y='Population_2016',
    hue='Segment',
)