In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
from sklearn.cluster import KMeans

In [None]:
from warnings import filterwarnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed, including deprecation notices from the libraries used below.
filterwarnings('ignore')

In [None]:
# Read the USArrests CSV and use the values of its 'Unnamed: 0' column as the
# row index (the column itself is still present in the frame at this point).
usarrests_path = "../input/usarrest/kmeans_usarrests/USArrests.csv"
df = pd.read_csv(usarrests_path)
df = df.set_index(df['Unnamed: 0'])
df.head()

In [None]:
# Remove the label column that was already copied into the index, leaving only
# the feature columns. Dropping by name instead of slicing by position keeps
# this cell idempotent: with `df.iloc[:, 1:5]`, re-running the cell silently
# discarded the first feature column as well. `errors='ignore'` makes a second
# run a no-op.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

In [None]:
# Remove the index's name so no header is shown above the row labels.
df.index.names = [None]
df.head()

In [None]:
# Number of rows that duplicate an earlier row (the author notes there are none)
df.duplicated().sum()

In [None]:
# Missing-value count per column (the author notes there are none)
df.isna().sum()

In [None]:
# Summary of the frame: index, column dtypes and non-null counts
df.info()

In [None]:
# Descriptive statistics, transposed so that each feature is a row
df.describe().transpose()

In [None]:
# How many times each index label occurs
df.index.value_counts()

In [None]:
# "Murder" value for each state, one bar per index label.
# The vectors are passed directly, so no `data=` argument is needed.
fig, ax = plt.subplots(figsize=(15, 3))
g = sns.barplot(x=df.index, y=df['Murder'], ax=ax)
g.set_title('Murder by state')
# Rotate the state names so they stay readable; tick_params only changes the
# rotation instead of re-setting the label texts.
g.tick_params(axis='x', labelrotation=90);

In [None]:
# "Assault" value for each state, one bar per index label.
# The vectors are passed directly, so no `data=` argument is needed.
fig, ax = plt.subplots(figsize=(15, 3))
g = sns.barplot(x=df.index, y=df['Assault'], ax=ax)
g.set_title('Assault by state')
# Rotate the state names so they stay readable; tick_params only changes the
# rotation instead of re-setting the label texts.
g.tick_params(axis='x', labelrotation=90);

In [None]:
# "UrbanPop" (urban population) value for each state, one bar per index label.
# The vectors are passed directly, so no `data=` argument is needed.
fig, ax = plt.subplots(figsize=(15, 3))
g = sns.barplot(x=df.index, y=df['UrbanPop'], ax=ax)
g.set_title('UrbanPop by state')
# Rotate the state names so they stay readable; tick_params only changes the
# rotation instead of re-setting the label texts.
g.tick_params(axis='x', labelrotation=90);

In [None]:
# "Rape" value for each state, one bar per index label.
# The vectors are passed directly, so no `data=` argument is needed.
fig, ax = plt.subplots(figsize=(15, 3))
g = sns.barplot(x=df.index, y=df['Rape'], ax=ax)
g.set_title('Rape by state')
# Rotate the state names so they stay readable; tick_params only changes the
# rotation instead of re-setting the label texts.
g.tick_params(axis='x', labelrotation=90);

In [None]:
# Distribution of each feature: density-scaled histogram with a KDE curve,
# one panel per column in a 2x2 grid.
# `sns.distplot` is deprecated in seaborn and scheduled for removal;
# `histplot(kde=True, stat='density')` is its documented replacement
# (requires seaborn >= 0.11).
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# (column, colour) per panel, in row-major panel order; None keeps the default colour.
panels = [('Murder', None), ('Assault', 'r'), ('UrbanPop', 'g'), ('Rape', 'y')]
for ax, (column, color) in zip(axs.flat, panels):
    sns.histplot(df[column], kde=True, stat='density', color=color, ax=ax)

In [None]:
# It seems that "Assault" and "Rape" are not normally distributed.
# Let's apply the Shapiro-Wilk test to check that.

In [None]:
# Shapiro Test
from scipy.stats import shapiro

In [None]:
# Shapiro-Wilk normality test for "Murder".
#   H0: the distribution is normal
#   H1: the distribution is not normal
murder_shapiro = shapiro(df['Murder'])
test_statistics, p_value = murder_shapiro[0], murder_shapiro[1]
print("Test Statistics: ", test_statistics)
print("P-value: ", p_value)

In [None]:
# p_value is not less than 0.05, so we fail to reject H0: the distribution of "Murder" is consistent with a normal distribution

In [None]:
# Shapiro-Wilk normality test for "Assault".
#   H0: the distribution is normal
#   H1: the distribution is not normal
assault_shapiro = shapiro(df['Assault'])
test_statistics, p_value = assault_shapiro[0], assault_shapiro[1]
print("Test Statistics: ", test_statistics)
print("P-value: ", p_value)

In [None]:
# p_value is less than 0.05, so we reject H0: the distribution of "Assault" is not normal

In [None]:
# Shapiro-Wilk normality test for "UrbanPop".
#   H0: the distribution is normal
#   H1: the distribution is not normal
urbanpop_shapiro = shapiro(df['UrbanPop'])
test_statistics, p_value = urbanpop_shapiro[0], urbanpop_shapiro[1]
print("Test Statistics: ", test_statistics)
print("P-value: ", p_value)

In [None]:
# p_value is not less than 0.05, so we fail to reject H0: the distribution of "UrbanPop" is consistent with a normal distribution

In [None]:
# Shapiro-Wilk normality test for "Rape".
#   H0: the distribution is normal
#   H1: the distribution is not normal
rape_shapiro = shapiro(df['Rape'])
test_statistics, p_value = rape_shapiro[0], rape_shapiro[1]
print("Test Statistics: ", test_statistics)
print("P-value: ", p_value)

In [None]:
# p_value is less than 0.05, so we reject H0: the distribution of "Rape" is not normal

In [None]:
# Box plot of each feature, one panel per column in a 2x2 grid.
# The column is passed explicitly as `x=`: a bare positional vector is read as
# `x` by older seaborn releases but as `data` by newer ones, so the keyword
# pins the horizontal orientation regardless of the installed version.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# (column, colour) per panel, in row-major panel order; None keeps the default colour.
panels = [('Murder', None), ('Assault', 'r'), ('UrbanPop', 'g'), ('Rape', 'y')]
for ax, (column, color) in zip(axs.flat, panels):
    sns.boxplot(x=df[column], color=color, ax=ax)

In [None]:
# Pairwise correlations between the features, drawn as an annotated heatmap
# with the colour scale fixed to [-1, 1].
feature_corr = df.corr()
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(feature_corr, annot=True, vmin=-1, vmax=1, ax=ax);

In [None]:
# Generally, we expect the correlations between the features to be high since the subject is crimes.
# In addition, we can see from this table that there is a strong positive correlation between "Assault" and "Murder"

### KMeans

### Find Optimum number of Clusters (Elbow) 

In [None]:
# We need to find the optimum number of clusters (minimum similarity between different clusters and maximum similarity within each cluster)

In [None]:
#!pip install yellowbrick

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Elbow search: let the visualizer fit KMeans over the k range (2, 15) and
# plot the resulting score curve.
# KMeans starts from random centroids, so a fixed random_state is needed for
# the curve to be the same on every run; n_init is set explicitly so the
# number of restarts does not depend on the installed scikit-learn's default.
kmeans = KMeans(random_state=42, n_init=10)
visualizer = KElbowVisualizer(kmeans, k=(2, 15))
visualizer.fit(df)
# NOTE(review): newer yellowbrick releases appear to name this method show();
# confirm the installed version before switching.
visualizer.poof()

### Final Model

In [None]:
# Final model with 5 clusters.
# random_state fixes the centroid initialisation so the cluster labels (and
# every table/plot derived from them below) are reproducible across runs;
# n_init is explicit so the number of restarts does not depend on the
# installed scikit-learn's default.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10).fit(df)
print("Number of clusters: ", kmeans.n_clusters)
print("-"*70)
print("Cluster Centers: ", '\n', kmeans.cluster_centers_)

In [None]:
# Cluster labels stored on the fitted model; used below to colour the plots
# and to build the state/cluster table.
labels = kmeans.labels_
labels

In [None]:
# 2-D view of the clustering: the first two feature columns, coloured by
# cluster label, with the cluster centres drawn on top.
centers = kmeans.cluster_centers_

fig, ax = plt.subplots()
ax.scatter(df.iloc[:, 0], df.iloc[:, 1], c=labels, s=50, cmap='viridis')
ax.scatter(centers[:, 0], centers[:, 1], c='Blue', s=200, marker='p')
# Label the axes with the plotted column names so the figure stands alone.
ax.set_xlabel(df.columns[0])
ax.set_ylabel(df.columns[1])
ax.set_title('KMeans clusters and cluster centers');

In [None]:
# !pip install --upgrade matplotlib
# import mpl_toolkits
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3-D view of the clustering: the first three feature columns, coloured by
# cluster label, with the cluster centres drawn as large dark markers.
fig = plt.figure()
# Create the 3-D axes through the figure. Constructing `Axes3D(fig)` directly
# no longer attaches the axes to the figure in recent matplotlib releases,
# which leaves an empty plot.
ax = fig.add_subplot(111, projection='3d')
x = df.iloc[:, 0]
y = df.iloc[:, 1]
z = df.iloc[:, 2]
ax.scatter(x, y, z, c=labels, cmap='RdBu')
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c="#050505", marker="o", s=500)
# Label the axes with the plotted column names so the figure stands alone.
ax.set_xlabel(df.columns[0])
ax.set_ylabel(df.columns[1])
ax.set_zlabel(df.columns[2]);

In [None]:
# Pair every state (the index of df) with the cluster it was assigned to.
state_cluster_columns = {'States': df.index, 'Clusters': labels}
clustered_df = pd.DataFrame(state_cluster_columns)
clustered_df.head(5)

In [None]:
# List the states grouped by cluster: rows ordered by the 'Clusters' column.
clustered_df.sort_values('Clusters')