# Course Project - Real-World Machine Learning Model

In [None]:
import pandas as pd
import numpy as np

## 0. Import Data

In [None]:
# Source file: the Kaggle drug-classification CSV, read from the notebook's input directory
DRUG_DATA_PATH = '../input/drug-classification/drug200.csv'

drug_csv = pd.read_csv(DRUG_DATA_PATH)
drug_csv.head()

## 1. Explore Data

In [None]:
# Load plotting libraries

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Inspect the raw frame's structure (pandas DataFrame.info summary) before any preprocessing
drug_csv.info()

In [None]:
# Row counts per (Drug, level) pair for each categorical feature.
# unstack() spreads the levels into columns; combinations that never occur become 0.
cnt_sex, cnt_bp, cnt_cholesterol = (
    drug_csv.groupby(['Drug', feature])['Na_to_K'].count().unstack().reset_index().fillna(0)
    for feature in ('Sex', 'BP', 'Cholesterol')
)
# Overall number of rows per drug, smallest group first
cnt_drug = drug_csv[['Drug', 'Na_to_K']].groupby('Drug').count().sort_values('Na_to_K')

# One row of four panels: three stacked breakdowns plus the overall drug counts
fig, ax = plt.subplots(1, 4)

stacked_counts = (cnt_sex, cnt_bp, cnt_cholesterol)
stacked_titles = ('Sex', 'BP', 'Cholesterol')
for panel, counts, title in zip(ax, stacked_counts, stacked_titles):
    counts.plot(x='Drug', kind='bar', stacked=True, ax=panel)
    panel.set_title(title)
    panel.set_ylabel('')

# Last panel: horizontal bars of the total count per drug, without a legend
cnt_drug.plot(kind='barh', ax=ax[3], legend=None)
ax[3].set_title('Drug')
ax[3].set_ylabel('')

# Spacing between subplots (right > 1 stretches the row beyond the default figure width)
plt.subplots_adjust(left=0.01, bottom=0.05, right=2.5, top=0.9, wspace=0.4, hspace=0.4)
plt.show()

In [None]:
#Set plot field
fig, ax = plt.subplots(1,2)

#Create boxplots, no outliers
# Each panel is styled through its own Axes object: plt.xticks() and plt.grid() act only
# on the current axes, so calling them once styled a single panel and left the other
# with unrotated tick labels and the default grid.
for panel, column, title in zip(ax, ('Age', 'Na_to_K'), ('Age', 'Na to K')):
    drug_csv.boxplot(column, by='Drug', showfliers=False, showmeans=True, ax=panel)
    panel.tick_params(axis='x', labelrotation=90)
    panel.grid(linestyle='dotted')
    panel.set_title(title)
    panel.set_xlabel('')

# Clear the figure-level title so only the per-panel titles remain
fig.suptitle('')

# set the spacing between subplots
plt.subplots_adjust(left=0.01,
                    bottom=0.05,
                    right=2.5,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
plt.show()

## 2. Prepare Data

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
#Recode categorical variables

drug_csv_copy = drug_csv.copy()

# Every object-typed column except the target ('Drug') is treated as a categorical feature
categorical_cols = drug_csv_copy.select_dtypes(include=object).columns
categorical_cols = categorical_cols.drop(['Drug']).tolist()

# Fill missing categories once so that fit and transform see exactly the same input
categorical_filled = drug_csv_copy[categorical_cols].fillna('Unknown')

# One-hot encode categorical columns.
# The encoder keeps its default sparse output and the result is densified with .toarray():
# the `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and later
# removed, so avoiding the keyword keeps this cell working on old and new versions alike.
encoder = OneHotEncoder(handle_unknown='ignore').fit(categorical_filled)

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old method on releases that predate `get_feature_names_out`.
feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(feature_namer(categorical_cols))

# Build the encoded frame on the source index (with named columns) so the column
# assignment below cannot misalign rows if the frame's index is not 0..n-1.
drug_csv_copy[encoded_cols] = pd.DataFrame(
    encoder.transform(categorical_filled).toarray(),
    columns=encoded_cols,
    index=drug_csv_copy.index,
)

In [None]:
# Standardise every numeric column of the working copy. At this point that also covers
# any numeric columns added to drug_csv_copy by earlier cells (e.g. the one-hot columns).
numeric_cols = drug_csv_copy.select_dtypes(include=np.number).columns
scaler = StandardScaler()
drug_csv_copy[numeric_cols] = scaler.fit_transform(drug_csv_copy[numeric_cols])


In [None]:
# Clustering input: the numeric columns only, with any remaining missing values set to 0
dataset = drug_csv_copy.loc[:, numeric_cols].fillna(0)
dataset.head()

## 3. Cluster Analysis

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Pairwise correlation between all clustering features
corr = dataset.corr()

# Diverging colour map centred on 0 so positive and negative correlations are distinguishable
palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=palette, square=True)

# Tilt the column labels so long feature names do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
#Get optimal number of clusters (elbow method: inertia for each candidate K)

options = range(2,11)
inertias = []

for n_clusters in options:
    # n_init is pinned explicitly: scikit-learn's default moved from 10 to 'auto'
    # (with a FutureWarning in between), so relying on it makes the curve depend on
    # the installed version. 10 restarts matches the long-standing default.
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(dataset)
    inertias.append(model.inertia_)

plt.title("No. of clusters vs. Inertia")
plt.plot(options, inertias, '-o')
plt.xlabel('No. of clusters (K)')
plt.ylabel('Inertia');

In [None]:
# Final clustering model with 5 clusters.
# n_init is pinned explicitly: scikit-learn's default moved from 10 to 'auto', which can
# change the resulting cluster assignments (and therefore the cluster ids discussed in
# the conclusion) depending on the installed version.
model = KMeans(n_clusters=5, n_init=10, random_state=42)
model.fit(dataset)
model.cluster_centers_

In [None]:
# Get the cluster label the fitted KMeans model assigns to each row of `dataset`

preds = model.predict(dataset)
preds

In [None]:
# Scatter of the two continuous features, coloured by assigned cluster
sns.scatterplot(data=dataset, x='Age', y='Na_to_K', hue=preds)

# Cluster centres have one coordinate per column of `dataset`, in the same order.
# Look the two plotted features up by name: a hard-coded position (previously 0 and 5)
# is only right if 'Na_to_K' happens to sit at that position, which does not appear to
# be the case once the one-hot columns are appended after the original numeric ones.
age_pos = dataset.columns.get_loc('Age')
na_to_k_pos = dataset.columns.get_loc('Na_to_K')
centers_x, centers_y = model.cluster_centers_[:, age_pos], model.cluster_centers_[:, na_to_k_pos]
plt.plot(centers_x, centers_y, 'xb');

It seems that our clusters are not differentiated by Age or Na to K.

In [None]:
# Inertia of the fitted model (KMeans' within-cluster sum of squared distances)
model.inertia_

## 4. Post Cluster Analysis

In [None]:
#Merge cluster predictions to original dataset

drug_pred = drug_csv.copy()
drug_pred['preds'] = pd.DataFrame(preds)
drug_pred.head()

In [None]:
#Check characteristics of each cluster

# Row counts per (cluster, level) pair for the target and each categorical feature.
# unstack() spreads the levels into columns; combinations that never occur become 0.
cnt_drug, cnt_sex, cnt_bp, cnt_cholesterol = (
    drug_pred.groupby(['preds', column])['Na_to_K'].count().unstack().reset_index().fillna(0)
    for column in ('Drug', 'Sex', 'BP', 'Cholesterol')
)

# One row of four panels, one stacked bar chart per breakdown with clusters on the x axis
fig, ax = plt.subplots(1, 4)

panel_counts = (cnt_drug, cnt_sex, cnt_bp, cnt_cholesterol)
panel_titles = ('Drug', 'Sex', 'BP', 'Cholesterol')
for panel, counts, title in zip(ax, panel_counts, panel_titles):
    counts.plot(x='preds', kind='bar', stacked=True, ax=panel)
    panel.set_title(title)
    panel.set_ylabel('')

# Spacing between subplots (right > 1 stretches the row beyond the default figure width)
plt.subplots_adjust(left=0.01, bottom=0.05, right=2.5, top=0.9, wspace=0.4, hspace=0.4)
plt.show()

In [None]:
#Set plot field
fig, ax = plt.subplots(1,2)

#Create boxplots, no outliers
# Each panel is styled through its own Axes object: plt.xticks() and plt.grid() act only
# on the current axes, so calling them once styled a single panel and left the other
# with unrotated tick labels and the default grid.
for panel, column, title in zip(ax, ('Age', 'Na_to_K'), ('Age', 'Na to K')):
    drug_pred.boxplot(column, by='preds', showfliers=False, showmeans=True, ax=panel)
    panel.tick_params(axis='x', labelrotation=90)
    panel.grid(linestyle='dotted')
    panel.set_title(title)
    panel.set_xlabel('')

# Clear the figure-level title so only the per-panel titles remain
fig.suptitle('')

# set the spacing between subplots
plt.subplots_adjust(left=0.01,
                    bottom=0.05,
                    right=2.5,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
plt.show()

## 5. Conclusion

It seems that the clusters formed are not cleanly separated by drug assignment, although there are definite patterns among them.

For instance, Drug C is generally given to Clusters 1 & 2, who are (1) older males with low BP and (2) females with high cholesterol. Drugs A & B are mostly in Cluster 4, who are older males with normal BP. Drug Y is highest in Clusters 2 & 0, where Cluster 0 is young females with normal cholesterol and high Na to K. Drug X is given to Clusters 0 to 3 but is highest in Cluster 3, who are males with normal BP and low Na to K.