# Data Preprocessing

In [None]:
#import libraries
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:
# Column headers for the raw data file (it ships without a header row):
# participant ID, then demographic/personality features, then one column per drug.
column_names = [
    'ID',
    'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'Neuroticism', 'Extraversion', 'Openness', 'Agreeableness',
    'Conscientiousness', 'Impulsiveness', 'Sensation_seeking',
    'Alcohol', 'Amphetamine', 'Amyl_nitrite', 'Benzodiazepine', 'Caffeine',
    'Cannabis', 'Chocolate', 'Cocaine', 'Crack', 'Ecstasy', 'Heroin',
    'Ketamine', 'Legal_highs', 'LSD', 'Methadone', 'Mushrooms', 'Nicotine',
    'Semeron', 'VSA',
]

In [None]:
# Column groups referenced by later cells.
# NOTE: despite its name, `dependent_features` holds the model INPUTS
# (demographics + personality scores); `drug_names` holds the target columns.
dependent_features = [
    'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'Neuroticism', 'Extraversion', 'Openness', 'Agreeableness',
    'Conscientiousness', 'Impulsiveness', 'Sensation_seeking',
]
drug_names = [
    'Alcohol', 'Amphetamine', 'Amyl_nitrite', 'Benzodiazepine', 'Caffeine',
    'Cannabis', 'Chocolate', 'Cocaine', 'Crack', 'Ecstasy', 'Heroin',
    'Ketamine', 'Legal_highs', 'LSD', 'Methadone', 'Mushrooms', 'Nicotine',
    'Semeron', 'VSA',
]

In [None]:
# Read the header-less data file into a DataFrame, labelling the columns with
# `column_names`, then preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run elsewhere without editing it.
drug_data = pd.read_csv(
    '/Users/shubhamkulkarni/shubham/Machine Learning Assignment/drug_consumption.data',
    names=column_names,
    header=None,
)
drug_data.head()

In [None]:
# Count missing values per column (the author reports that none are present).
missing_per_column = drug_data.isna().sum()
print(missing_per_column)

In [None]:
# Number of distinct values in each column. Being the last bare expression of
# the cell, the resulting Series is what the notebook displays.
drug_data.nunique()

In [None]:
# ID only identifies a participant and carries no predictive information,
# so move it out of the columns and use it as the row index.
drug_data = drug_data.set_index('ID')

In [None]:
# Replace the values of every non-ID column with integer codes.
# A fresh LabelEncoder is fitted per column, so codes are independent
# between columns.
encodable_columns = [name for name in column_names if name != 'ID']
for column in encodable_columns:
    le = LabelEncoder()
    le.fit(drug_data[column])
    drug_data[column] = le.transform(drug_data[column])
drug_data.head()

In [None]:
# Sanity check after label encoding: per-column count of missing values.
missing_after_encoding = drug_data.isna().sum()
print(missing_after_encoding)

In [None]:
def change(category):
    """Collapse an encoded usage category into a user / non-user flag.

    Assumptions used for the binarisation:
      * 1 - user: the drug was used in the last decade, year, month, week
        or day (encoded categories 2 to 6).
      * 0 - non-user: the drug was never used or was used over a decade ago
        (encoded categories 0 and 1).

    Any value outside 0..6 is returned unchanged.
    """
    if category in (2, 3, 4, 5, 6):
        return 1
    if category in (0, 1):
        return 0
    return category

In [None]:
# Turn every drug column into a 0/1 (non-user / user) flag by passing each
# value through change().
for column in drug_names:
    binarised = drug_data[column].apply(change)
    drug_data[column] = binarised

In [None]:
# Distinct-value count per column, displayed by the notebook; used to confirm
# that the drug (target) columns now hold only two categories.
drug_data.nunique()

In [None]:
# Correlation matrix over all columns, drawn as one large annotated heatmap.
corrmat = drug_data.corr()

plt.figure(figsize=(30, 30))
sns.set(font_scale=1)

axis_labels = drug_data.columns
hm = sns.heatmap(
    corrmat,
    annot=True,
    cmap='coolwarm',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.title("Correlation Between Different Features", fontsize=20)
plt.xticks(fontsize=13, rotation=50)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# For each drug print the column sum and its share of all participants.
# (Presumes the drug columns were already binarised to 0/1 by the earlier
# cell, so the sum is the number of users.)
total_data = len(drug_data)
for column in drug_names:
    user_count = drug_data[column].sum()
    share_text = ", % of total = {:.2f}".format(user_count * 100 / total_data)
    print("Total", column, "users =", user_count, share_text)

#Validated Against Research Paper 1

I am choosing the following 6 drugs for my models, considering the variation in the data available about users and non-users:
1. Alcohol, 2. Amphetamine, 3. Benzodiazepine, 4. Cannabis, 5. Ecstasy, 6. Nicotine

In [None]:
# Input matrix shared by every per-drug model: only the demographic and
# personality columns (the list is named `dependent_features`, but these are
# the independent variables). The row count is shown as the cell output.
feature_dataset = drug_data.loc[:, dependent_features]
feature_dataset.shape[0]

# Creating 4 Models for Alcohol User Classification

In [None]:
# Inputs are the shared feature table; the target is the one-column
# "Alcohol" frame.
Y_alcohol = drug_data.loc[:, ["Alcohol"]]
X_alcohol = feature_dataset

# Hold out 33% of the rows for testing, with a fixed seed.
alcohol_split = train_test_split(X_alcohol, Y_alcohol, random_state=42, test_size=0.33)
X_alcohol_train, X_alcohol_test, y_alcohol_train, y_alcohol_test = alcohol_split
print("Training split input- ", X_alcohol_train.shape)
print("Testing split input- ", X_alcohol_test.shape)

In [None]:
#Build Decision Tree Model
# random_state fixes how ties between equally good splits are broken, so the
# tree and every metric/plot derived from it are reproducible between runs.
# The seed matches the one used for train_test_split in this notebook.
alcohol_DT_classifier = DecisionTreeClassifier(random_state=42)
# Flatten the one-column target frame to 1-D, as the KNN/RF/SVM cells do.
alcohol_DT_classifier.fit(X_alcohol_train, y_alcohol_train.values.ravel())

# Predicting the values of test data
y_alcohol_DT_pred = alcohol_DT_classifier.predict(X_alcohol_test)
print("Classification report - \n", classification_report(y_alcohol_test, y_alcohol_DT_pred))

In [None]:
# Draw the fitted alcohol decision tree on a very large canvas and save it
# as a PNG in the current working directory.
plt.figure(figsize=(250, 250))
dec_tree = plot_tree(
    decision_tree=alcohol_DT_classifier,
    feature_names=X_alcohol.columns,
    class_names=["0", "1"],
    rounded=True,
    filled=True,
    precision=4,
)
plt.savefig("Decision Tree Alcohol.png")

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# The one-column target frame is flattened to 1-D before fitting.
alcohol_KNN_target = y_alcohol_train.to_numpy().ravel()
alcohol_KNN_classifier = KNeighborsClassifier().fit(X_alcohol_train, alcohol_KNN_target)
y_alcohol_KNN_pred = alcohol_KNN_classifier.predict(X_alcohol_test)
alcohol_KNN_report = classification_report(y_alcohol_test, y_alcohol_KNN_pred)
print("Classification report - \n", alcohol_KNN_report)

In [None]:
#Build RF Model
# 200 trees split on entropy. random_state fixes the bootstrap samples and
# the per-split feature subsets so the forest, its report and the later
# plots are reproducible between runs (same seed as train_test_split).
alcohol_RF_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
alcohol_RF_classifier.fit(X_alcohol_train, y_alcohol_train.values.ravel())

y_alcohol_RF_pred = alcohol_RF_classifier.predict(X_alcohol_test)
print("Classification report - \n", classification_report(y_alcohol_test, y_alcohol_RF_pred))

In [None]:
# Support-vector classifier with an RBF kernel (C=2, gamma='auto').
# NOTE(review): the other drug sections of this notebook use
# kernel='linear'; confirm that 'rbf' is intentional for alcohol.
alcohol_SVM_target = y_alcohol_train.to_numpy().ravel()
alcohol_SVM_classifier = SVC(C=2, kernel='rbf', gamma='auto').fit(X_alcohol_train, alcohol_SVM_target)
y_alcohol_SVM_pred = alcohol_SVM_classifier.predict(X_alcohol_test)
alcohol_SVM_report = classification_report(y_alcohol_test, y_alcohol_SVM_pred)
print("Classification report - \n", alcohol_SVM_report)


In [None]:
# 2x2 grid of confusion-matrix heatmaps, one per alcohol classifier; each
# panel title reports that model's accuracy on the test split.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Alcohol User Confusion Matrix', fontsize=30)

alcohol_DT_cm = confusion_matrix(y_alcohol_test, y_alcohol_DT_pred)
alcohol_KNN_cm = confusion_matrix(y_alcohol_test, y_alcohol_KNN_pred)
alcohol_RF_cm = confusion_matrix(y_alcohol_test, y_alcohol_RF_pred)
alcohol_SVM_cm = confusion_matrix(y_alcohol_test, y_alcohol_SVM_pred)

# (title template, confusion matrix, fitted model, target axes)
alcohol_panels = [
    ('DT Accuracy Score: {0:.4f}', alcohol_DT_cm, alcohol_DT_classifier, ax[0][0]),
    ('KNN Accuracy Score: {0:.4f}', alcohol_KNN_cm, alcohol_KNN_classifier, ax[0][1]),
    ('RF Accuracy Score: {0:.4f}', alcohol_RF_cm, alcohol_RF_classifier, ax[1][0]),
    ('SVM Accuracy Score: {0:.4f}', alcohol_SVM_cm, alcohol_SVM_classifier, ax[1][1]),
]
for title_template, matrix, model, panel in alcohol_panels:
    sns.heatmap(data=matrix, linewidths=.5, annot=True, square=True, cmap='Blues', fmt='g', ax=panel)
    all_sample_title = title_template.format(model.score(X_alcohol_test, y_alcohol_test))
    panel.set_title(all_sample_title, size=15)

# Same axis captions on every panel.
for panel in ax.flat:
    panel.set_xlabel('Predicted label', fontsize=15)
    panel.set_ylabel('Actual label', fontsize=15)

fig.subplots_adjust(hspace=0.5)

In [None]:
#Plotting ROC Curves for all models
# roc_curve / roc_auc_score need a continuous score for the positive class.
# Hard 0/1 labels from predict() give only one operating point per model, so
# the "curve" is two straight segments and the AUC equals balanced accuracy.
# NOTE(review): column 1 of predict_proba is taken to be class 1 ("user");
# this assumes both classes occur in the training split - confirm.
alcohol_DT_score = alcohol_DT_classifier.predict_proba(X_alcohol_test)[:, 1]
alcohol_KNN_score = alcohol_KNN_classifier.predict_proba(X_alcohol_test)[:, 1]
alcohol_RF_score = alcohol_RF_classifier.predict_proba(X_alcohol_test)[:, 1]
# The SVC is built without probability=True, so use its signed margin.
alcohol_SVM_score = alcohol_SVM_classifier.decision_function(X_alcohol_test)

fpr1, tpr1, thresholds1 = roc_curve(y_alcohol_test, alcohol_DT_score)
auc_alcohol_DT = round(metrics.roc_auc_score(y_alcohol_test, alcohol_DT_score), 4)
fpr2, tpr2, thresholds2 = roc_curve(y_alcohol_test, alcohol_KNN_score)
auc_alcohol_KNN = round(metrics.roc_auc_score(y_alcohol_test, alcohol_KNN_score), 4)
fpr3, tpr3, thresholds3 = roc_curve(y_alcohol_test, alcohol_RF_score)
auc_alcohol_RF = round(metrics.roc_auc_score(y_alcohol_test, alcohol_RF_score), 4)
fpr4, tpr4, thresholds4 = roc_curve(y_alcohol_test, alcohol_SVM_score)
auc_alcohol_SVM = round(metrics.roc_auc_score(y_alcohol_test, alcohol_SVM_score), 4)

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr1, tpr1, label="DT, AUC=" + str(auc_alcohol_DT))
plt.plot(fpr2, tpr2, label="KNN, AUC=" + str(auc_alcohol_KNN))
plt.plot(fpr3, tpr3, label="RF, AUC=" + str(auc_alcohol_RF))
plt.plot(fpr4, tpr4, label="SVM, AUC=" + str(auc_alcohol_SVM))

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')
plt.show()

# Creating 4 Models for Amphetamine User Classification

In [None]:
# Inputs are the shared feature table; the target is the one-column
# "Amphetamine" frame.
Y_amphetamine = drug_data.loc[:, ["Amphetamine"]]
X_amphetamine = feature_dataset

# Hold out 33% of the rows for testing, with a fixed seed.
amphetamine_split = train_test_split(X_amphetamine, Y_amphetamine, random_state=42, test_size=0.33)
X_amphetamine_train, X_amphetamine_test, y_amphetamine_train, y_amphetamine_test = amphetamine_split
print("Training split input- ", X_amphetamine_train.shape)
print("Testing split input- ", X_amphetamine_test.shape)

In [None]:
#Build Decision Tree Model
# random_state fixes how ties between equally good splits are broken, so the
# tree and every metric/plot derived from it are reproducible between runs.
# The seed matches the one used for train_test_split in this notebook.
amphetamine_DT_classifier = DecisionTreeClassifier(random_state=42)
# Flatten the one-column target frame to 1-D, as the KNN/RF/SVM cells do.
amphetamine_DT_classifier.fit(X_amphetamine_train, y_amphetamine_train.values.ravel())

# Predicting the values of test data
y_amphetamine_DT_pred = amphetamine_DT_classifier.predict(X_amphetamine_test)
print("Classification report - \n", classification_report(y_amphetamine_test, y_amphetamine_DT_pred))

In [None]:
# Draw the fitted amphetamine decision tree on a very large canvas and save
# it as a PNG in the current working directory.
plt.figure(figsize=(250, 250))
dec_tree = plot_tree(
    decision_tree=amphetamine_DT_classifier,
    feature_names=X_amphetamine.columns,
    class_names=["0", "1"],
    rounded=True,
    filled=True,
    precision=4,
)
plt.savefig("Decision Tree Amphetamine.png")

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# The one-column target frame is flattened to 1-D before fitting.
amphetamine_KNN_target = y_amphetamine_train.to_numpy().ravel()
amphetamine_KNN_classifier = KNeighborsClassifier().fit(X_amphetamine_train, amphetamine_KNN_target)
y_amphetamine_KNN_pred = amphetamine_KNN_classifier.predict(X_amphetamine_test)
amphetamine_KNN_report = classification_report(y_amphetamine_test, y_amphetamine_KNN_pred)
print("Classification report - \n", amphetamine_KNN_report)

In [None]:
#Build RF Model
# 200 trees split on entropy. random_state fixes the bootstrap samples and
# the per-split feature subsets so the forest, its report and the later
# plots are reproducible between runs (same seed as train_test_split).
amphetamine_RF_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
amphetamine_RF_classifier.fit(X_amphetamine_train, y_amphetamine_train.values.ravel())

y_amphetamine_RF_pred = amphetamine_RF_classifier.predict(X_amphetamine_test)
print("Classification report - \n", classification_report(y_amphetamine_test, y_amphetamine_RF_pred))

In [None]:
# Support-vector classifier with a linear kernel and C=2.
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/
# sigmoid kernels, so gamma='auto' presumably has no effect here - confirm.
amphetamine_SVM_target = y_amphetamine_train.to_numpy().ravel()
amphetamine_SVM_classifier = SVC(C=2, kernel='linear', gamma='auto').fit(X_amphetamine_train, amphetamine_SVM_target)
y_amphetamine_SVM_pred = amphetamine_SVM_classifier.predict(X_amphetamine_test)
amphetamine_SVM_report = classification_report(y_amphetamine_test, y_amphetamine_SVM_pred)
print("Classification report - \n", amphetamine_SVM_report)

In [None]:
# 2x2 grid of confusion-matrix heatmaps, one per amphetamine classifier;
# each panel title reports that model's accuracy on the test split.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Amphetamine User Confusion Matrix', fontsize=30)

amphetamine_DT_cm = confusion_matrix(y_amphetamine_test, y_amphetamine_DT_pred)
amphetamine_KNN_cm = confusion_matrix(y_amphetamine_test, y_amphetamine_KNN_pred)
amphetamine_RF_cm = confusion_matrix(y_amphetamine_test, y_amphetamine_RF_pred)
amphetamine_SVM_cm = confusion_matrix(y_amphetamine_test, y_amphetamine_SVM_pred)

# (title template, confusion matrix, fitted model, target axes)
amphetamine_panels = [
    ('DT Accuracy Score: {0:.4f}', amphetamine_DT_cm, amphetamine_DT_classifier, ax[0][0]),
    ('KNN Accuracy Score: {0:.4f}', amphetamine_KNN_cm, amphetamine_KNN_classifier, ax[0][1]),
    ('RF Accuracy Score: {0:.4f}', amphetamine_RF_cm, amphetamine_RF_classifier, ax[1][0]),
    ('SVM Accuracy Score: {0:.4f}', amphetamine_SVM_cm, amphetamine_SVM_classifier, ax[1][1]),
]
for title_template, matrix, model, panel in amphetamine_panels:
    sns.heatmap(data=matrix, linewidths=.5, annot=True, square=True, cmap='Blues', fmt='g', ax=panel)
    all_sample_title = title_template.format(model.score(X_amphetamine_test, y_amphetamine_test))
    panel.set_title(all_sample_title, size=15)

# Same axis captions on every panel.
for panel in ax.flat:
    panel.set_xlabel('Predicted label', fontsize=15)
    panel.set_ylabel('Actual label', fontsize=15)

fig.subplots_adjust(hspace=0.5)

In [None]:
#Plotting ROC Curves for all models
# roc_curve / roc_auc_score need a continuous score for the positive class.
# Hard 0/1 labels from predict() give only one operating point per model, so
# the "curve" is two straight segments and the AUC equals balanced accuracy.
# NOTE(review): column 1 of predict_proba is taken to be class 1 ("user");
# this assumes both classes occur in the training split - confirm.
amphetamine_DT_score = amphetamine_DT_classifier.predict_proba(X_amphetamine_test)[:, 1]
amphetamine_KNN_score = amphetamine_KNN_classifier.predict_proba(X_amphetamine_test)[:, 1]
amphetamine_RF_score = amphetamine_RF_classifier.predict_proba(X_amphetamine_test)[:, 1]
# The SVC is built without probability=True, so use its signed margin.
amphetamine_SVM_score = amphetamine_SVM_classifier.decision_function(X_amphetamine_test)

fpr1, tpr1, thresholds1 = roc_curve(y_amphetamine_test, amphetamine_DT_score)
auc_amphetamine_DT = round(metrics.roc_auc_score(y_amphetamine_test, amphetamine_DT_score), 4)
fpr2, tpr2, thresholds2 = roc_curve(y_amphetamine_test, amphetamine_KNN_score)
auc_amphetamine_KNN = round(metrics.roc_auc_score(y_amphetamine_test, amphetamine_KNN_score), 4)
fpr3, tpr3, thresholds3 = roc_curve(y_amphetamine_test, amphetamine_RF_score)
auc_amphetamine_RF = round(metrics.roc_auc_score(y_amphetamine_test, amphetamine_RF_score), 4)
fpr4, tpr4, thresholds4 = roc_curve(y_amphetamine_test, amphetamine_SVM_score)
auc_amphetamine_SVM = round(metrics.roc_auc_score(y_amphetamine_test, amphetamine_SVM_score), 4)

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr1, tpr1, label="DT, AUC=" + str(auc_amphetamine_DT))
plt.plot(fpr2, tpr2, label="KNN, AUC=" + str(auc_amphetamine_KNN))
plt.plot(fpr3, tpr3, label="RF, AUC=" + str(auc_amphetamine_RF))
plt.plot(fpr4, tpr4, label="SVM, AUC=" + str(auc_amphetamine_SVM))

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')
plt.show()

# Creating 4 Models for Benzodiazepine User Classification

In [None]:
# Inputs are the shared feature table; the target is the one-column
# "Benzodiazepine" frame.
Y_benzodiazepine = drug_data.loc[:, ["Benzodiazepine"]]
X_benzodiazepine = feature_dataset

# Hold out 33% of the rows for testing, with a fixed seed.
benzodiazepine_split = train_test_split(X_benzodiazepine, Y_benzodiazepine, random_state=42, test_size=0.33)
X_benzodiazepine_train, X_benzodiazepine_test, y_benzodiazepine_train, y_benzodiazepine_test = benzodiazepine_split
print("Training split input- ", X_benzodiazepine_train.shape)
print("Testing split input- ", X_benzodiazepine_test.shape)

In [None]:
#Build Decision Tree Model
# random_state fixes how ties between equally good splits are broken, so the
# tree and every metric/plot derived from it are reproducible between runs.
# The seed matches the one used for train_test_split in this notebook.
benzodiazepine_DT_classifier = DecisionTreeClassifier(random_state=42)
# Flatten the one-column target frame to 1-D, as the KNN/RF/SVM cells do.
benzodiazepine_DT_classifier.fit(X_benzodiazepine_train, y_benzodiazepine_train.values.ravel())

# Predicting the values of test data
y_benzodiazepine_DT_pred = benzodiazepine_DT_classifier.predict(X_benzodiazepine_test)
print("Classification report - \n", classification_report(y_benzodiazepine_test, y_benzodiazepine_DT_pred))

In [None]:
# Draw the fitted benzodiazepine decision tree on a very large canvas and
# save it as a PNG in the current working directory.
plt.figure(figsize=(250, 250))
dec_tree = plot_tree(
    decision_tree=benzodiazepine_DT_classifier,
    feature_names=X_benzodiazepine.columns,
    class_names=["0", "1"],
    rounded=True,
    filled=True,
    precision=4,
)
plt.savefig("Decision Tree Benzodiazepine.png")

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# The one-column target frame is flattened to 1-D before fitting.
benzodiazepine_KNN_target = y_benzodiazepine_train.to_numpy().ravel()
benzodiazepine_KNN_classifier = KNeighborsClassifier().fit(X_benzodiazepine_train, benzodiazepine_KNN_target)
y_benzodiazepine_KNN_pred = benzodiazepine_KNN_classifier.predict(X_benzodiazepine_test)
benzodiazepine_KNN_report = classification_report(y_benzodiazepine_test, y_benzodiazepine_KNN_pred)
print("Classification report - \n", benzodiazepine_KNN_report)

In [None]:
#Build RF Model
# 200 trees split on entropy. random_state fixes the bootstrap samples and
# the per-split feature subsets so the forest, its report and the later
# plots are reproducible between runs (same seed as train_test_split).
benzodiazepine_RF_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
benzodiazepine_RF_classifier.fit(X_benzodiazepine_train, y_benzodiazepine_train.values.ravel())

y_benzodiazepine_RF_pred = benzodiazepine_RF_classifier.predict(X_benzodiazepine_test)
print("Classification report - \n", classification_report(y_benzodiazepine_test, y_benzodiazepine_RF_pred))

In [None]:
# Support-vector classifier with a linear kernel and C=2.
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/
# sigmoid kernels, so gamma='auto' presumably has no effect here - confirm.
benzodiazepine_SVM_target = y_benzodiazepine_train.to_numpy().ravel()
benzodiazepine_SVM_classifier = SVC(C=2, kernel='linear', gamma='auto').fit(X_benzodiazepine_train, benzodiazepine_SVM_target)
y_benzodiazepine_SVM_pred = benzodiazepine_SVM_classifier.predict(X_benzodiazepine_test)
benzodiazepine_SVM_report = classification_report(y_benzodiazepine_test, y_benzodiazepine_SVM_pred)
print("Classification report - \n", benzodiazepine_SVM_report)

In [None]:
# 2x2 grid of confusion-matrix heatmaps, one per benzodiazepine classifier;
# each panel title reports that model's accuracy on the test split.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Benzodiazepine User Confusion Matrix', fontsize=30)

benzodiazepine_DT_cm = confusion_matrix(y_benzodiazepine_test, y_benzodiazepine_DT_pred)
benzodiazepine_KNN_cm = confusion_matrix(y_benzodiazepine_test, y_benzodiazepine_KNN_pred)
benzodiazepine_RF_cm = confusion_matrix(y_benzodiazepine_test, y_benzodiazepine_RF_pred)
benzodiazepine_SVM_cm = confusion_matrix(y_benzodiazepine_test, y_benzodiazepine_SVM_pred)

# (title template, confusion matrix, fitted model, target axes)
benzodiazepine_panels = [
    ('DT Accuracy Score: {0:.4f}', benzodiazepine_DT_cm, benzodiazepine_DT_classifier, ax[0][0]),
    ('KNN Accuracy Score: {0:.4f}', benzodiazepine_KNN_cm, benzodiazepine_KNN_classifier, ax[0][1]),
    ('RF Accuracy Score: {0:.4f}', benzodiazepine_RF_cm, benzodiazepine_RF_classifier, ax[1][0]),
    ('SVM Accuracy Score: {0:.4f}', benzodiazepine_SVM_cm, benzodiazepine_SVM_classifier, ax[1][1]),
]
for title_template, matrix, model, panel in benzodiazepine_panels:
    sns.heatmap(data=matrix, linewidths=.5, annot=True, square=True, cmap='Blues', fmt='g', ax=panel)
    all_sample_title = title_template.format(model.score(X_benzodiazepine_test, y_benzodiazepine_test))
    panel.set_title(all_sample_title, size=15)

# Same axis captions on every panel.
for panel in ax.flat:
    panel.set_xlabel('Predicted label', fontsize=15)
    panel.set_ylabel('Actual label', fontsize=15)

fig.subplots_adjust(hspace=0.5)

In [None]:
#Plotting ROC Curves for all models
# roc_curve / roc_auc_score need a continuous score for the positive class.
# Hard 0/1 labels from predict() give only one operating point per model, so
# the "curve" is two straight segments and the AUC equals balanced accuracy.
# NOTE(review): column 1 of predict_proba is taken to be class 1 ("user");
# this assumes both classes occur in the training split - confirm.
benzodiazepine_DT_score = benzodiazepine_DT_classifier.predict_proba(X_benzodiazepine_test)[:, 1]
benzodiazepine_KNN_score = benzodiazepine_KNN_classifier.predict_proba(X_benzodiazepine_test)[:, 1]
benzodiazepine_RF_score = benzodiazepine_RF_classifier.predict_proba(X_benzodiazepine_test)[:, 1]
# The SVC is built without probability=True, so use its signed margin.
benzodiazepine_SVM_score = benzodiazepine_SVM_classifier.decision_function(X_benzodiazepine_test)

fpr1, tpr1, thresholds1 = roc_curve(y_benzodiazepine_test, benzodiazepine_DT_score)
auc_benzodiazepine_DT = round(metrics.roc_auc_score(y_benzodiazepine_test, benzodiazepine_DT_score), 4)
fpr2, tpr2, thresholds2 = roc_curve(y_benzodiazepine_test, benzodiazepine_KNN_score)
auc_benzodiazepine_KNN = round(metrics.roc_auc_score(y_benzodiazepine_test, benzodiazepine_KNN_score), 4)
fpr3, tpr3, thresholds3 = roc_curve(y_benzodiazepine_test, benzodiazepine_RF_score)
auc_benzodiazepine_RF = round(metrics.roc_auc_score(y_benzodiazepine_test, benzodiazepine_RF_score), 4)
fpr4, tpr4, thresholds4 = roc_curve(y_benzodiazepine_test, benzodiazepine_SVM_score)
auc_benzodiazepine_SVM = round(metrics.roc_auc_score(y_benzodiazepine_test, benzodiazepine_SVM_score), 4)

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr1, tpr1, label="DT, AUC=" + str(auc_benzodiazepine_DT))
plt.plot(fpr2, tpr2, label="KNN, AUC=" + str(auc_benzodiazepine_KNN))
plt.plot(fpr3, tpr3, label="RF, AUC=" + str(auc_benzodiazepine_RF))
plt.plot(fpr4, tpr4, label="SVM, AUC=" + str(auc_benzodiazepine_SVM))

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')
plt.show()

# Creating 4 Models for Cannabis User Classification

In [None]:
# Inputs are the shared feature table; the target is the one-column
# "Cannabis" frame.
Y_cannabis = drug_data.loc[:, ["Cannabis"]]
X_cannabis = feature_dataset

# Hold out 33% of the rows for testing, with a fixed seed.
cannabis_split = train_test_split(X_cannabis, Y_cannabis, random_state=42, test_size=0.33)
X_cannabis_train, X_cannabis_test, y_cannabis_train, y_cannabis_test = cannabis_split
print("Training split input- ", X_cannabis_train.shape)
print("Testing split input- ", X_cannabis_test.shape)

In [None]:
#Build Decision Tree Model
# random_state fixes how ties between equally good splits are broken, so the
# tree and every metric/plot derived from it are reproducible between runs.
# The seed matches the one used for train_test_split in this notebook.
cannabis_DT_classifier = DecisionTreeClassifier(random_state=42)
# Flatten the one-column target frame to 1-D, as the KNN/RF/SVM cells do.
cannabis_DT_classifier.fit(X_cannabis_train, y_cannabis_train.values.ravel())

# Predicting the values of test data
y_cannabis_DT_pred = cannabis_DT_classifier.predict(X_cannabis_test)
print("Classification report - \n", classification_report(y_cannabis_test, y_cannabis_DT_pred))

In [None]:
# Draw the fitted cannabis decision tree on a very large canvas and save it
# as a PNG in the current working directory.
plt.figure(figsize=(250, 250))
dec_tree = plot_tree(
    decision_tree=cannabis_DT_classifier,
    feature_names=X_cannabis.columns,
    class_names=["0", "1"],
    rounded=True,
    filled=True,
    precision=4,
)
plt.savefig("Decision Tree Cannabis.png")

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# The one-column target frame is flattened to 1-D before fitting.
cannabis_KNN_target = y_cannabis_train.to_numpy().ravel()
cannabis_KNN_classifier = KNeighborsClassifier().fit(X_cannabis_train, cannabis_KNN_target)
y_cannabis_KNN_pred = cannabis_KNN_classifier.predict(X_cannabis_test)
cannabis_KNN_report = classification_report(y_cannabis_test, y_cannabis_KNN_pred)
print("Classification report - \n", cannabis_KNN_report)

In [None]:
#Build RF Model
# 200 trees split on entropy. random_state fixes the bootstrap samples and
# the per-split feature subsets so the forest, its report and the later
# plots are reproducible between runs (same seed as train_test_split).
cannabis_RF_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
cannabis_RF_classifier.fit(X_cannabis_train, y_cannabis_train.values.ravel())

y_cannabis_RF_pred = cannabis_RF_classifier.predict(X_cannabis_test)
print("Classification report - \n", classification_report(y_cannabis_test, y_cannabis_RF_pred))

In [None]:
# Support-vector classifier with a linear kernel and C=2.
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/
# sigmoid kernels, so gamma='auto' presumably has no effect here - confirm.
cannabis_SVM_target = y_cannabis_train.to_numpy().ravel()
cannabis_SVM_classifier = SVC(C=2, kernel='linear', gamma='auto').fit(X_cannabis_train, cannabis_SVM_target)
y_cannabis_SVM_pred = cannabis_SVM_classifier.predict(X_cannabis_test)
cannabis_SVM_report = classification_report(y_cannabis_test, y_cannabis_SVM_pred)
print("Classification report - \n", cannabis_SVM_report)

In [None]:
# 2x2 grid of confusion-matrix heatmaps, one per cannabis classifier; each
# panel title reports that model's accuracy on the test split.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Cannabis User Confusion Matrix', fontsize=30)

cannabis_DT_cm = confusion_matrix(y_cannabis_test, y_cannabis_DT_pred)
cannabis_KNN_cm = confusion_matrix(y_cannabis_test, y_cannabis_KNN_pred)
cannabis_RF_cm = confusion_matrix(y_cannabis_test, y_cannabis_RF_pred)
cannabis_SVM_cm = confusion_matrix(y_cannabis_test, y_cannabis_SVM_pred)

# (title template, confusion matrix, fitted model, target axes)
cannabis_panels = [
    ('DT Accuracy Score: {0:.4f}', cannabis_DT_cm, cannabis_DT_classifier, ax[0][0]),
    ('KNN Accuracy Score: {0:.4f}', cannabis_KNN_cm, cannabis_KNN_classifier, ax[0][1]),
    ('RF Accuracy Score: {0:.4f}', cannabis_RF_cm, cannabis_RF_classifier, ax[1][0]),
    ('SVM Accuracy Score: {0:.4f}', cannabis_SVM_cm, cannabis_SVM_classifier, ax[1][1]),
]
for title_template, matrix, model, panel in cannabis_panels:
    sns.heatmap(data=matrix, linewidths=.5, annot=True, square=True, cmap='Blues', fmt='g', ax=panel)
    all_sample_title = title_template.format(model.score(X_cannabis_test, y_cannabis_test))
    panel.set_title(all_sample_title, size=15)

# Same axis captions on every panel.
for panel in ax.flat:
    panel.set_xlabel('Predicted label', fontsize=15)
    panel.set_ylabel('Actual label', fontsize=15)

fig.subplots_adjust(hspace=0.5)

In [None]:
#Plotting ROC Curves for all models
# roc_curve / roc_auc_score need a continuous score for the positive class.
# Hard 0/1 labels from predict() give only one operating point per model, so
# the "curve" is two straight segments and the AUC equals balanced accuracy.
# NOTE(review): column 1 of predict_proba is taken to be class 1 ("user");
# this assumes both classes occur in the training split - confirm.
cannabis_DT_score = cannabis_DT_classifier.predict_proba(X_cannabis_test)[:, 1]
cannabis_KNN_score = cannabis_KNN_classifier.predict_proba(X_cannabis_test)[:, 1]
cannabis_RF_score = cannabis_RF_classifier.predict_proba(X_cannabis_test)[:, 1]
# The SVC is built without probability=True, so use its signed margin.
cannabis_SVM_score = cannabis_SVM_classifier.decision_function(X_cannabis_test)

fpr1, tpr1, thresholds1 = roc_curve(y_cannabis_test, cannabis_DT_score)
auc_cannabis_DT = round(metrics.roc_auc_score(y_cannabis_test, cannabis_DT_score), 4)
fpr2, tpr2, thresholds2 = roc_curve(y_cannabis_test, cannabis_KNN_score)
auc_cannabis_KNN = round(metrics.roc_auc_score(y_cannabis_test, cannabis_KNN_score), 4)
fpr3, tpr3, thresholds3 = roc_curve(y_cannabis_test, cannabis_RF_score)
auc_cannabis_RF = round(metrics.roc_auc_score(y_cannabis_test, cannabis_RF_score), 4)
fpr4, tpr4, thresholds4 = roc_curve(y_cannabis_test, cannabis_SVM_score)
auc_cannabis_SVM = round(metrics.roc_auc_score(y_cannabis_test, cannabis_SVM_score), 4)

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr1, tpr1, label="DT, AUC=" + str(auc_cannabis_DT))
plt.plot(fpr2, tpr2, label="KNN, AUC=" + str(auc_cannabis_KNN))
plt.plot(fpr3, tpr3, label="RF, AUC=" + str(auc_cannabis_RF))
plt.plot(fpr4, tpr4, label="SVM, AUC=" + str(auc_cannabis_SVM))

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')
plt.show()

# Creating 4 Models for Ecstasy User Classification

In [None]:
# Inputs are the shared feature table; the target is the one-column
# "Ecstasy" frame.
Y_ecstasy = drug_data.loc[:, ["Ecstasy"]]
X_ecstasy = feature_dataset

# Hold out 33% of the rows for testing, with a fixed seed.
ecstasy_split = train_test_split(X_ecstasy, Y_ecstasy, random_state=42, test_size=0.33)
X_ecstasy_train, X_ecstasy_test, y_ecstasy_train, y_ecstasy_test = ecstasy_split
print("Training split input- ", X_ecstasy_train.shape)
print("Testing split input- ", X_ecstasy_test.shape)

In [None]:
#Build Decision Tree Model
# random_state fixes how ties between equally good splits are broken, so the
# tree and every metric/plot derived from it are reproducible between runs.
# The seed matches the one used for train_test_split in this notebook.
ecstasy_DT_classifier = DecisionTreeClassifier(random_state=42)
# Flatten the one-column target frame to 1-D, as the KNN/RF/SVM cells do.
ecstasy_DT_classifier.fit(X_ecstasy_train, y_ecstasy_train.values.ravel())

# Predicting the values of test data
y_ecstasy_DT_pred = ecstasy_DT_classifier.predict(X_ecstasy_test)
print("Classification report - \n", classification_report(y_ecstasy_test, y_ecstasy_DT_pred))

In [None]:
# Draw the fitted ecstasy decision tree on a very large canvas and save it
# as a PNG in the current working directory.
plt.figure(figsize=(250, 250))
dec_tree = plot_tree(
    decision_tree=ecstasy_DT_classifier,
    feature_names=X_ecstasy.columns,
    class_names=["0", "1"],
    rounded=True,
    filled=True,
    precision=4,
)
plt.savefig("Decision Tree Ecstasy.png")

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# The one-column target frame is flattened to 1-D before fitting.
ecstasy_KNN_target = y_ecstasy_train.to_numpy().ravel()
ecstasy_KNN_classifier = KNeighborsClassifier().fit(X_ecstasy_train, ecstasy_KNN_target)
y_ecstasy_KNN_pred = ecstasy_KNN_classifier.predict(X_ecstasy_test)
ecstasy_KNN_report = classification_report(y_ecstasy_test, y_ecstasy_KNN_pred)
print("Classification report - \n", ecstasy_KNN_report)

In [None]:
#Build RF Model
# 200 trees split on entropy. random_state fixes the bootstrap samples and
# the per-split feature subsets so the forest, its report and the later
# plots are reproducible between runs (same seed as train_test_split).
ecstasy_RF_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
ecstasy_RF_classifier.fit(X_ecstasy_train, y_ecstasy_train.values.ravel())

y_ecstasy_RF_pred = ecstasy_RF_classifier.predict(X_ecstasy_test)
print("Classification report - \n", classification_report(y_ecstasy_test, y_ecstasy_RF_pred))

In [None]:
# Support-vector classifier with a linear kernel and C=2.
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/
# sigmoid kernels, so gamma='auto' presumably has no effect here - confirm.
ecstasy_SVM_target = y_ecstasy_train.to_numpy().ravel()
ecstasy_SVM_classifier = SVC(C=2, kernel='linear', gamma='auto').fit(X_ecstasy_train, ecstasy_SVM_target)
y_ecstasy_SVM_pred = ecstasy_SVM_classifier.predict(X_ecstasy_test)
ecstasy_SVM_report = classification_report(y_ecstasy_test, y_ecstasy_SVM_pred)
print("Classification report - \n", ecstasy_SVM_report)

In [None]:
# 2x2 grid of confusion-matrix heatmaps, one per ecstasy classifier; each
# panel title reports that model's accuracy on the test split.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Ecstasy User Confusion Matrix', fontsize=30)

ecstasy_DT_cm = confusion_matrix(y_ecstasy_test, y_ecstasy_DT_pred)
ecstasy_KNN_cm = confusion_matrix(y_ecstasy_test, y_ecstasy_KNN_pred)
ecstasy_RF_cm = confusion_matrix(y_ecstasy_test, y_ecstasy_RF_pred)
ecstasy_SVM_cm = confusion_matrix(y_ecstasy_test, y_ecstasy_SVM_pred)

# (title template, confusion matrix, fitted model, target axes)
ecstasy_panels = [
    ('DT Accuracy Score: {0:.4f}', ecstasy_DT_cm, ecstasy_DT_classifier, ax[0][0]),
    ('KNN Accuracy Score: {0:.4f}', ecstasy_KNN_cm, ecstasy_KNN_classifier, ax[0][1]),
    ('RF Accuracy Score: {0:.4f}', ecstasy_RF_cm, ecstasy_RF_classifier, ax[1][0]),
    ('SVM Accuracy Score: {0:.4f}', ecstasy_SVM_cm, ecstasy_SVM_classifier, ax[1][1]),
]
for title_template, matrix, model, panel in ecstasy_panels:
    sns.heatmap(data=matrix, linewidths=.5, annot=True, square=True, cmap='Blues', fmt='g', ax=panel)
    all_sample_title = title_template.format(model.score(X_ecstasy_test, y_ecstasy_test))
    panel.set_title(all_sample_title, size=15)

# Same axis captions on every panel.
for panel in ax.flat:
    panel.set_xlabel('Predicted label', fontsize=15)
    panel.set_ylabel('Actual label', fontsize=15)

fig.subplots_adjust(hspace=0.5)

In [None]:
#Plotting ROC Curves for all models
# roc_curve / roc_auc_score need a continuous score for the positive class.
# Hard 0/1 labels from predict() give only one operating point per model, so
# the "curve" is two straight segments and the AUC equals balanced accuracy.
# NOTE(review): column 1 of predict_proba is taken to be class 1 ("user");
# this assumes both classes occur in the training split - confirm.
ecstasy_DT_score = ecstasy_DT_classifier.predict_proba(X_ecstasy_test)[:, 1]
ecstasy_KNN_score = ecstasy_KNN_classifier.predict_proba(X_ecstasy_test)[:, 1]
ecstasy_RF_score = ecstasy_RF_classifier.predict_proba(X_ecstasy_test)[:, 1]
# The SVC is built without probability=True, so use its signed margin.
ecstasy_SVM_score = ecstasy_SVM_classifier.decision_function(X_ecstasy_test)

fpr1, tpr1, thresholds1 = roc_curve(y_ecstasy_test, ecstasy_DT_score)
auc_ecstasy_DT = round(metrics.roc_auc_score(y_ecstasy_test, ecstasy_DT_score), 4)
fpr2, tpr2, thresholds2 = roc_curve(y_ecstasy_test, ecstasy_KNN_score)
auc_ecstasy_KNN = round(metrics.roc_auc_score(y_ecstasy_test, ecstasy_KNN_score), 4)
fpr3, tpr3, thresholds3 = roc_curve(y_ecstasy_test, ecstasy_RF_score)
auc_ecstasy_RF = round(metrics.roc_auc_score(y_ecstasy_test, ecstasy_RF_score), 4)
fpr4, tpr4, thresholds4 = roc_curve(y_ecstasy_test, ecstasy_SVM_score)
auc_ecstasy_SVM = round(metrics.roc_auc_score(y_ecstasy_test, ecstasy_SVM_score), 4)

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr1, tpr1, label="DT, AUC=" + str(auc_ecstasy_DT))
plt.plot(fpr2, tpr2, label="KNN, AUC=" + str(auc_ecstasy_KNN))
plt.plot(fpr3, tpr3, label="RF, AUC=" + str(auc_ecstasy_RF))
plt.plot(fpr4, tpr4, label="SVM, AUC=" + str(auc_ecstasy_SVM))
plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')
plt.show()

# Creating 4 Models for Nicotine User Classification

In [None]:
# Inputs are the shared feature table; the target is the one-column
# "Nicotine" frame.
Y_nicotine = drug_data.loc[:, ["Nicotine"]]
X_nicotine = feature_dataset

# Hold out 33% of the rows for testing, with a fixed seed.
nicotine_split = train_test_split(X_nicotine, Y_nicotine, random_state=42, test_size=0.33)
X_nicotine_train, X_nicotine_test, y_nicotine_train, y_nicotine_test = nicotine_split
print("Training split input- ", X_nicotine_train.shape)
print("Testing split input- ", X_nicotine_test.shape)

In [None]:
#Build Decision Tree Model
# random_state fixes how ties between equally good splits are broken, so the
# tree and every metric/plot derived from it are reproducible between runs.
# The seed matches the one used for train_test_split in this notebook.
nicotine_DT_classifier = DecisionTreeClassifier(random_state=42)
# Flatten the one-column target frame to 1-D, as the KNN/RF/SVM cells do.
nicotine_DT_classifier.fit(X_nicotine_train, y_nicotine_train.values.ravel())

# Predicting the values of test data
y_nicotine_DT_pred = nicotine_DT_classifier.predict(X_nicotine_test)
print("Classification report - \n", classification_report(y_nicotine_test, y_nicotine_DT_pred))

In [None]:
# Draw the fitted nicotine decision tree on a very large canvas and save it
# as a PNG in the current working directory.
plt.figure(figsize=(250, 250))
dec_tree = plot_tree(
    decision_tree=nicotine_DT_classifier,
    feature_names=X_nicotine.columns,
    class_names=["0", "1"],
    rounded=True,
    filled=True,
    precision=4,
)
plt.savefig("Decision Tree Nicotine.png")

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# The one-column target frame is flattened to 1-D before fitting.
nicotine_KNN_target = y_nicotine_train.to_numpy().ravel()
nicotine_KNN_classifier = KNeighborsClassifier().fit(X_nicotine_train, nicotine_KNN_target)
y_nicotine_KNN_pred = nicotine_KNN_classifier.predict(X_nicotine_test)
nicotine_KNN_report = classification_report(y_nicotine_test, y_nicotine_KNN_pred)
print("Classification report - \n", nicotine_KNN_report)

In [None]:
#Build RF Model
# 200 trees split on entropy. random_state fixes the bootstrap samples and
# the per-split feature subsets so the forest, its report and the later
# plots are reproducible between runs (same seed as train_test_split).
nicotine_RF_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
nicotine_RF_classifier.fit(X_nicotine_train, y_nicotine_train.values.ravel())

y_nicotine_RF_pred = nicotine_RF_classifier.predict(X_nicotine_test)
print("Classification report - \n", classification_report(y_nicotine_test, y_nicotine_RF_pred))

In [None]:
# Support-vector classifier with a linear kernel and C=2.
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/
# sigmoid kernels, so gamma='auto' presumably has no effect here - confirm.
nicotine_SVM_target = y_nicotine_train.to_numpy().ravel()
nicotine_SVM_classifier = SVC(C=2, kernel='linear', gamma='auto').fit(X_nicotine_train, nicotine_SVM_target)
y_nicotine_SVM_pred = nicotine_SVM_classifier.predict(X_nicotine_test)
nicotine_SVM_report = classification_report(y_nicotine_test, y_nicotine_SVM_pred)
print("Classification report - \n", nicotine_SVM_report)

In [None]:
# 2x2 grid of confusion-matrix heatmaps, one per nicotine classifier; each
# panel title reports that model's accuracy on the test split.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Nicotine User Confusion Matrix', fontsize=30)

nicotine_DT_cm = confusion_matrix(y_nicotine_test, y_nicotine_DT_pred)
nicotine_KNN_cm = confusion_matrix(y_nicotine_test, y_nicotine_KNN_pred)
nicotine_RF_cm = confusion_matrix(y_nicotine_test, y_nicotine_RF_pred)
nicotine_SVM_cm = confusion_matrix(y_nicotine_test, y_nicotine_SVM_pred)

# (title template, confusion matrix, fitted model, target axes)
nicotine_panels = [
    ('DT Accuracy Score: {0:.4f}', nicotine_DT_cm, nicotine_DT_classifier, ax[0][0]),
    ('KNN Accuracy Score: {0:.4f}', nicotine_KNN_cm, nicotine_KNN_classifier, ax[0][1]),
    ('RF Accuracy Score: {0:.4f}', nicotine_RF_cm, nicotine_RF_classifier, ax[1][0]),
    ('SVM Accuracy Score: {0:.4f}', nicotine_SVM_cm, nicotine_SVM_classifier, ax[1][1]),
]
for title_template, matrix, model, panel in nicotine_panels:
    sns.heatmap(data=matrix, linewidths=.5, annot=True, square=True, cmap='Blues', fmt='g', ax=panel)
    all_sample_title = title_template.format(model.score(X_nicotine_test, y_nicotine_test))
    panel.set_title(all_sample_title, size=15)

# Same axis captions on every panel.
for panel in ax.flat:
    panel.set_xlabel('Predicted label', fontsize=15)
    panel.set_ylabel('Actual label', fontsize=15)

fig.subplots_adjust(hspace=0.5)

In [None]:
#Plotting ROC Curves for all models
# roc_curve / roc_auc_score need a continuous score for the positive class.
# Hard 0/1 labels from predict() give only one operating point per model, so
# the "curve" is two straight segments and the AUC equals balanced accuracy.
# NOTE(review): column 1 of predict_proba is taken to be class 1 ("user");
# this assumes both classes occur in the training split - confirm.
nicotine_DT_score = nicotine_DT_classifier.predict_proba(X_nicotine_test)[:, 1]
nicotine_KNN_score = nicotine_KNN_classifier.predict_proba(X_nicotine_test)[:, 1]
nicotine_RF_score = nicotine_RF_classifier.predict_proba(X_nicotine_test)[:, 1]
# The SVC is built without probability=True, so use its signed margin.
nicotine_SVM_score = nicotine_SVM_classifier.decision_function(X_nicotine_test)

fpr1, tpr1, thresholds1 = roc_curve(y_nicotine_test, nicotine_DT_score)
auc_nicotine_DT = round(metrics.roc_auc_score(y_nicotine_test, nicotine_DT_score), 4)
fpr2, tpr2, thresholds2 = roc_curve(y_nicotine_test, nicotine_KNN_score)
auc_nicotine_KNN = round(metrics.roc_auc_score(y_nicotine_test, nicotine_KNN_score), 4)
fpr3, tpr3, thresholds3 = roc_curve(y_nicotine_test, nicotine_RF_score)
auc_nicotine_RF = round(metrics.roc_auc_score(y_nicotine_test, nicotine_RF_score), 4)
fpr4, tpr4, thresholds4 = roc_curve(y_nicotine_test, nicotine_SVM_score)
auc_nicotine_SVM = round(metrics.roc_auc_score(y_nicotine_test, nicotine_SVM_score), 4)

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr1, tpr1, label="DT, AUC=" + str(auc_nicotine_DT))
plt.plot(fpr2, tpr2, label="KNN, AUC=" + str(auc_nicotine_KNN))
plt.plot(fpr3, tpr3, label="RF, AUC=" + str(auc_nicotine_RF))
plt.plot(fpr4, tpr4, label="SVM, AUC=" + str(auc_nicotine_SVM))

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')
plt.show()