In [38]:
import pandas as pd
import numpy as np

# Column names for the data file, which is read without a header row.
# The order here must match the column order in the file.
cols = [
    'ID', 'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'N-Score', 'E-Score', 'O-Score', 'A-Score', 'C-Score', 'Impulsive', 'ImpSS',
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke',
    'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth',
    'Mushrooms', 'Nicotine', 'Semer', 'VSA',
]

# Load the raw table and discard the row identifier together with the
# substance columns that are not modelled in this notebook.
drugs_df = pd.read_csv('./data/drug_consumption.data', names=cols).drop(
    columns=['ID', 'Alcohol', 'Crack', 'Semer', 'VSA', 'Choc', 'Caff', 'Amyl', 'Heroin']
)
drugs_df.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,N-Score,E-Score,O-Score,A-Score,C-Score,...,Benzos,Cannabis,Coke,Ecstasy,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine
0,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL2,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL0,CL4,CL3,CL4,CL2,CL0,CL2,CL3,CL0,CL4
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL3,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL3,CL2,CL2,CL0,CL2,CL0,CL0,CL0,CL0,CL2
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL0,CL3,CL0,CL1,CL0,CL1,CL0,CL0,CL2,CL2


In [42]:
# Consumption classes that are encoded as label 0; every other class found in
# a drug column is encoded as label 1.
cleanUserStatuses = ['CL0', 'CL1', 'CL2']

# First drug column: every column from here to the end of drugs_df is turned
# into one binary label.
start_column = 'Amphet'

start_index = drugs_df.columns.get_loc(start_column)

# Model inputs: the demographic and personality-score columns.
drug_using_features = drugs_df[['Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'N-Score', 'E-Score', 'O-Score', 'A-Score', 'C-Score', 'Impulsive', 'ImpSS']]

print(drug_using_features.head())

# Build all label columns in one constructor call. Passing index=drugs_df.index
# keeps the labels aligned row-for-row with drug_using_features by index label,
# not merely by position.
drug_using_labels = pd.DataFrame(
    {
        f'Drug_User_{column_name}': np.where(drugs_df[column_name].isin(cleanUserStatuses), 0, 1)
        for column_name in drugs_df.columns[start_index:]
    },
    index=drugs_df.index,
)

print(drug_using_labels.columns)

print(drug_using_features.shape)
print(drug_using_labels.shape)

       Age   Gender  Education  Country  Ethnicity  N-Score  E-Score  O-Score  \
0  0.49788  0.48246   -0.05921  0.96082    0.12600  0.31287 -0.57545 -0.58331   
1 -0.07854 -0.48246    1.98437  0.96082   -0.31685 -0.67825  1.93886  1.43533   
2  0.49788 -0.48246   -0.05921  0.96082   -0.31685 -0.46725  0.80523 -0.84732   
3 -0.95197  0.48246    1.16365  0.96082   -0.31685 -0.14882 -0.80615 -0.01928   
4  0.49788  0.48246    1.98437  0.96082   -0.31685  0.73545 -1.63340 -0.45174   

   A-Score  C-Score  Impulsive    ImpSS  
0 -0.91699 -0.00665   -0.21712 -1.18084  
1  0.76096 -0.14277   -0.71126 -0.21575  
2 -1.62090 -1.01450   -1.37983  0.40148  
3  0.59042  0.58489   -1.37983 -1.18084  
4 -0.30172  1.30612   -0.21712 -0.21575  
Index(['Drug_User_Amphet', 'Drug_User_Benzos', 'Drug_User_Cannabis',
       'Drug_User_Coke', 'Drug_User_Ecstasy', 'Drug_User_Ketamine',
       'Drug_User_Legalh', 'Drug_User_LSD', 'Drug_User_Meth',
       'Drug_User_Mushrooms', 'Drug_User_Nicotine'],
      dty

In [43]:
# Hyper-parameter search space handed to the search as `search_spaces`.
# Two-element tuples are (low, high) bounds; the list enumerates the allowed
# choices for a categorical setting.
params = {
    'n_estimators': (50, 300),
    'max_depth': (5, 20),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_features': (0.1, 1.0),
    'criterion': ['gini', 'entropy'],
}

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for the final evaluation; the split is seeded so
# the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(drug_using_features, drug_using_labels, test_size=0.2, random_state=42)

label_classes = y_train.columns

# One tuned random forest per label column, keyed by the label's column name.
full_drug_classifier = {}

# The splitter is seeded and keeps no per-label state, so a single instance
# can be shared by every search instead of being rebuilt inside the loop.
crossVal = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

for label in label_classes:
    print(f'FINDING BEST MODEL FOR --> {label}')
    # Seed the forest so the refitted best model is reproducible.
    drug_classifier = RandomForestClassifier(random_state=42)

    drug_labels = y_train[label]

    # Seed the Bayesian optimiser as well; without it the sequence of sampled
    # hyper-parameters (and therefore the selected model) changes between runs.
    hpSearch = BayesSearchCV(estimator=drug_classifier, search_spaces=params, n_jobs=6, n_iter=30, cv=crossVal, verbose=2, random_state=42)

    hpSearch.fit(X_train, drug_labels)

    best_model = hpSearch.best_estimator_

    full_drug_classifier[label] = best_model

    print(f'BEST MODEL FOUND FOR --> {label}')

FINDING BEST MODEL FOR --> Drug_User_Amphet
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 fo

In [45]:
# Predict every label with its own tuned model and gather the results into a
# single frame, one column per label in label_classes order.
# NOTE(review): the columns are plain arrays, so y_pred is indexed 0..n-1 by
# position rather than by X_test's row labels.
y_pred = pd.DataFrame(
    {label: full_drug_classifier[label].predict(X_test) for label in label_classes}
)

In [46]:
# Sanity check: ground truth and predictions should have the same shape.
print(y_test.shape, y_pred.shape, sep='\n')

(377, 11)
(377, 11)


In [48]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Confirm predictions and ground truth have matching shapes before scoring.
print(y_pred.shape)
print(y_test.shape)


# Report the metrics for each label column separately, in column order.
for position, drug_label in enumerate(y_test.columns):
    print(f'REPORT FOR {position} --> {drug_label}')

    actual = y_test[drug_label]
    predicted = y_pred[drug_label]

    print(f"Accuracy: {accuracy_score(actual, predicted)}")
    print(f"Precision: {precision_score(actual, predicted)}")
    print(f"Recall: {recall_score(actual, predicted)}")
    print(confusion_matrix(actual, predicted))

    print(f'\n{classification_report(actual, predicted)}\n')

    

(377, 11)
(377, 11)
REPORT FOR 0 --> Drug_User_Amphet
Accuracy: 0.8037135278514589
Precision: 0.5769230769230769
Recall: 0.36585365853658536
[[273  22]
 [ 52  30]]

              precision    recall  f1-score   support

           0       0.84      0.93      0.88       295
           1       0.58      0.37      0.45        82

    accuracy                           0.80       377
   macro avg       0.71      0.65      0.66       377
weighted avg       0.78      0.80      0.79       377


REPORT FOR 1 --> Drug_User_Benzos
Accuracy: 0.7161803713527851
Precision: 0.5666666666666667
Recall: 0.2956521739130435
[[236  26]
 [ 81  34]]

              precision    recall  f1-score   support

           0       0.74      0.90      0.82       262
           1       0.57      0.30      0.39       115

    accuracy                           0.72       377
   macro avg       0.66      0.60      0.60       377
weighted avg       0.69      0.72      0.69       377


REPORT FOR 2 --> Drug_User_Cannabis