In [67]:
import pandas as pd
import numpy as np

cols = ['ID', 'Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'N-Score', 'E-Score', 'O-Score', 'A-Score', 'C-Score', 'Impulsive', 'ImpSS', 'Alcohol', 'Amphet',
        'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'Semer', 'VSA']

drugs_df = pd.read_csv('./data/drug_consumption.data', names=cols)
drugs_df.pop('ID')
drugs_df.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,N-Score,E-Score,O-Score,A-Score,C-Score,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [68]:
cleanUserStatuses = ['CL0', 'CL1', 'CL2']

start_column = 'Alcohol'

start_index = drugs_df.columns.get_loc(start_column)

drug_using_features = drugs_df[['Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'N-Score', 'E-Score', 'O-Score', 'A-Score', 'C-Score', 'Impulsive', 'ImpSS']]

print(drug_using_features.head())

drug_using_labels = pd.DataFrame()

for col_index in range(start_index, len(drugs_df.columns)):
    column_name = drugs_df.columns[col_index]

    drug_using_labels[f'Drug_User_{column_name}'] = np.where(drugs_df[column_name].isin(cleanUserStatuses), 0, 1)

drug_using_labels['Drug_User_Alcohol'].value_counts()

drug_using_labels['Drug_User_Amphet'].value_counts()

drug_using_labels['Drug_User_Choc'].value_counts()

drug_using_labels.head()

drug_using_labels.columns

print(drug_using_features.shape)
print(drug_using_labels.shape)

       Age   Gender  Education  Country  Ethnicity  N-Score  E-Score  O-Score  \
0  0.49788  0.48246   -0.05921  0.96082    0.12600  0.31287 -0.57545 -0.58331   
1 -0.07854 -0.48246    1.98437  0.96082   -0.31685 -0.67825  1.93886  1.43533   
2  0.49788 -0.48246   -0.05921  0.96082   -0.31685 -0.46725  0.80523 -0.84732   
3 -0.95197  0.48246    1.16365  0.96082   -0.31685 -0.14882 -0.80615 -0.01928   
4  0.49788  0.48246    1.98437  0.96082   -0.31685  0.73545 -1.63340 -0.45174   

   A-Score  C-Score  Impulsive    ImpSS  
0 -0.91699 -0.00665   -0.21712 -1.18084  
1  0.76096 -0.14277   -0.71126 -0.21575  
2 -1.62090 -1.01450   -1.37983  0.40148  
3  0.59042  0.58489   -1.37983 -1.18084  
4 -0.30172  1.30612   -0.21712 -0.21575  
(1885, 12)
(1885, 19)


In [71]:
params = dict()

params['n_estimators'] = (50, 300)
params['max_depth'] = (5, 20)
params['min_samples_split'] = (2, 10)
params['min_samples_leaf'] = (1, 10)
params['max_features'] = (0.1, 1.0)
params['criterion'] = ['gini', 'entropy']

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

X_train, X_test, y_train, y_test = train_test_split(drug_using_features, drug_using_labels)

forest_classifier = RandomForestClassifier(n_estimators=100)

multi_target_forest_classifier = MultiOutputClassifier(forest_classifier, n_jobs=1)

crossVal = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

hpSearch = BayesSearchCV(estimator=multi_target_forest_classifier, search_spaces= params , n_jobs=6, cv= crossVal, n_iter=20, verbose=2)

hpSearch.fit(X_train, y_train)



(12, 27)
(19, 27)


ValueError: Found input variables with inconsistent numbers of samples: [12, 19]

In [61]:
y_pred = multi_target_forest_classifier.predict(X_test)

In [64]:
from sklearn.metrics import hamming_loss, accuracy_score, jaccard_score, f1_score, precision_score, recall_score

# Assuming y_test and y_pred are the true and predicted labels, respectively
hamming = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
jaccard = jaccard_score(y_test, y_pred, average='samples')  # 'samples' for multi-label
f1 = f1_score(y_test, y_pred, average='samples')
precision = precision_score(y_test, y_pred, average='samples')
recall = recall_score(y_test, y_pred, average='samples')

print(f"Hamming Loss: {hamming}")
print(f"Accuracy Score: {accuracy}")
print(f"Jaccard Score: {jaccard}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Hamming Loss: 0.14339875111507583
Accuracy Score: 0.2033898305084746
Jaccard Score: 0.6824876367124622
F1 Score: 0.7911599553973747
Precision: 0.8483423356304711
Recall: 0.7915898190792259


  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

print(y_pred.shape)
print(y_test.shape)

transposed_predictions = y_pred.T

for i in range(0, len(y_test.columns)):
    print(f'REPORT FOR --> {y_test.columns[i]}')

    drug_true_labels = y_test[y_test.columns[i]]

    drug_predicted_labels = transposed_predictions[i]

    # print(f"Accuracy: {accuracy_score(drug_true_labels, drug_predicted_labels)}")
    # print(f"Precision: {precision_score(drug_true_labels, drug_predicted_labels)}")
    # print(f"Recall: {recall_score(drug_true_labels, drug_predicted_labels)}")
    # print(confusion_matrix(drug_true_labels, drug_predicted_labels))

    print(f'\n{classification_report(drug_true_labels, drug_predicted_labels)}\n')

    

(472, 19)
(472, 19)
REPORT FOR --> Drug_User_Alcohol

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        36
           1       0.92      1.00      0.96       436

    accuracy                           0.92       472
   macro avg       0.46      0.50      0.48       472
weighted avg       0.85      0.92      0.89       472


REPORT FOR --> Drug_User_Amphet

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       356
           1       0.48      0.30      0.37       116

    accuracy                           0.75       472
   macro avg       0.64      0.60      0.61       472
weighted avg       0.72      0.75      0.73       472


REPORT FOR --> Drug_User_Amyl

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       437
           1       0.00      0.00      0.00        35

    accuracy                           0.93       472
   macro

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr