In [2]:
import numpy as np
import pandas as pd
import pickle
import json
from io import StringIO
# Compatibility shim: expose `pd.Int64Index` and `pd.Float64Index` as aliases of `pd.Index`.
# NOTE(review): presumably required because one of the libraries imported below still
# references these names while the installed pandas no longer provides them — confirm,
# and drop the shim once that dependency is upgraded.
setattr(pd, "Int64Index", pd.Index)
setattr(pd, "Float64Index", pd.Index)
import matplotlib.pyplot as plt
from plotly import express
from arrow import now
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from flaml import AutoML, tune
import os 
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

## Importing the data 

In [4]:
# Build the CSV path from the directory the notebook is running in, then load it.
file_name = "data/drug_consumption.csv"
working_dir = os.getcwd()
file_path = os.path.join(working_dir, file_name)
df = pd.read_csv(filepath_or_buffer=file_path)


Create a folder to store the trained models produced by the experiment.

In [5]:
# Create the output folder for the pickled models.
# `exist_ok=True` makes re-running this cell a no-op when the folder is already there,
# while genuine failures (e.g. missing permissions) are raised here instead of being
# silently swallowed and resurfacing later as a confusing error when a model is saved.
os.makedirs('trained_models', exist_ok=True)

Exploratory data analysis (EDA)

In [6]:
# First look at the raw data: leading rows, size, summary statistics and column dtypes.
print(df.head())
print(df.shape)  # (1885, 32) according to the original author's note
# describe must be *called*; printing the bare attribute only shows the bound method.
print(df.describe())
# info() writes its report directly to stdout and returns None, so it is not wrapped
# in print() (which would append a stray "None" line).
df.info()

   ID      Age   Gender  Education  Country  Ethnicity   Nscore   Escore  \
0   1  0.49788  0.48246   -0.05921  0.96082    0.12600  0.31287 -0.57545   
1   2 -0.07854 -0.48246    1.98437  0.96082   -0.31685 -0.67825  1.93886   
2   3  0.49788 -0.48246   -0.05921  0.96082   -0.31685 -0.46725  0.80523   
3   4 -0.95197  0.48246    1.16365  0.96082   -0.31685 -0.14882 -0.80615   
4   5  0.49788  0.48246    1.98437  0.96082   -0.31685  0.73545 -1.63340   

    Oscore   Ascore  ...  Ecstasy  Heroin  Ketamine Legalh  LSD Meth  \
0 -0.58331 -0.91699  ...      CL0     CL0       CL0    CL0  CL0  CL0   
1  1.43533  0.76096  ...      CL4     CL0       CL2    CL0  CL2  CL3   
2 -0.84732 -1.62090  ...      CL0     CL0       CL0    CL0  CL0  CL0   
3 -0.01928  0.59042  ...      CL0     CL0       CL2    CL0  CL0  CL0   
4 -0.45174 -0.30172  ...      CL1     CL0       CL0    CL1  CL0  CL0   

  Mushrooms Nicotine Semer  VSA  
0       CL0      CL2   CL0  CL0  
1       CL0      CL4   CL0  CL0  
2       

Split the columns into numerical (feature) columns and categorical (target) columns

In [7]:
# Every column of the dataset, grouped by kind.
cols = [
       # Numeric columns
       'ID', 'Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'Nscore',
       'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS',
       # Categorical columns
       'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke',
       'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms',
       'Nicotine', 'Semer', 'VSA']

# Model features: the numeric columns without the row identifier `ID`.
float_columns = ['Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'Nscore',
                 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS']

# One classifier is trained per target column.
# Semer is deliberately NOT listed: there are not enough positives for it to be a
# meaningful target. (It used to be in this list even though the comment said it was
# left out.)
targets = ['Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy',
           'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'VSA']

Label-encode the target columns for tree-based classification

In [8]:
# Replace each target's string classes (e.g. 'CL0', 'CL1', ...) with integer codes.
# A separate fitted encoder is kept per target so the integer predictions can later be
# mapped back with `label_encoders[name].inverse_transform(...)`; re-fitting one shared
# encoder would only retain the mapping of the last column processed.
label_encoders = {}
for cat in targets:
    label_encoder = LabelEncoder()
    df[cat] = label_encoder.fit_transform(df[cat])
    label_encoders[cat] = label_encoder

Because this is a small dataset, each model trains quickly: a time budget of 1 second per model should be enough.

In [9]:

# Train one AutoML classifier per target and keep the best estimator of each run.
# Maps target column name -> underlying estimator of the best model found for it.
model_output = {}

# Settings shared by every run; built once because they do not depend on the target.
automl_settings = {
    "time_budget": 1,  # In seconds
    "metric": "accuracy",
    "task": "classification",
    "log_file_name": 'mylog.log',
}

#auto_ml(df, targets)
for target in targets:
    # Fixed random_state so the evaluation step can rebuild exactly the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        df[float_columns], df[target], test_size=0.20, random_state=2024
    )

    automl = AutoML()

    # fit classification on the training data; the result is read from `automl`
    # afterwards, so the return value of fit() is not kept.
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

    # Export the best model
    retrained_model = automl.model
    print(retrained_model)

    # Fail with an explicit message instead of an opaque AttributeError on `.estimator`
    # when the search did not produce any model within the time budget.
    if retrained_model is None:
        raise RuntimeError(
            f"AutoML did not produce a model for target '{target}'; "
            "try increasing 'time_budget'."
        )

    model_output[target] = retrained_model.estimator

    model_path = f'./trained_models/{target}_model.pkl'

    # Save the model to a file
    with open(model_path, 'wb') as f:
        pickle.dump(model_output[target], f)

    # Load the model back from the file we just wrote, to confirm the pickle is readable
    with open(model_path, 'rb') as f:
        retrained_model = pickle.load(f)

print(model_output)

[flaml.automl.logger: 04-01 16:17:38] {1680} INFO - task = classification
[flaml.automl.logger: 04-01 16:17:38] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-01 16:17:38] {1789} INFO - Minimizing error metric: mse
[flaml.automl.logger: 04-01 16:17:38] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 04-01 16:17:38] {2219} INFO - iteration 0, current learner lgbm


TypeError: 'NoneType' object is not callable

In [None]:

# Score every per-target model on its own held-out test split.
time_start = now()

accuracy_results = {}  # target name -> accuracy on the test split
f1_results = {}        # target name -> weighted F1 on the test split

for target in targets:
    # The split must be rebuilt for *each* target, with the same parameters used during
    # training. Doing it once outside the loop relied on the `target` variable left over
    # from the training cell, so every model was fit and scored against the labels of
    # that single (last) target.
    X_train, X_test, y_train, y_test = train_test_split(
        df[float_columns], df[target], test_size=0.20, random_state=2024
    )

    model = model_output[target].fit(X=X_train, y=y_train)

    # Predict once and reuse the predictions for both metrics.
    y_pred = model.predict(X_test)
    f1_results[target] = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
    accuracy_results[target] = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Evaluation finished in {now() - time_start}')


In [None]:
# Show both metric dictionaries, one per line: accuracy first, then weighted F1.
print(accuracy_results, f1_results, sep='\n')

In [None]:
# Bar chart of the weighted F1-score obtained for each target drug.
label = list(f1_results.keys())
f1_score_results = list(f1_results.values())

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(label, f1_score_results)
ax.set_xlabel('Drug')
ax.set_ylabel('f1-score')
ax.set_title('F1-Scores for each Drug')
# There is one bar per drug; rotate the names so they do not overlap on an 8-inch axis.
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()  # keep the rotated labels inside the figure
plt.show()
