In [150]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_iris

<h1> Modifying 1_12 to fit alcohol dataset</h1>

In [152]:
# Use pandas to read the CSV file into a DataFrame
df = pd.read_csv("./ai1/datasets/dataset_alcohol.csv")

In [153]:
df.shape

(76, 9)

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age_yrs        76 non-null     int64  
 1   height_cm      76 non-null     int64  
 2   weight_kg      76 non-null     int64  
 3   duration_mins  76 non-null     object 
 4   elapsed_mins   76 non-null     object 
 5   sex            76 non-null     object 
 6   last_meal      65 non-null     object 
 7   units          76 non-null     float64
 8   over_limit     76 non-null     object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.5+ KB


In [155]:
df.describe(include="all")

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
count,76.0,76.0,76.0,76.0,76.0,76,65,76.0,76
unique,,,,16.0,9.0,2,4,,2
top,,,,120.0,10.0,Male,Full,,No
freq,,,,15.0,61.0,60,33,,46
mean,22.657895,176.644737,71.486842,,,,,8.632895,
std,5.627439,8.453329,11.474602,,,,,5.775567,
min,18.0,157.0,47.0,,,,,0.0,
25%,19.0,172.0,63.0,,,,,4.275,
50%,21.0,177.0,72.0,,,,,8.4,
75%,23.0,182.0,79.0,,,,,12.1,


In [156]:
df.head()

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
0,40,170,75,?,?,Male,Lunch,0.0,No
1,26,177,76,60,10,Male,Full,2.9,No
2,24,160,60,60,10,Female,Full,2.6,No
3,29,160,63,90,10,Female,Full,1.2,No
4,23,182,63,120,10,Male,Full,5.2,No


In [157]:
# The features
features = ["age_yrs", "height_cm", "weight_kg", "duration_mins", "elapsed_mins", "sex", "last_meal", "units"]

# split features into nominal vs numeric
numeric_features = ["age_yrs", "height_cm", "weight_kg", "units" , "duration_mins", "elapsed_mins"]
nominal_features = ["sex", "last_meal"]

In [158]:
# The values, in the case of nominal-valued features
for feature in nominal_features:
    print(feature, df[feature].unique())

sex ['Male' 'Female']
last_meal ['Lunch' 'Full' 'Snack' '?' nan]


In [159]:
# Delete examples with Nan
# could also reaplce them with imputer later, must replace ? to NAN (was not right move here, both were non-drinkers)
#df.dropna(subset=["last_meal"], inplace=True) << dont do because not much data

#delete question marks
df = (df[(df["duration_mins"] != '?') & (df["elapsed_mins"] != '?') & (df["last_meal"] != '?')]).copy()

#could use df['duration_mins'] = df['duration_mins'].astype('int64')

# Reset the index
df.reset_index(drop=True, inplace=True)

# Check the invalid data was filtered out
df.shape

(73, 9)

In [160]:
# Split off the test set: 20% of the dataset. Note the stratification
dev_df, test_df = train_test_split(df, train_size=0.8, stratify=df["over_limit"], random_state=2)

In [161]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age_yrs        73 non-null     int64  
 1   height_cm      73 non-null     int64  
 2   weight_kg      73 non-null     int64  
 3   duration_mins  73 non-null     object 
 4   elapsed_mins   73 non-null     object 
 5   sex            73 non-null     object 
 6   last_meal      62 non-null     object 
 7   units          73 non-null     float64
 8   over_limit     73 non-null     object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.3+ KB


In [162]:
df.head()

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
0,26,177,76,60,10,Male,Full,2.9,No
1,24,160,60,60,10,Female,Full,2.6,No
2,29,160,63,90,10,Female,Full,1.2,No
3,23,182,63,120,10,Male,Full,5.2,No
4,19,165,51,120,10,Female,Lunch,5.2,Yes


In [163]:
df["duration_mins"].unique()

array(['60', '90', '120', '150', '240', '30', '270', '180', '330', '435',
       '325', '300', '360', '315', '5'], dtype=object)

In [164]:
# Create the preprocessor
preprocessor = ColumnTransformer([
        ("scaler", StandardScaler(), 
                  numeric_features),
        ("nom", Pipeline([("imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
                         ("binarizer", OneHotEncoder(handle_unknown="ignore"))]), 
                nominal_features)],
        remainder="passthrough")

In [165]:
# Extract the features but leave as a DataFrame
dev_X = dev_df[features]
test_X = test_df[features]

# Target values, encoded and converted to a 1D numpy array
label_encoder = LabelEncoder()
label_encoder.fit(df["over_limit"])
dev_y = label_encoder.transform(dev_df["over_limit"])
test_y = label_encoder.transform(test_df["over_limit"])

In [166]:
# Let's see how a majority-class classifier performs.
# This is our baseline. We need to do better than this!

maj = DummyClassifier()
maj.fit(dev_X, dev_y)
accuracy_score(test_y, maj.predict(test_X))

0.6

In [167]:
# Create a pipeline that combines the preprocessor with kNN
knn = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", KNeighborsClassifier())])

# Create a dictionary of hyperparameters for kNN
knn_param_grid = {"predictor__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

# Create the grid search object which will find the best hyperparameter values based on validation error
knn_gs = GridSearchCV(knn, knn_param_grid, scoring="accuracy", cv=10, refit=True)

# Run grid search by calling fit. It will also re-train on train+validation using the best parameters.
knn_gs.fit(dev_X, dev_y)

# Let's see how well we did
knn_gs.best_params_, knn_gs.best_score_

({'predictor__n_neighbors': 10}, 0.8300000000000001)

In [168]:
# Create a pipeline that combines the preprocessor with logistic regression
logistic = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", LogisticRegression())])

# We should do a grid search to set hyperparameter C. But, for brevity, we won't!

# Create the grid search object which will find the best hyperparameter values based on validation error
np.mean(cross_val_score(logistic, dev_X, dev_y, scoring="accuracy", cv=10))

0.7633333333333334

In [169]:
knn.set_params(**knn_gs.best_params_) 
scores = cross_validate(knn, dev_X, dev_y, cv=10, 
                        scoring="accuracy", return_train_score=True)
print("Training accuracy: ", np.mean(scores["train_score"]))
print("Validation accuracy: ", np.mean(scores["test_score"]))

Training accuracy:  0.835232220609579
Validation accuracy:  0.8300000000000001


In [170]:
accuracy_score(test_y, knn_gs.predict(test_X))

0.9333333333333333