# Improving the Worst Possum Classification Model

In [18]:
import pandas as pd
import numpy as np

#sci kit functions used
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel

# models used
import sklearn.neighbors as neigh
from sklearn.ensemble import RandomForestClassifier

import acquire
import prepare
import model_target_sex

RANDOM_SEED = prepare.RAND_SEED
MODEL_COLUMNS = model_target_sex.MODEL_STRATEGY_DICTIONARY['measurement_only']

## Rebuilding the orignal data set

Note that `total_length` is a calculated column.

In [2]:
#get only the measurement columns with the id column and the target column
df = acquire.make_dataset()
df = model_target_sex.make_modeling_columns(df)
df = df[['case', 'sex'] + MODEL_COLUMNS]

In [3]:
df.sample(3)

Unnamed: 0,case,sex,total_length,body_length,tail_length,head_length,skull_width,foot_length,eye_width,chest_girth,belly_girth,ear_length
11,12,female,920.0,470.1,355.0,94.9,55.6,71.7,15.3,280.0,330.0,51.0
87,88,female,875.0,405.4,380.0,89.6,58.0,66.7,16.0,255.0,315.0,43.5
75,76,male,890.0,387.6,410.0,92.4,56.8,64.5,17.8,260.0,330.0,46.4


## Rebuilding the original train sets

In [4]:
#now split into sets
train, validate, test = prepare.split_data(df)
prepare.make_sex_distribution_df(train, validate, test)

Unnamed: 0,dataset,proportion_male,proportion_female,total_male,total_female
0,train,0.585714,0.414286,41,29
1,validate,0.571429,0.428571,20,15
2,test,0.6,0.4,21,14


In [5]:
train.sample(3)

Unnamed: 0,case,sex,total_length,body_length,tail_length,head_length,skull_width,foot_length,eye_width,chest_girth,belly_girth,ear_length
14,15,male,855.0,422.1,340.0,92.9,57.6,69.7,15.7,280.0,350.0,51.8
63,64,female,895.0,413.1,385.0,96.9,56.5,63.0,17.1,255.0,330.0,45.1
13,14,male,915.0,459.6,360.0,95.4,57.6,74.3,15.1,280.0,315.0,53.7


## Improving The Nearest Centroid Classifier

The worst performing model was the Nearest Centroid Classifier model.  It was more accurate than baseline; however, it was only beating baseline by about 2 points.

In [6]:
#and make X and y
col_to_scale = train.select_dtypes(include='float').columns.tolist()
X, y, X_val, y_val = model_target_sex.make_X_and_y(train, validate, col_to_scale)
X[:3]

array([[-0.6915904 , -0.30548655, -0.91278063,  0.12079368,  3.61801315,
        -0.63145449, -0.51390641,  0.72124787, -0.27531743, -0.93083938],
       [ 0.52783852,  0.27135948,  0.52845194,  0.6200148 , -0.22495419,
         1.23590706,  0.41086091, -0.07174717,  1.22155403,  1.04736874],
       [ 1.25949587,  0.59426785,  1.72947909, -1.03365516, -0.25619782,
        -0.63145449, -0.51390641, -0.07174717, -0.46242636, -1.12621795]])

In [7]:
#the worst performing model was the nearest centroid classifier
orig_model = model_target_sex.make_nearest_centroid_model(X, y, X_val, y_val, baseline_acc = 0.58)
pd.DataFrame([orig_model])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['predicted'] = nc.predict(X_train)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['predicted'] = nc.predict(X_val)


Unnamed: 0,model,train_accuracy,validate_accuracy,accuracy_change,better_than_baseline
0,Nearest Centroid Classification,0.671429,0.6,0.071429,True


### Using a pipeline to hyperparameter tune the nearest centroid

Only a train and test dataset are used; therefore, the original data is re-split to combine the size of the validate and train.  This means that the test data represents 0.15 of the total data.  Although this makes the model comparison between the original model and the new models more subjective, the point of the excercise is to explore model building, not necessarily improve the model.

In [8]:
train, test = train_test_split(df, test_size = 0.15, stratify = df[['sex']], random_state = RANDOM_SEED)

#make the X and y again
X = train[MODEL_COLUMNS]
y = train['sex']
X_test = test[MODEL_COLUMNS]
y_test = test['sex']

pipe_nc = make_pipeline(StandardScaler(), neigh.NearestCentroid())
pipe_nc.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('nearestcentroid', NearestCentroid())])

Note that there is a high drop off in the accuracy.  This indicates high variance/overfitting

In [9]:
y_pred = pipe_nc.predict(X)
print(f'Train accuracy: {pipe_nc.score(X, y)}')
print(f'Test accuracy: {pipe_nc.score(X_test, y_test)}')

Train accuracy: 0.6823529411764706
Test accuracy: 0.4375


### Using random forests to assess feature importance

The nearest centroid classifier cannot be regularized; therefore, use a random forest to extract the most important features in order to attempt to reduce the variance.

In [27]:
feat_labels = df.columns[2:] #extract the model features
#make and fit a random forest
forest = RandomForestClassifier(random_state = RANDOM_SEED)
forest.fit(X, y)
#get feature importance
importance = forest.feature_importances_
indices = np.argsort(importance)[::-1] #argsort gives the indices of the sorted array
for f in range(X.shape[1]):
    print(f'{f} {feat_labels[f]} - {importance[indices[f]]}')

0 total_length - 0.1345654110699876
1 body_length - 0.12851492874873754
2 tail_length - 0.12496915756449375
3 head_length - 0.11225451446670975
4 skull_width - 0.10962931336950105
5 foot_length - 0.09313321532988027
6 eye_width - 0.09058792904149335
7 chest_girth - 0.0807759898900699
8 belly_girth - 0.06436621449659412
9 ear_length - 0.0612033260225326


When fit to data, the model peaks at an accuracy of 0.62 for unseen data using 7 features. This is a significant improvement over the original model accuracy using all of the features.

In [52]:
#get a list of these features
for i in range(1, indices.shape[0]+1):
    X_feat_extracted = X.iloc[:, [x for x in indices if x < i]]
    X_test_feat_extracted = X_test.iloc[:, [x for x in indices if x < i]]

    pipe_nc.fit(X_feat_extracted, y)
    y_pred = pipe_nc.predict(X_feat_extracted)
    print(f'{"-"*5} \nNumber of features: {i}')
    print(f'Train accuracy: {pipe_nc.score(X_feat_extracted, y)}')
    print(f'Test accuracy: {pipe_nc.score(X_test_feat_extracted, y_test)}')

----- 
Number of features: 1
Train accuracy: 0.6
Test accuracy: 0.5
----- 
Number of features: 2
Train accuracy: 0.5647058823529412
Test accuracy: 0.375
----- 
Number of features: 3
Train accuracy: 0.5882352941176471
Test accuracy: 0.375
----- 
Number of features: 4
Train accuracy: 0.6470588235294118
Test accuracy: 0.5
----- 
Number of features: 5
Train accuracy: 0.6
Test accuracy: 0.5625
----- 
Number of features: 6
Train accuracy: 0.6352941176470588
Test accuracy: 0.5625
----- 
Number of features: 7
Train accuracy: 0.6941176470588235
Test accuracy: 0.625
----- 
Number of features: 8
Train accuracy: 0.6941176470588235
Test accuracy: 0.625
----- 
Number of features: 9
Train accuracy: 0.6823529411764706
Test accuracy: 0.5625
----- 
Number of features: 10
Train accuracy: 0.6823529411764706
Test accuracy: 0.4375
