# Improving the Possum Classification Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#sci kit functions used
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold

# models used
import sklearn.neighbors as neigh
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import acquire
import prepare
import model_target_sex

RANDOM_SEED = prepare.RAND_SEED
MODEL_COLUMNS = model_target_sex.MODEL_STRATEGY_DICTIONARY['measurement_only']

## Rebuilding the orignal data set

Note that `total_length` is a calculated column.

In [2]:
#get only the measurement columns with the id column and the target column
df = acquire.make_dataset()
df = model_target_sex.make_modeling_columns(df)
df = df[['case', 'sex'] + MODEL_COLUMNS]

In [3]:
df.sample(3)

Unnamed: 0,case,sex,total_length,body_length,tail_length,head_length,skull_width,foot_length,eye_width,chest_girth,belly_girth,ear_length
38,39,female,750.0,325.3,340.0,84.7,51.5,68.7,13.0,250.0,250.0,53.4
96,97,male,860.0,389.0,380.0,91.0,53.1,63.8,14.5,250.0,315.0,46.0
102,103,male,825.0,368.5,365.0,91.5,55.2,62.9,15.4,250.0,290.0,45.9


## Rebuilding the original train sets

In [4]:
#now split into sets
train, validate, test = prepare.split_data(df)
prepare.make_sex_distribution_df(train, validate, test)

Unnamed: 0,dataset,proportion_male,proportion_female,total_male,total_female
0,train,0.585714,0.414286,41,29
1,validate,0.571429,0.428571,20,15
2,test,0.6,0.4,21,14


In [5]:
train.sample(3)

Unnamed: 0,case,sex,total_length,body_length,tail_length,head_length,skull_width,foot_length,eye_width,chest_girth,belly_girth,ear_length
69,70,female,870.0,398.1,380.0,91.9,56.4,65.4,13.0,270.0,340.0,44.1
15,16,male,860.0,423.4,345.0,91.6,56.0,73.0,14.4,280.0,320.0,51.4
83,84,male,805.0,356.6,360.0,88.4,54.6,62.6,16.3,250.0,285.0,43.6


## Improving The Nearest Centroid Classifier

#### Key Takeaways
- The original model has high variance.
- Reducing features significantly improves the model's performance on unseen data

The worst performing model was the Nearest Centroid Classifier model.  It was more accurate than baseline; however, it was only beating baseline by about 2 points.

In [6]:
#and make X and y
col_to_scale = train.select_dtypes(include='float').columns.tolist()
X, y, X_val, y_val = model_target_sex.make_X_and_y(train, validate, col_to_scale)
X[:3]

array([[-0.6915904 , -0.30548655, -0.91278063,  0.12079368,  3.61801315,
        -0.63145449, -0.51390641,  0.72124787, -0.27531743, -0.93083938],
       [ 0.52783852,  0.27135948,  0.52845194,  0.6200148 , -0.22495419,
         1.23590706,  0.41086091, -0.07174717,  1.22155403,  1.04736874],
       [ 1.25949587,  0.59426785,  1.72947909, -1.03365516, -0.25619782,
        -0.63145449, -0.51390641, -0.07174717, -0.46242636, -1.12621795]])

In [7]:
#the worst performing model was the nearest centroid classifier
orig_model = model_target_sex.make_nearest_centroid_model(X, y, X_val, y_val, baseline_acc = 0.58)
pd.DataFrame([orig_model])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['predicted'] = nc.predict(X_train)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['predicted'] = nc.predict(X_val)


Unnamed: 0,model,train_accuracy,validate_accuracy,accuracy_change,better_than_baseline
0,Nearest Centroid Classification,0.671429,0.6,0.071429,True


### Using a pipeline on the nearest centroid model

Only a train and test dataset are used; therefore, the original data is re-split to combine the size of the validate and train.  This means that the test data represents 0.15 of the total data.  Although this makes the model comparison between the original model and the new models more subjective, the point of the excercise is to explore model building, not necessarily improve the model.

In [8]:
train, test = train_test_split(df, test_size = 0.15, stratify = df[['sex']], random_state = RANDOM_SEED)

#make the X and y again
X = train[MODEL_COLUMNS]
y = train['sex']
X_test = test[MODEL_COLUMNS]
y_test = test['sex']

pipe_nc = make_pipeline(StandardScaler(), neigh.NearestCentroid())
pipe_nc.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('nearestcentroid', NearestCentroid())])

Note that there is a high drop off in the accuracy.  This indicates high variance/overfitting

In [9]:
y_pred = pipe_nc.predict(X)
print(f'Train accuracy: {pipe_nc.score(X, y)}')
print(f'Test accuracy: {pipe_nc.score(X_test, y_test)}')

Train accuracy: 0.6823529411764706
Test accuracy: 0.4375


### Using random forests to assess feature importance

The nearest centroid classifier cannot be regularized; therefore, use a random forest to extract the most important features in order to attempt to reduce the variance.

In [10]:
feat_labels = df.columns[2:] #extract the model features
#make and fit a random forest
forest = RandomForestClassifier(random_state = RANDOM_SEED)
forest.fit(X, y)
#get feature importance
importance = forest.feature_importances_
indices = np.argsort(importance)[::-1] #argsort gives the indices of the sorted array
for f in range(X.shape[1]):
    print(f'{f} {feat_labels[f]} - {importance[indices[f]]}')

0 total_length - 0.1345654110699876
1 body_length - 0.12851492874873754
2 tail_length - 0.12496915756449375
3 head_length - 0.11225451446670975
4 skull_width - 0.10962931336950105
5 foot_length - 0.09313321532988027
6 eye_width - 0.09058792904149335
7 chest_girth - 0.0807759898900699
8 belly_girth - 0.06436621449659412
9 ear_length - 0.0612033260225326


When fit to data, the model peaks at an accuracy of 0.62 for unseen data using 7 features. This is a significant improvement over the original model accuracy using all of the features.

In [11]:
#get a list of these features
outputs = list()
for i in range(1, indices.shape[0]+1):
    X_feat_extracted = X.iloc[:, [x for x in indices if x < i]]
    X_test_feat_extracted = X_test.iloc[:, [x for x in indices if x < i]]

    pipe_nc.fit(X_feat_extracted, y)
    y_pred = pipe_nc.predict(X_feat_extracted)
    output = {
        'num_features': i,
        'train_accuracy':pipe_nc.score(X_feat_extracted, y),
        'test_accuracy':pipe_nc.score(X_test_feat_extracted, y_test)
    }
    outputs.append(output)
pd.DataFrame(outputs)

Unnamed: 0,num_features,train_accuracy,test_accuracy
0,1,0.6,0.5
1,2,0.564706,0.375
2,3,0.588235,0.375
3,4,0.647059,0.5
4,5,0.6,0.5625
5,6,0.635294,0.5625
6,7,0.694118,0.625
7,8,0.694118,0.625
8,9,0.682353,0.5625
9,10,0.682353,0.4375


## Improving the Logistic Regression model

The previous logistic regression model had an accuracy 0.77.  Assess a basic model using k-fold validation.

There is high variance in the model seen by the drop off in accuracy on unseen data.

In [12]:
df = acquire.make_dataset()
df = model_target_sex.make_modeling_columns(df)
df = df[['case', 'sex'] + MODEL_COLUMNS]

In [14]:
#make a pipeline with logisticregression model
pipe_lr = make_pipeline(StandardScaler(), LogisticRegression())

#make the k-fold
k_fold = StratifiedKFold(n_splits = 10)
outputs = []

In [15]:
for train_idx, test_idx in k_fold.split(df, df.loc[:, 'sex']):
    cols = df.columns.tolist()
    X = cols[2:]
    y = cols[1]
    train = df.iloc[train_idx, :]
    X_train = train.loc[:, X]
    y_train = train.loc[:, y]
    test = df.iloc[test_idx, :]
    X_test = test.loc[:, X]
    y_test = test.loc[:, y]
    pipe_lr.fit(X_train, y_train)
    output = {
        'train_accuracy': pipe_lr.score(X_train, y_train),
        'test_accuracy': pipe_lr.score(X_test, y_test)
    }
    outputs.append(output)
    
pd.DataFrame(outputs)

Unnamed: 0,train_accuracy,test_accuracy
0,0.677778,0.636364
1,0.692308,0.7
2,0.67033,0.6
3,0.692308,0.6
4,0.714286,0.5
5,0.692308,0.6
6,0.703297,0.5
7,0.714286,0.6
8,0.692308,0.7
9,0.714286,0.6


In [None]:
df