In [2]:
from common import display_input_data, check_input_data_indices, train_model, prepare_dataset, evaluate_model, \
    run_input_optimizer, calculate_accuracy, calculate_error_rate
import pandas as pd
import ipywidgets as widgets
import numpy as np
import statsmodels
import sklearn


In [3]:
# Parameters
# Chose variables to create the model
input_cols = ['shape', 'profile_entrance', 'profile_exit', 'rising_entrance', 'rising_exit', 'shards', 'feathering', 'entrance_mounding', 'center_mounding', 'exit_mounding', 'mounding']
result_col = 'result'
# Load the train and the validation datasets for blade class prediction
train_dataset_file = '../DATA/Train/blade_prediction_train.xlsx'
test_dataset_file = '../DATA/Validation/blade_prediction_validation.xlsx'

In [4]:
check_input_data_indices(train_dataset_file, test_dataset_file, input_cols)




Datasets are compatible


In [11]:
df = pd.read_excel(train_dataset_file)

#drop the last column 
df = df.iloc[:,:-1]
display(df)

Unnamed: 0,shape,profile_entrance,profile_exit,rising_entrance,rising_exit,shards,feathering,entrance_mounding,center_mounding,exit_mounding,mounding
0,indeterminate,V,V,single,single,absent,absent,single,absent,single,not marked
1,indeterminate,I/,V,single,bilateral,absent,absent,single,absent,single,not marked
2,indeterminate,I/,V,single,bilateral,absent,absent,single,absent,bilateral,not marked
3,indeterminate,I/,V,single,bilateral,absent,absent,single,absent,bilateral,not marked
4,indeterminate,I/,V,single,bilateral,absent,absent,single,absent,bilateral,not marked
...,...,...,...,...,...,...,...,...,...,...,...
275,E,V,V,single,single,absent,absent,single,absent,single,marked
276,E,V,I/,bilateral,bilateral,absent,absent,absent,absent,absent,absent
277,E,V,V,absent,single,absent,absent,absent,absent,absent,absent
278,E,V,V,absent,absent,absent,present,absent,absent,absent,absent


In [13]:
#label encoding for all categorical variables in dataset
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df['shape'] = label_encoder.fit_transform(df['shape'])
df['profile_entrance'] = label_encoder.fit_transform(df['profile_entrance'])
df['profile_exit'] = label_encoder.fit_transform(df['profile_exit'])
df['rising_entrance'] = label_encoder.fit_transform(df['rising_entrance'])
df['rising_exit'] = label_encoder.fit_transform(df['rising_exit'])
df['shards'] = label_encoder.fit_transform(df['shards'])
df['feathering'] = label_encoder.fit_transform(df['feathering'])
df['entrance_mounding'] = label_encoder.fit_transform(df['entrance_mounding'])
df['center_mounding'] = label_encoder.fit_transform(df['center_mounding'])
df['exit_mounding'] = label_encoder.fit_transform(df['exit_mounding'])
df['mounding'] = label_encoder.fit_transform(df['mounding'])
#df['result'] = label_encoder.fit_transform(df['result'])
display(df)


Unnamed: 0,shape,profile_entrance,profile_exit,rising_entrance,rising_exit,shards,feathering,entrance_mounding,center_mounding,exit_mounding,mounding
0,2,1,1,2,2,0,0,2,0,2,4
1,2,0,1,2,1,0,0,2,0,2,5
2,2,0,1,2,1,0,0,2,0,1,5
3,2,0,1,2,1,0,0,2,0,1,5
4,2,0,1,2,1,0,0,2,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...
275,1,1,1,2,2,0,0,2,0,2,3
276,1,1,0,1,1,0,0,0,0,0,0
277,1,1,1,0,2,0,0,0,0,0,0
278,1,1,1,0,0,0,2,0,0,0,0


In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_scores = pd.DataFrame() 
vif_scores["Attribute"] = df.columns 

vif_scores["VIF Scores"] = [variance_inflation_factor(df.values, i) for i in range(len(df.columns))] 
display(vif_scores)




Unnamed: 0,Attribute,VIF Scores
0,shape,3.280161
1,profile_entrance,3.765827
2,profile_exit,6.178339
3,rising_entrance,4.010628
4,rising_exit,3.723072
5,shards,1.305176
6,feathering,1.309801
7,entrance_mounding,3.17641
8,center_mounding,2.717014
9,exit_mounding,4.770127


In [8]:
# use the optimized model variables and see the correlation between them
# first is serration shape, cross profile entrance, rising entrance, shards, and mounding


df2 = df.drop(['profile_exit','result', 'rising_exit','feathering', 'entrance_mounding', 'center_mounding', 'exit_mounding'], axis=1)
display(df2)

vif_scores_optimized_serration = pd.DataFrame() 
vif_scores_optimized_serration["Attribute"] = df2.columns 

vif_scores_optimized_serration["VIF Scores"] = [variance_inflation_factor(df2.values, i) for i in range(len(df2.columns))] 
display(vif_scores_optimized_serration)

Unnamed: 0,shape,profile_entrance,rising_entrance,shards,mounding
0,2,1,2,0,4
1,2,0,2,0,5
2,2,0,2,0,5
3,2,0,2,0,5
4,2,0,2,0,5
...,...,...,...,...,...
275,1,1,2,0,3
276,1,1,1,0,0
277,1,1,0,0,0
278,1,1,0,0,0


Unnamed: 0,Attribute,VIF Scores
0,shape,3.104964
1,profile_entrance,3.079963
2,rising_entrance,2.876348
3,shards,1.080061
4,mounding,3.062435


In [9]:
df3 = df.drop(['profile_exit','result','mounding'], axis=1)
display(df2)

vif_scores_dropped = pd.DataFrame() 
vif_scores_dropped["Attribute"] = df3.columns 

vif_scores_dropped["VIF Scores"] = [variance_inflation_factor(df3.values, i) for i in range(len(df3.columns))] 
display(vif_scores_dropped)

Unnamed: 0,shape,profile_entrance,rising_entrance,shards,mounding
0,2,1,2,0,4
1,2,0,2,0,5
2,2,0,2,0,5
3,2,0,2,0,5
4,2,0,2,0,5
...,...,...,...,...,...
275,1,1,2,0,3
276,1,1,1,0,0
277,1,1,0,0,0
278,1,1,0,0,0


Unnamed: 0,Attribute,VIF Scores
0,shape,3.042916
1,profile_entrance,3.174386
2,rising_entrance,3.614019
3,rising_exit,3.626699
4,shards,1.304812
5,feathering,1.308314
6,entrance_mounding,2.964863
7,center_mounding,2.712231
8,exit_mounding,3.834949


In [10]:
df4 = df.drop(['result','profile_exit'], axis=1)
display(df4)

vif_scores_dropped = pd.DataFrame() 
vif_scores_dropped["Attribute"] = df4.columns 

vif_scores_dropped["VIF Scores"] = [variance_inflation_factor(df4.values, i) for i in range(len(df4.columns))] 
display(vif_scores_dropped)

Unnamed: 0,shape,profile_entrance,rising_entrance,rising_exit,shards,feathering,entrance_mounding,center_mounding,exit_mounding,mounding
0,2,1,2,2,0,0,2,0,2,4
1,2,0,2,1,0,0,2,0,2,5
2,2,0,2,1,0,0,2,0,1,5
3,2,0,2,1,0,0,2,0,1,5
4,2,0,2,1,0,0,2,0,1,5
...,...,...,...,...,...,...,...,...,...,...
275,1,1,2,2,0,0,2,0,2,3
276,1,1,1,1,0,0,0,0,0,0
277,1,1,0,2,0,0,0,0,0,0
278,1,1,0,0,0,2,0,0,0,0


Unnamed: 0,Attribute,VIF Scores
0,shape,3.222832
1,profile_entrance,3.174405
2,rising_entrance,3.890277
3,rising_exit,3.634886
4,shards,1.304834
5,feathering,1.309663
6,entrance_mounding,3.174283
7,center_mounding,2.71448
8,exit_mounding,4.682941
9,mounding,5.044733
