In [1]:
# Imports
from common import display_input_data, check_input_data_indices, train_model, prepare_dataset, evaluate_model, \
    run_input_optimizer
import pandas as pd
import ipywidgets as widgets
import numpy as np
import pickle


In [2]:
# Parameters
# Choose the predictor variables used to build the model
input_cols = [
    'shape',
    'profile_entrance',
    'profile_exit',
    'rising_entrance',
    'rising_exit',
    'shards',
    'feathering',
    'entrance_mounding',
    'center_mounding',
    'exit_mounding',
]
# Column holding the class to predict
result_col = 'result'

# Training and validation datasets for blade class prediction
train_dataset_file = 'DATA/Train/blade_prediction_train.xlsx'
test_dataset_file = 'DATA/Validation/blade_prediction_validation.xlsx'

In [3]:
# Show the selected input columns of the training dataset as a table
display_input_data(train_dataset_file, input_cols)

Unnamed: 0,shape_E,shape_indeterminate,profile_entrance_V,profile_exit_V,rising_entrance_bilateral,rising_entrance_single,rising_exit_bilateral,rising_exit_single,shards_present,feathering_present,entrance_mounding_bilateral,entrance_mounding_single,center_mounding_bilateral,center_mounding_single,exit_mounding_bilateral,exit_mounding_single
0,False,True,True,True,False,True,False,True,False,False,False,True,False,False,False,True
1,False,True,False,True,False,True,True,False,False,False,False,True,False,False,False,True
2,False,True,False,True,False,True,True,False,False,False,False,True,False,False,True,False
3,False,True,False,True,False,True,True,False,False,False,False,True,False,False,True,False
4,False,True,False,True,False,True,True,False,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,True,False,True,True,False,True,False,True,False,False,False,True,False,False,False,True
276,True,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False
277,True,False,True,True,False,False,False,True,False,False,False,False,False,False,False,False
278,True,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False


In [4]:
# Check that the training and validation datasets are compatible for the chosen columns
check_input_data_indices(
    train_dataset_file,
    test_dataset_file,
    input_cols,
)

Datasets are compatible


In [5]:
# Logistic regression model trained on the training dataset
model = train_model(
    train_dataset_file,
    input_cols,
    result_col
)

# Persist the fitted model. The context manager closes (and flushes) the file
# handle deterministically instead of leaving it to the garbage collector.
filename = 'models/blade-edge.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Fitted parameters: intercept (b0) and one coefficient per encoded input column
print('b0 = ' + str(model.intercept_))
print('b1...n = ' + str(model.coef_))

b0 = [3.72368196]
b1...n = [[-1.8466929  -1.21560843 -1.48451218 -0.39524972 -0.14299724 -0.14135143
  -0.31927317 -0.03249396  0.86234315  0.25851444 -0.98240666  0.06675378
  -1.09210982 -0.6990357  -0.39046999  0.70289077]]


In [6]:
# Overfitting check: score the model on the data it was trained on, so the
# value can be compared with the score obtained on the validation dataset.
x_train, y_train = prepare_dataset(
    train_dataset_file,
    input_cols,
    result_col,
)
model.score(x_train, y_train)

0.8321428571428572

In [7]:
# Evaluate the model (built with all 10 input variables) on the validation dataset
evaluate_model(model, test_dataset_file, input_cols, result_col)

0.7611940298507462

In [8]:
# Re-create the encoded feature matrix of the training dataset and predict on it.
# The path comes from the parameters cell so it cannot drift from the file the
# model was trained on.
df = pd.read_excel(train_dataset_file)
# Strip surrounding whitespace from every categorical value before encoding
x_df = df[input_cols].apply(lambda x: x.str.strip())
# One-hot encode, dropping the first level of each variable
x = pd.get_dummies(x_df, drop_first=True)

result = model.predict(x)
print(result)
# Purpose of this cell: find out which boolean the model uses for each class
# (single- vs. even-bladed).
# Finding: True corresponds to "single".

[ True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False False False False False False  True  True
 False False False  True  True False  True  True  True  True  True False
 False False  True False  True  True False  True  True  True  True False
 False False False False False  True  True  True False False  True False
  True False False False False False False False False False False False
 False False  True False False False  True False False False False False
 False  True False False  True False  True  True False False  True False
  True False False False False False False False  True  True False False
  True  True False  True False False False False False  True False False
 False False False  True False  True False False False False False  True
 False False False  True  True  True  True  True  T

In [9]:
# Load the validation dataset (path taken from the parameters cell) and keep
# the knife label and the expected class next to the model predictions.
df = pd.read_excel(test_dataset_file)
label = df["label"]
string_expresult = df["result"]

# Strip surrounding whitespace from every categorical value, then one-hot encode
# NOTE(review): the encoding is computed independently from the training data;
# the predictions are only meaningful if it yields the same columns in the same
# order as during training - presumably what the compatibility check above
# guarantees; confirm.
x_df = df[input_cols].apply(lambda x: x.str.strip())
x = pd.get_dummies(x_df, drop_first=True)

result = model.predict(x)

print(string_expresult)
print(result)

0     single
1     single
2     single
3     single
4     single
       ...  
62    single
63    single
64    single
65    single
66    single
Name: result, Length: 67, dtype: object
[ True  True  True False  True False False  True  True False  True  True
  True  True  True  True False  True False  True  True  True  True  True
  True  True  True  True False False False False False False False False
 False False False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
 False  True  True  True  True False  True]


In [10]:
# Convert the expected class strings into booleans so they can be compared with
# the model output: "single" -> True, anything else -> False.
# The comparison is vectorised, which also avoids a loop variable named `x`
# that would overwrite the feature matrix `x` built in the previous cell.
expected_result = (string_expresult == "single").tolist()

print(expected_result)


[True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [11]:
# Combine knife label, predicted class and expected class in one DataFrame
data = dict(label=label, result=result, expected_result=expected_result)
df = pd.DataFrame(data)
print(df)

# Group the rows per knife; head() shows the first rows of every group
grouped_df = df.groupby("label")
print(grouped_df.head())

    label  result  expected_result
0       2    True             True
1       2    True             True
2       2    True             True
3       2   False             True
4       2    True             True
..    ...     ...              ...
62      1    True             True
63      1    True             True
64      1    True             True
65      1   False             True
66      1    True             True

[67 rows x 3 columns]
    label  result  expected_result
0       2    True             True
1       2    True             True
2       2    True             True
3       2   False             True
4       2    True             True
10      4    True            False
11      4    True            False
12      4    True            False
13      4    True            False
14      4    True            False
20      7    True             True
21      7    True             True
22      7    True             True
23      7    True             True
24      7    True             Tr

In [12]:
def calculate_accuracy(df):
    """Return the percentage of rows whose prediction matches the expectation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns "result" (predicted value) and
        "expected_result" (reference value).

    Returns
    -------
    float
        Share of rows where both columns are equal, in percent (0-100).
        NaN is returned for a frame without rows, because an accuracy is
        undefined there (previously this raised ZeroDivisionError).
    """
    tot = len(df)
    if tot == 0:
        return float("nan")

    # Element-wise comparison of the two columns; summing the boolean mask
    # counts the matching rows without a Python-level row loop.
    tot_corr = int((df["result"] == df["expected_result"]).sum())

    accuracy_perc = (tot_corr / tot) * 100
    return accuracy_perc

In [13]:

# Accuracy over the four knives treated as single-bladed (knives 1, 5, 6 and 7)
one, five, six, seven = (grouped_df.get_group(knife) for knife in (1, 5, 6, 7))

# Stack the per-knife rows into one frame with a fresh 0..n-1 index
single_bladed = pd.concat([one, five, six, seven], ignore_index=True)
print(f'single bladed\n{single_bladed}\n')

single_bladed_accuracy = calculate_accuracy(single_bladed)
print("For single blade class")
print("accuracy %:", single_bladed_accuracy, "\n")

single bladed
    label  result  expected_result
0       1    True             True
1       1   False             True
2       1    True             True
3       1   False             True
4       1    True             True
5       1    True             True
6       1    True             True
7       1    True             True
8       1   False             True
9       1    True             True
10      5    True             True
11      5    True             True
12      5    True             True
13      5    True             True
14      5    True             True
15      5    True             True
16      5    True             True
17      5    True             True
18      5    True             True
19      6    True             True
20      6    True             True
21      6    True             True
22      6    True             True
23      6    True             True
24      6    True             True
25      6    True             True
26      6    True             True
27    

In [14]:
# Accuracy over the three knives treated as even-bladed (knives 2, 3 and 4)
# NOTE(review): in the displayed validation rows knife 2 has
# expected_result=True (i.e. "single") although it is grouped with the
# even-bladed knives here - confirm the labels in the validation file.
two, three, four = (grouped_df.get_group(knife) for knife in (2, 3, 4))

# Stack the per-knife rows into one frame with a fresh 0..n-1 index
even_bladed = pd.concat([two, three, four], ignore_index=True)
print(f'even bladed\n{even_bladed}\n')

even_bladed_accuracy = calculate_accuracy(even_bladed)
print("For even blade class")
print("accuracy %:", even_bladed_accuracy, "\n")

even bladed
    label  result  expected_result
0       2    True             True
1       2    True             True
2       2    True             True
3       2   False             True
4       2    True             True
5       2   False             True
6       2   False             True
7       2    True             True
8       2    True             True
9       2   False             True
10      3   False            False
11      3   False            False
12      3   False            False
13      3   False            False
14      3   False            False
15      3   False            False
16      3   False            False
17      3   False            False
18      3   False            False
19      3   False            False
20      4    True            False
21      4    True            False
22      4    True            False
23      4    True            False
24      4    True            False
25      4    True            False
26      4   False            False
27      

In [15]:
# Accuracy for the even-bladed class when knife number 4 is left out
# NOTE(review): the displayed validation rows for knife 2 carry
# expected_result=True (i.e. "single") - confirm that knife 2 belongs here.
two, three = (grouped_df.get_group(knife) for knife in (2, 3))

# Stack the per-knife rows into one frame with a fresh 0..n-1 index
even_bladed = pd.concat([two, three], ignore_index=True)
print(f'even bladed\n{even_bladed}\n')

even_bladed_accuracy = calculate_accuracy(even_bladed)
print("For even blade class")
print("accuracy %:", even_bladed_accuracy, "\n")

even bladed
    label  result  expected_result
0       2    True             True
1       2    True             True
2       2    True             True
3       2   False             True
4       2    True             True
5       2   False             True
6       2   False             True
7       2    True             True
8       2    True             True
9       2   False             True
10      3   False            False
11      3   False            False
12      3   False            False
13      3   False            False
14      3   False            False
15      3   False            False
16      3   False            False
17      3   False            False
18      3   False            False
19      3   False            False

For even blade class
accuracy %: 80.0 


In [16]:
# Print every knife label followed by its rows and a blank separator line
for name, sub_df in grouped_df:
    print(name, sub_df, "", sep="\n")

1
    label  result  expected_result
57      1    True             True
58      1   False             True
59      1    True             True
60      1   False             True
61      1    True             True
62      1    True             True
63      1    True             True
64      1    True             True
65      1   False             True
66      1    True             True

2
   label  result  expected_result
0      2    True             True
1      2    True             True
2      2    True             True
3      2   False             True
4      2    True             True
5      2   False             True
6      2   False             True
7      2    True             True
8      2    True             True
9      2   False             True

3
    label  result  expected_result
29      3   False            False
30      3   False            False
31      3   False            False
32      3   False            False
33      3   False            False
34      3   False      

In [17]:
# Accuracy per knife. The percentage is computed by calculate_accuracy so the
# per-knife and per-class figures share a single implementation.
for name, sub_df in grouped_df:
    accuracy_perc = calculate_accuracy(sub_df)
    print("knife:", name)
    print("accuracy %:", accuracy_perc, "\n")

knife: 1
accuracy %: 70.0 

knife: 2
accuracy %: 60.0 

knife: 3
accuracy %: 100.0 

knife: 4
accuracy %: 20.0 

knife: 5
accuracy %: 100.0 

knife: 6
accuracy %: 100.0 

knife: 7
accuracy %: 88.88888888888889 


In [18]:
# Error rate per knife: share of rows where prediction and expectation differ
for name, sub_df in grouped_df:
    mismatches = sub_df["result"] != sub_df["expected_result"]
    tot_incorr = int(mismatches.sum())
    tot = len(sub_df)

    print("knife:", name)
    error_rates = (tot_incorr / tot) * 100
    print("error rate %:", error_rates, "\n")

knife: 1
error rate %: 30.0 

knife: 2
error rate %: 40.0 

knife: 3
error rate %: 0.0 

knife: 4
error rate %: 80.0 

knife: 5
error rate %: 0.0 

knife: 6
error rate %: 0.0 

knife: 7
error rate %: 11.11111111111111 


In [19]:
# Model optimisation: search for the best combination of input columns, then
# report the score and columns of the best and of the worst model found.
optimizer_results = run_input_optimizer(
    train_dataset_file,
    test_dataset_file,
    input_cols,
    result_col,
)

print(f"Max score: {optimizer_results['max_score']!s}")
print(f"Best columns combination: {optimizer_results['max_score_cols']!s}")
print(f"Min score: {optimizer_results['min_score']!s}")
print(f"Worse columns combination: {optimizer_results['min_score_cols']!s}")

Testing combinations of length 2
Testing combinations of length 3
Testing combinations of length 4
Testing combinations of length 5
Testing combinations of length 6
Testing combinations of length 7
Testing combinations of length 8
Testing combinations of length 9
Testing combinations of length 10
Max score: 0.8805970149253731
Best columns combination: ('shape', 'profile_exit', 'rising_entrance', 'shards', 'exit_mounding')
Min score: 0.6865671641791045
Worse columns combination: ('feathering', 'exit_mounding')


In [20]:
# Render the fitted model as a LaTeX formula of the form
#   p = 1 / (1 + e^-(b0 + b1*x_1 + ... + bn*x_n))
# with intercept and coefficients rounded to two decimals.

# Exponent terms, starting with the intercept
terms = [str(np.round(model.intercept_[0], decimals=2))]

# One signed term per coefficient; coefficients that are exactly zero are skipped
for position, beta in enumerate(model.coef_[0], start=1):
    if beta > 0:
        sign = '+'
    elif beta < 0:
        sign = '-'
    else:
        continue
    magnitude = np.abs(np.round(beta, decimals=2))
    terms.append(f"{sign}{magnitude!s}x_{{{position!s}}}")

tex = r"$$p = {\frac{1}{1 + e ^ {-(" + "".join(terms) + ")}}}$$"

widgets.HTMLMath(
    value=tex,
    placeholder='Logistic regression',
)

HTMLMath(value='$$p = {\\frac{1}{1 + e ^ {-(3.72-1.85x_{1}-1.22x_{2}-1.48x_{3}-0.4x_{4}-0.14x_{5}-0.14x_{6}-0.…