In [1]:
# Imports
from common import display_input_data, check_input_data_indices, train_model, prepare_dataset, evaluate_model, \
    run_input_optimizer, calculate_accuracy, calculate_error_rate
import pandas as pd
import numpy as np
import ipywidgets as widgets
import pickle


In [2]:
# Parameters
# Choose the variables used to create the model
input_cols = [
    'shape',
    'profile_entrance',
    'profile_exit',
    'rising_entrance',
    'rising_exit',
    'shards',
    'feathering',
    'entrance_mounding',
    'center_mounding',
    'exit_mounding',
    'mounding',
]
# Column holding the class to predict
result_col = 'result'
# Training and validation spreadsheets
train_dataset_file = 'DATA/Train/serration_prediction_train.xlsx'
test_dataset_file = 'DATA/Validation/serration_prediction_validation.xlsx'

In [3]:
# Display the training data, restricted to the chosen input columns, as a table
display_input_data(train_dataset_file, input_cols)


Unnamed: 0,shape_E,shape_indeterminate,profile_entrance_V,profile_exit_V,rising_entrance_bilateral,rising_entrance_single,rising_exit_bilateral,rising_exit_single,shards_present,feathering_present,entrance_mounding_bilateral,entrance_mounding_single,center_mounding_bilateral,center_mounding_single,exit_mounding_bilateral,exit_mounding_single,mounding_marked,mounding_not marked
0,False,True,True,True,False,True,False,True,False,False,False,True,False,False,False,True,False,True
1,False,True,False,True,False,True,True,False,False,False,False,True,False,False,False,True,False,True
2,False,True,False,True,False,True,True,False,False,False,False,True,False,False,True,False,False,True
3,False,True,False,True,False,True,True,False,False,False,False,True,False,False,True,False,False,True
4,False,True,False,True,False,True,True,False,False,False,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,True,False,True,True,False,True,False,True,False,False,False,True,False,False,False,True,True,False
276,True,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False
277,True,False,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False
278,True,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [4]:
# Check that the training and validation datasets are compatible
check_input_data_indices(
    train_dataset_file,
    test_dataset_file,
    input_cols,
)

Datasets are compatible


In [5]:
# Logistic regression model: fit on the training dataset
model = train_model(
    train_dataset_file,
    input_cols,
    result_col
)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() left the handle dangling.
filename = 'models/blade-serration.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f'Saved model to {filename}')

# Fitted parameters: intercept (b0) and coefficients (b1...n)
print('b0 = ' + str(model.intercept_))
print('b1...n = ' + str(model.coef_))

Saved model to models/blade-serration.pickle
b0 = [2.94177671]
b1...n = [[-1.21639648 -1.28412923 -0.23924146 -2.3325252  -1.73212203 -1.63566348
  -1.3639484  -0.16607839  1.6963423   1.68430953 -0.56727603 -0.18639544
  -0.30974718 -0.53760101 -0.37406523  1.41657166  0.46015146  0.21973993]]


In [6]:
# Test for overfitting: score the model on the data it was trained on, so the
# value can be compared with the score obtained on the validation dataset
x_train, y_train = prepare_dataset(
    train_dataset_file,
    input_cols,
    result_col,
)
model.score(x_train, y_train)

0.9

In [7]:
# Evaluate the model, trained with all 11 input variables, on the validation dataset
evaluate_model(model, test_dataset_file, input_cols, result_col)

0.7164179104477612

In [8]:
# This cell was created to see whether serrated / non-serrated corresponds to
# False or True in the model output. True corresponds to serrated.
#
# Read the file configured in the parameters cell instead of repeating its
# path, so this cell always uses the same data the model was trained on.
df = pd.read_excel(train_dataset_file)

# Keep only the input columns, strip surrounding whitespace from every value
# and one-hot encode them (dropping the first level of each variable)
x_df = df[input_cols].apply(lambda col: col.str.strip())
x = pd.get_dummies(x_df, drop_first=True)

result = model.predict(x)
print(result)

[False False False False False False False False  True False False False
 False  True False False False False False False False False False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True  True  True  True  T

In [9]:
# Predict on the validation dataset.
# Read the file configured in the parameters cell instead of repeating its
# path, so a change of `test_dataset_file` is picked up here as well.
df = pd.read_excel(test_dataset_file)
label = df["label"]              # knife identifier of each row
string_expresult = df["result"]  # expected class as text

# Keep only the input columns, strip surrounding whitespace from every value
# and one-hot encode them (dropping the first level of each variable)
x_df = df[input_cols].apply(lambda col: col.str.strip())
x = pd.get_dummies(x_df, drop_first=True)

result = model.predict(x)

print(string_expresult)
print(result)


0     non-serrated
1     non-serrated
2     non-serrated
3     non-serrated
4     non-serrated
          ...     
62    non-serrated
63    non-serrated
64    non-serrated
65    non-serrated
66    non-serrated
Name: result, Length: 67, dtype: object
[ True  True  True False False False False False False False  True  True
  True False  True  True False False False  True  True  True  True  True
 False False False  True  True False False False False False False False
 False False False  True  True  True  True  True  True  True  True  True
  True  True  True False False False  True  True  True False False False
 False False  True  True  True  True False]


In [10]:
# Convert the textual expected results into booleans so they can be compared
# with the model predictions: True for "serrated", False for anything else.
# A comprehension is used so that no loop variable is left behind; the former
# loop variable `x` overwrote the feature matrix `x` built in the previous cell.
expected_result = [value == "serrated" for value in string_expresult]

print(expected_result)



[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False]


In [11]:
# Build a dataframe with, for every validation row, the knife label, the model
# prediction and the expected result
data = {
    'label': label,
    'result': result,
    'expected_result': expected_result,
}
df = pd.DataFrame(data)

# Group the rows by knife label
grouped_df = df.groupby("label")


In [12]:
# Accuracy for the serrated class: knives 5, 6 and 7
five, six, seven = (grouped_df.get_group(knife) for knife in (5, 6, 7))
serrated = pd.concat([five, six, seven], ignore_index=True)

print(f'serrated\n{serrated}\n')
serrated_accuracy = calculate_accuracy(serrated)

print("For serrated class")
print("accuracy %:", serrated_accuracy, "\n")


serrated
    label  result  expected_result
0       5    True             True
1       5    True             True
2       5    True             True
3       5    True             True
4       5    True             True
5       5    True             True
6       5    True             True
7       5    True             True
8       5    True             True
9       6    True             True
10      6    True             True
11      6    True             True
12      6   False             True
13      6   False             True
14      6   False             True
15      6    True             True
16      6    True             True
17      6    True             True
18      7    True             True
19      7    True             True
20      7    True             True
21      7    True             True
22      7   False             True
23      7   False             True
24      7   False             True
25      7    True             True
26      7    True             True

For serrat

In [13]:
# Print the rows of every knife group, each followed by a blank line
for knife, knife_rows in grouped_df:
    print(knife, knife_rows, "", sep="\n")

1
    label  result  expected_result
57      1   False            False
58      1   False            False
59      1   False            False
60      1   False            False
61      1   False            False
62      1    True            False
63      1    True            False
64      1    True            False
65      1    True            False
66      1   False            False

2
   label  result  expected_result
0      2    True            False
1      2    True            False
2      2    True            False
3      2   False            False
4      2   False            False
5      2   False            False
6      2   False            False
7      2   False            False
8      2   False            False
9      2   False            False

3
    label  result  expected_result
29      3   False            False
30      3   False            False
31      3   False            False
32      3   False            False
33      3   False            False
34      3   False      

In [14]:
# Accuracy for the non-serrated class: knives 1, 2, 3 and 4
one, two, three, four = (grouped_df.get_group(knife) for knife in (1, 2, 3, 4))
non_serrated = pd.concat([one, two, three, four], ignore_index=True)

print(f'non-serrated\n{non_serrated}\n')
non_serrated_accuracy = calculate_accuracy(non_serrated)

print("For non-serrated class")
print("accuracy %:", non_serrated_accuracy, "\n")


non-serrated
    label  result  expected_result
0       1   False            False
1       1   False            False
2       1   False            False
3       1   False            False
4       1   False            False
5       1    True            False
6       1    True            False
7       1    True            False
8       1    True            False
9       1   False            False
10      2    True            False
11      2    True            False
12      2    True            False
13      2   False            False
14      2   False            False
15      2   False            False
16      2   False            False
17      2   False            False
18      2   False            False
19      2   False            False
20      3   False            False
21      3   False            False
22      3   False            False
23      3   False            False
24      3   False            False
25      3   False            False
26      3   False            False
27     

In [15]:
# Accuracy for the non-serrated class, leaving out knife number 4
one, two, three = (grouped_df.get_group(knife) for knife in (1, 2, 3))
non_serrated = pd.concat([one, two, three], ignore_index=True)

print(f'non-serrated\n{non_serrated}\n')
non_serrated_accuracy = calculate_accuracy(non_serrated)

print("For non-serrated class")
print("accuracy %:", non_serrated_accuracy, "\n")

non-serrated
    label  result  expected_result
0       1   False            False
1       1   False            False
2       1   False            False
3       1   False            False
4       1   False            False
5       1    True            False
6       1    True            False
7       1    True            False
8       1    True            False
9       1   False            False
10      2    True            False
11      2    True            False
12      2    True            False
13      2   False            False
14      2   False            False
15      2   False            False
16      2   False            False
17      2   False            False
18      2   False            False
19      2   False            False
20      3   False            False
21      3   False            False
22      3   False            False
23      3   False            False
24      3   False            False
25      3   False            False
26      3   False            False
27     

In [16]:
# Classification accuracy for each knife
for knife, knife_rows in grouped_df:
    accuracy_perc = calculate_accuracy(knife_rows)

    print("knife:", knife)
    print("accuracy %:", accuracy_perc, "\n")
       


knife: 1
accuracy %: 60.0 

knife: 2
accuracy %: 70.0 

knife: 3
accuracy %: 100.0 

knife: 4
accuracy %: 40.0 

knife: 5
accuracy %: 100.0 

knife: 6
accuracy %: 66.66666666666666 

knife: 7
accuracy %: 66.66666666666666 



In [17]:
# Error rate for each knife
for knife, knife_rows in grouped_df:
    error_rate = calculate_error_rate(knife_rows)

    print("knife:", knife)
    print("error rate %:", error_rate, "\n")

knife: 1
error rate %: 40.0 

knife: 2
error rate %: 30.0 

knife: 3
error rate %: 0.0 

knife: 4
error rate %: 60.0 

knife: 5
error rate %: 0.0 

knife: 6
error rate %: 33.33333333333334 

knife: 7
error rate %: 33.33333333333334 



In [18]:
# Model optimisation: search for the optimal combination of input columns.
# NOTE(review): the optimizer receives the validation file; if it picks the
# best combination by its score on that file, the reported max score is an
# optimistic estimate for unseen data — confirm against `run_input_optimizer`.
optimizer_results = run_input_optimizer(train_dataset_file, test_dataset_file, input_cols, result_col)

# Print the score and columns of the best and the worst model
print('Max score: ' + str(optimizer_results['max_score']))
print('Best columns combination: ' + str(optimizer_results['max_score_cols']))
print('Min score: ' + str(optimizer_results['min_score']))
print('Worse columns combination: ' + str(optimizer_results['min_score_cols']))

# Convert from tuple to list for easier use later
optimized_input_cols = list(optimizer_results['max_score_cols'])
optimized_model = optimizer_results['max_score_model']

# Persist the best model. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() left the handle dangling.
filename = 'models/blade-serration-optimized.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(optimized_model, model_file)

print(f'Saved optimized model to {filename}')

Testing combinations of length 2
Testing combinations of length 3
Testing combinations of length 4
Testing combinations of length 5
Testing combinations of length 6
Testing combinations of length 7
Testing combinations of length 8
Testing combinations of length 9
Testing combinations of length 10
Testing combinations of length 11
Max score: 0.8805970149253731
Best columns combination: ('shape', 'profile_entrance', 'rising_entrance', 'shards', 'mounding')
Min score: 0.4626865671641791
Worse columns combination: ('feathering', 'entrance_mounding')
Saved optimized model to models/blade-serration-optimized.pickle


In [19]:
# Predict on the validation dataset with the optimized model.
# Read the file configured in the parameters cell instead of repeating its
# path, so a change of `test_dataset_file` is picked up here as well.
df = pd.read_excel(test_dataset_file)
label = df["label"]
# Vectorised conversion from str to bool: True where the expected class is 'serrated'
expected_result = df["result"] == 'serrated'

# Keep only the optimized input columns, strip surrounding whitespace from
# every value and one-hot encode them (dropping the first level of each variable)
x_df = df[optimized_input_cols].apply(lambda col: col.str.strip())
x = pd.get_dummies(x_df, drop_first=True)

result = optimized_model.predict(x)

print(f'Expected result:\n{expected_result}')
print(f'Actual result:\n{result}')

Expected result:
0     False
1     False
2     False
3     False
4     False
      ...  
62    False
63    False
64    False
65    False
66    False
Name: result, Length: 67, dtype: bool
Actual result:
[False  True  True False  True False False  True False False  True False
 False False False False False False False False  True  True  True  True
  True  True  True  True False False False False False False False False
 False False False  True  True  True  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True False False  True
 False False False False False False False]


In [20]:
# Build a dataframe with, for every validation row, the knife label, the
# optimized-model prediction and the expected result
data = {
    'label': label,
    'result': result,
    'expected_result': expected_result,
}
df = pd.DataFrame(data)
print(df)

# Group the rows by knife label and show the first rows of every group
grouped_df = df.groupby("label")
print(grouped_df.head())

    label  result  expected_result
0       2   False            False
1       2    True            False
2       2    True            False
3       2   False            False
4       2    True            False
..    ...     ...              ...
62      1   False            False
63      1   False            False
64      1   False            False
65      1   False            False
66      1   False            False

[67 rows x 3 columns]
    label  result  expected_result
0       2   False            False
1       2    True            False
2       2    True            False
3       2   False            False
4       2    True            False
10      4    True            False
11      4   False            False
12      4   False            False
13      4   False            False
14      4   False            False
20      7    True             True
21      7    True             True
22      7    True             True
23      7    True             True
24      7    True             Tr

In [21]:
# Optimized model: accuracy for the three serrated knives (5, 6 and 7)
five, six, seven = (grouped_df.get_group(knife) for knife in (5, 6, 7))
serrated = pd.concat([five, six, seven], ignore_index=True)

print(f'serrated\n{serrated}\n')
serrated_accuracy = calculate_accuracy(serrated)

print("For serrated class")
print("accuracy %:", serrated_accuracy, "\n")

serrated
    label  result  expected_result
0       5    True             True
1       5    True             True
2       5    True             True
3       5    True             True
4       5    True             True
5       5    True             True
6       5    True             True
7       5    True             True
8       5    True             True
9       6    True             True
10      6    True             True
11      6    True             True
12      6   False             True
13      6    True             True
14      6    True             True
15      6    True             True
16      6    True             True
17      6    True             True
18      7    True             True
19      7    True             True
20      7    True             True
21      7    True             True
22      7    True             True
23      7    True             True
24      7    True             True
25      7    True             True
26      7   False             True

For serrat

In [22]:
# Optimized model: accuracy for the four non-serrated knives (1, 2, 3 and 4)
one, two, three, four = (grouped_df.get_group(knife) for knife in (1, 2, 3, 4))
non_serrated = pd.concat([one, two, three, four], ignore_index=True)

print(f'non-serrated\n{non_serrated}\n')
non_serrated_accuracy = calculate_accuracy(non_serrated)

print("For non-serrated class")
print("accuracy %:", non_serrated_accuracy, "\n")

non-serrated
    label  result  expected_result
0       1   False            False
1       1   False            False
2       1    True            False
3       1   False            False
4       1   False            False
5       1   False            False
6       1   False            False
7       1   False            False
8       1   False            False
9       1   False            False
10      2   False            False
11      2    True            False
12      2    True            False
13      2   False            False
14      2    True            False
15      2   False            False
16      2   False            False
17      2    True            False
18      2   False            False
19      2   False            False
20      3   False            False
21      3   False            False
22      3   False            False
23      3   False            False
24      3   False            False
25      3   False            False
26      3   False            False
27     

In [23]:
# Optimized model: accuracy and error rate for each knife
for knife, knife_rows in grouped_df:
    accuracy_perc = calculate_accuracy(knife_rows)
    error_rate_perc = calculate_error_rate(knife_rows)

    print("knife:", knife)
    print("accuracy %:", accuracy_perc)
    print("error rate %:", error_rate_perc, "\n")

knife: 1
accuracy %: 90.0
error rate %: 10.0 

knife: 2
accuracy %: 60.0
error rate %: 40.0 

knife: 3
accuracy %: 100.0
error rate %: 0.0 

knife: 4
accuracy %: 90.0
error rate %: 10.0 

knife: 5
accuracy %: 100.0
error rate %: 0.0 

knife: 6
accuracy %: 88.88888888888889
error rate %: 11.111111111111114 

knife: 7
accuracy %: 88.88888888888889
error rate %: 11.111111111111114 



In [24]:
# Render the fitted logistic regression as a LaTeX formula of the form
#   p = 1 / (1 + e^-(b0 + b1*x_1 + ... + bn*x_n))
# with the intercept and the coefficients rounded to two decimals.

# The exponent starts with the intercept
terms = [str(np.round(model.intercept_[0], decimals=2))]

# Then one signed term per coefficient; x_k is numbered by coefficient position
for position, coefficient in enumerate(model.coef_[0], start=1):
    if not (coefficient < 0 or coefficient > 0):
        # A coefficient that is neither negative nor positive adds no term
        continue
    sign = '-' if coefficient < 0 else '+'
    magnitude = np.abs(np.round(coefficient, decimals=2))
    terms.append(sign + str(magnitude) + "x_{" + str(position) + "}")

tex = r"$$p = {\frac{1}{1 + e ^ {-(" + "".join(terms) + ")}}}$$"

widgets.HTMLMath(
    value=tex,
    placeholder='Logistic regression',
)

HTMLMath(value='$$p = {\\frac{1}{1 + e ^ {-(2.94-1.22x_{1}-1.28x_{2}-0.24x_{3}-2.33x_{4}-1.73x_{5}-1.64x_{6}-1…