# Coffee Data: Classification model training ☕☕☕

🕵🏿‍♀️ To break the problem down we are going to look at predicting if a give coffee sample will have a `total_cup_points` of over 85 given the green and processing data.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from data_prep import handle_na_values, split_data, convert_bag_weight, total_points_over_85
from train import train_logistic_regression, train_decision_tree, train_random_forest, check_feature_importance
from evaluate import validate_model, print_model_evaluation, model_evaluation

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_csv("../data/merged_data_cleaned.csv",  index_col=0)

In [3]:
df.columns = df.columns.str.lower().str.replace(".", "_")

  df.columns = df.columns.str.lower().str.replace(".", "_")


In [4]:
df = handle_na_values(df)

In [5]:
df["bag_weight"] = df["bag_weight"].apply(lambda weight_str: convert_bag_weight(weight_str))

In [6]:
numerical_features = [
    "moisture",
    "category_one_defects",
    "quakers",
    "category_two_defects",
    "altitude_mean_meters",
    "bag_weight"
]

categorical_features = [
    "color",
    "species",
    "owner",
    "country_of_origin",
    "farm_name",
    "mill",
    "company",
    "region",
    "producer",
    "in_country_partner",
    "harvest_year",
    "owner_1",
    "variety",
    "processing_method"
]

In [7]:
features = numerical_features + categorical_features

In [8]:
df_train, df_val, df_test, y_train, y_val, y_test, df_full_train = split_data(df, features)

length of training set: 536, validation set: 179, test set: 179


In [9]:
# set target y values to 1 if above 85 and 0 if 85 or lower total cup score
y_train = total_points_over_85(y_train)
y_val = total_points_over_85(y_val)
y_test = total_points_over_85(y_test)

### Model training

#### Logistic Regression

In [10]:
dv_lr, model_lr = train_logistic_regression(df_train, y_train)

In [11]:
y_pred_lr = validate_model(df_val, y_val, dv_lr, model_lr)

print_model_evaluation(y_val, y_pred_lr)

Accuracy: 0.95
Roc Auc: 0.55
Rsme: 0.22
Precision: 0.500
F-Measure: 0.182
Recall: 0.111


#### Decision Tree

In [12]:
dv_dt, model_dt = train_decision_tree(df_train, y_train)

In [13]:
y_pred_dt = validate_model(df_val, y_val, dv_dt, model_dt)

print_model_evaluation(y_val, y_pred_dt)

Accuracy: 0.95
Roc Auc: 0.50
Rsme: 0.22
Precision: 0.000
F-Measure: 0.000
Recall: 0.000


  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest

In [14]:
dv_rf, model_rf = train_random_forest(df_train, y_train)

In [15]:
y_pred_rf = validate_model(df_val, y_val, dv_dt, model_dt)

print_model_evaluation(y_val, y_pred_rf)

Accuracy: 0.95
Roc Auc: 0.50
Rsme: 0.22
Precision: 0.000
F-Measure: 0.000
Recall: 0.000


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
feature_set = numerical_features + categorical_features

In [17]:
scores_df = check_feature_importance(feature_set, df_train, y_train, df_val, y_val)
scores_df.sort_values(by=["accuracy", "auc", "rsme", "precision", "f1_score", "recall"], ascending=[0, 0, 1, 0 , 0, 0])

Unnamed: 0,feature_removed,accuracy,auc,rsme,precision,f1_score,recall
0,moisture,0.955307,0.555556,0.211407,1.0,0.2,0.111111
2,quakers,0.955307,0.555556,0.211407,1.0,0.2,0.111111
3,category_two_defects,0.955307,0.555556,0.211407,1.0,0.2,0.111111
7,species,0.955307,0.555556,0.211407,1.0,0.2,0.111111
8,owner,0.955307,0.555556,0.211407,1.0,0.2,0.111111
10,farm_name,0.955307,0.555556,0.211407,1.0,0.2,0.111111
12,company,0.955307,0.555556,0.211407,1.0,0.2,0.111111
13,region,0.955307,0.555556,0.211407,1.0,0.2,0.111111
14,producer,0.955307,0.555556,0.211407,1.0,0.2,0.111111
15,in_country_partner,0.955307,0.555556,0.211407,1.0,0.2,0.111111


In [18]:
features_subset = ["moisture", "quakers", "category_two_defects", "species", "owner", "farm_name", "company", "region", "producer", "in_country_partner", "harvest_year", "owner_1", "variety"]

In [19]:
dv_lr2, model_lr2 = train_logistic_regression(df_train[features_subset], y_train)
y_pred_lr2 = validate_model(df_val, y_val, dv_lr2, model_lr2)

print_model_evaluation(y_val, y_pred_lr2)

Accuracy: 0.95
Roc Auc: 0.55
Rsme: 0.22
Precision: 0.500
F-Measure: 0.182
Recall: 0.111


## Parameter Tuning

In [20]:
import sys
import warnings
import os

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

In [21]:
dv_lr, model_lr = train_logistic_regression(df_train, y_train)

In [22]:
scores = []

for random_state in [2, 10, 32, 42]:
    for c in [0.01, 0.1, 1, 10]:
        for no_iter in [100, 500, 1000, 10000]:
            dv_lr, model_lr = train_logistic_regression(df_train, y_train, c=c, max_iter=no_iter, random_state=random_state)

            y_pred_lr = validate_model(df_val, y_val, dv_lr, model_lr)

            scores.append((random_state, c, no_iter, *model_evaluation(y_val, y_pred_lr)))

cols = ["random_state", "c_value", "max_iter", "accuracy", "auc", "rsme", "precision", "f1_score", "recall"]

parameter_tuning_scores_df = pd.DataFrame(scores, columns=cols)
parameter_tuning_scores_df.sort_values(by=["accuracy", "auc", "rsme", "precision", "f1_score", "recall"], ascending=[0, 0, 1, 0 , 0, 0])

Unnamed: 0,random_state,c_value,max_iter,accuracy,auc,rsme,precision,f1_score,recall
8,2,1.00,100,0.949721,0.552614,0.224231,0.5,0.181818,0.111111
9,2,1.00,500,0.949721,0.552614,0.224231,0.5,0.181818,0.111111
10,2,1.00,1000,0.949721,0.552614,0.224231,0.5,0.181818,0.111111
11,2,1.00,10000,0.949721,0.552614,0.224231,0.5,0.181818,0.111111
12,2,10.00,100,0.949721,0.552614,0.224231,0.5,0.181818,0.111111
...,...,...,...,...,...,...,...,...,...
51,42,0.01,10000,0.949721,0.500000,0.224231,0.0,0.000000,0.000000
52,42,0.10,100,0.949721,0.500000,0.224231,0.0,0.000000,0.000000
53,42,0.10,500,0.949721,0.500000,0.224231,0.0,0.000000,0.000000
54,42,0.10,1000,0.949721,0.500000,0.224231,0.0,0.000000,0.000000


## Check model with test data

Final parameters chosen are c=1, max_iter=100, random_state=2

In [23]:
dv_final, model_final = train_logistic_regression(df_full_train[features_subset], total_points_over_85(df_full_train["total_cup_points"].values), c=1, max_iter=100, random_state=2)

y_pred_final = validate_model(df_test, y_test, dv_final, model_final)

print_model_evaluation(y_val, y_pred_final)

Accuracy: 0.94
Roc Auc: 0.50
Rsme: 0.24
Precision: 0.000
F-Measure: 0.000
Recall: 0.000


This model is compareable with the train set tested against the validation set so it doesn't appear to over fit. A gridsearch looking at different folds (splits) of the training and test set might reveal more.

In [25]:
df_full_train[features_subset].iloc[0]

moisture                                                     0.12
quakers                                                       0.0
category_two_defects                                           15
species                                                   Arabica
owner                                              sanjava coffee
farm_name                                                 various
company                                pt. shriya artha nusantara
region                                               sapan toraja
producer                                                vary farm
in_country_partner      Specialty Coffee Association of Indonesia
harvest_year                                                 2017
owner_1                                            SanJava Coffee
variety                                                  Sulawesi
Name: 583, dtype: object