# Coffee Data: model training ☕☕☕

In [22]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text

%matplotlib inline

In [8]:
df = pd.read_csv("data/merged_data_cleaned.csv",  index_col=0)

In [9]:
# Normalise column names: lower-case them and turn dots into underscores.
# regex=False makes '.' a literal dot on every pandas version; without it the
# default differs between pandas releases and older ones emit a FutureWarning.
df.columns = df.columns.str.lower().str.replace('.', '_', regex=False)

  df.columns = df.columns.str.lower().str.replace('.', '_')


In [10]:
# Fill missing values, then drop any row that still has a gap.
#
# Each filled column is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form mutates a temporary
# Series, which pandas deprecates and which no longer updates `df` once
# Copy-on-Write is active.

# Numeric altitude columns: use the column mean.
# NOTE: the mean is taken over the whole frame, i.e. before the
# train/validation/test split that happens later in the notebook.
for column in ("altitude_low_meters", "altitude_high_meters", "altitude_mean_meters"):
    df[column] = df[column].fillna(df[column].mean())

# Descriptive columns: use an explicit "missing" marker.
for column in ("lot_number", "farm_name", "mill", "owner", "company", "producer", "ico_number"):
    df[column] = df[column].fillna("missing")

# Whatever is still missing in other columns removes the row.
df = df.dropna()

In [11]:
# Two-stage split with a fixed seed: 20% of the rows go to the test set, then
# 25% of the remainder goes to validation (0.8 * 0.25 = 0.2 of the total).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Show the size of each partition.
tuple(len(part) for part in (df_train, df_val, df_test))

(536, 179, 179)

In [12]:
# Replace the shuffled row labels of every partition with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [15]:
# Targets: log(1 + total_cup_points) for each partition.
y_train, y_val, y_test = (
    np.log1p(part["total_cup_points"]) for part in (df_train, df_val, df_test)
)

In [16]:
# Remove the target column from every partition so it cannot be used as a feature.
for part in (df_train, df_val, df_test):
    del part["total_cup_points"]

In [18]:
# Column names that make up the `green_analysis` feature group.
green_analysis = [
    "moisture",
    "category_one_defects",
    "quakers",
    "color",
    "category_two_defects",
]

In [17]:
# Column names that make up the `processing_features` feature group.
processing_features = [
    "species",
    "owner",
    "country_of_origin",
    "farm_name",
    "lot_number",
    "mill",
    "ico_number",
    "company",
    "altitude",
    "altitude_mean_meters",
    "region",
    "producer",
    "number_of_bags",
    "bag_weight",
    "in_country_partner",
    "harvest_year",
    "grading_date",
    "owner_1",
    "variety",
    "processing_method",
]

In [19]:
# Full list of model input columns: both feature groups, in order.
features = [*green_analysis, *processing_features]

### Model training

In [20]:
# One dict per training row (column name -> value), restricted to the selected
# feature columns, with any remaining NaN replaced by 0.
train_dicts = (
    df_train[features]
    .fillna(0)
    .to_dict(orient='records')
)

In [23]:
# Learn the feature mapping from the training records, then encode them as a
# dense matrix (sparse=False).
dv = DictVectorizer(sparse=False).fit(train_dicts)
X_train = dv.transform(train_dicts)

In [24]:
# Single-split (depth 1) regression tree fitted on the training matrix.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
dt

DecisionTreeRegressor(max_depth=1)

In [25]:
# Random forest with 10 trees, a fixed seed, and all available cores (n_jobs=-1).
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf = rf.fit(X_train, y_train)
rf

RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=1)

In [26]:
# Encode the validation rows exactly like the training rows: the same
# `features` column selection, the same NaN -> 0 fill, and the vectorizer that
# was already fitted on the training data (transform only, no refit).
# Selecting `features` explicitly keeps this in step with how `train_dicts`
# was built instead of relying on the vectorizer to ignore the other columns.
val_dicts = df_val[features].fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [27]:
# Random-forest predictions for the validation matrix (on the log1p target scale).
y_pred = rf.predict(X=X_val)

In [28]:
# Validation RMSE of the random forest on the log1p target.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
np.sqrt(mean_squared_error(y_val, y_pred))

0.027051807848965062

In [29]:
!pip install XGBoost

Collecting XGBoost
  Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
     |████████████████████████████████| 173.5 MB 36 kB/s              
Installing collected packages: XGBoost
Successfully installed XGBoost-1.5.0


In [30]:
import xgboost as xgb

In [31]:
# Names of the encoded columns, read from the fitted vectorizer.
# `feature_names_` is used because `get_feature_names()` has been removed from
# newer scikit-learn releases. The result is kept in its own variable so the
# `features` list of raw column names is not overwritten.
feature_names = list(dv.feature_names_)

# Wrap the training and validation matrices, with their targets, for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)



In [32]:
# Parameters passed to xgb.train.
xgb_params = dict(
    # tree / boosting settings
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    # learning task and threading
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [33]:
# Train the booster for 100 rounds on the training DMatrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

In [34]:
# XGBoost predictions for the validation DMatrix; this rebinds `y_pred`.
y_pred = model.predict(data=dval)

In [35]:
# Validation RMSE of the XGBoost model on the log1p target.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.026126506046486385

In [36]:
import pickle

In [38]:
# File the fitted vectorizer and model are written to.
output_file = 'model_v1.bin'

In [39]:
# Persist the vectorizer and the booster together as one pickled tuple so both
# can be restored from a single file.
with open(output_file, mode='wb') as model_file:
    pickle.dump((dv, model), model_file)