# Coffee Data: model training ☕☕☕

In [26]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor, export_text

import xgboost as xgb

from data_prep import handle_na_values, split_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_csv("../data/merged_data_cleaned.csv",  index_col=0)

In [3]:
df.columns = df.columns.str.lower().str.replace(".", "_")

  df.columns = df.columns.str.lower().str.replace(".", "_")


In [4]:
df = handle_na_values(df)

In [42]:
numerical_features = [
    "moisture",
    "category_one_defects",
    "quakers",
    "category_two_defects",
    "altitude_mean_meters"
#     "bag_weight"
]

categorical_features = [
    "color",
    "species",
    "owner",
    "country_of_origin",
    "farm_name",
    "mill",
    "company",
    "region",
    "producer",
    "in_country_partner",
    "harvest_year",
    "owner_1",
    "variety",
    "processing_method"
]

In [43]:
features = numerical_features + categorical_features

In [44]:
df_train, df_val, df_test, y_train, y_val, y_test, df_full_train = split_data(df, features)

length of training set: 536, validation set: 179, test set: 179


### Model training

#### Logistic Regression

In [45]:
def train(df, y_train):
    dicts = df.to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model

In [46]:
dv, model = train(df_train[features], y_train)

ValueError: Unknown label type: 'continuous'

#### Decision Tree

In [47]:
train_dicts = df_train[features].fillna(0).to_dict(orient='records')

In [48]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [49]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=1)

In [50]:
print(export_text(dt, feature_names=dv.get_feature_names()))

|--- category_two_defects <= 13.50
|   |--- value: [82.17]
|--- category_two_defects >  13.50
|   |--- value: [78.52]





In [51]:
dt.predict(df_val)



ValueError: could not convert string to float: 'Green'

#### Random Forest

In [15]:
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=1)

In [16]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val= dv.transform(val_dicts)

In [17]:
y_pred = rf.predict(X_val)

In [18]:
mean_squared_error(y_val, y_pred, squared=False)

0.027051807848965024

#### Xg Boost

In [32]:
features = dv.get_feature_names_out()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [22]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

In [23]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [24]:
y_pred = model.predict(dval)

In [25]:
rmse = mean_squared_error(y_val, y_pred, squared=False)
rmse

0.026126506046486385

### Parameter Tuning