In [1]:
# IMPORT
import numpy as np
import pandas as pd
from scipy.stats import loguniform, randint, uniform
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR, SVC
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from scipy.stats import expon, lognorm, loguniform, randint, uniform, norm
import pickle

In [2]:
# Read train and test data files
train_df = pd.read_csv('../data/processed/train.csv')
test_df = pd.read_csv('../data/processed/test.csv')

In [3]:
X_train, y_train = train_df.drop(columns=['Rating']), train_df['Rating']
X_test, y_test = test_df.drop(columns=['Rating']), test_df['Rating']

In [4]:
# column transform
numeric_features = ['Cocoa_Percent']
categorical_features = ['Company_(Manufacturer)', 'Company_Location', 'Country_of_Bean_Origin']
text_features = 'Most_Memorable_Characteristics'
drop_features = ['REF', 'Review_Date', 'Specific_Bean_Origin_or_Bar_Name', 'Ingredients']

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    (CountVectorizer(), text_features),
    ("drop", drop_features)
)

In [5]:
# dummy model baseline
cross_val_results = {}
dr = DummyRegressor()
cross_val_results['dummy'] = pd.DataFrame(cross_validate(
    dr, X_train, y_train,
    return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['dummy']

Unnamed: 0,mean,std
fit_time,0.001,0.0
score_time,0.0,0.0
test_score,-0.011,0.012
train_score,0.0,0.0


In [6]:
svr_pipe = make_pipeline(preprocessor, SVR())

In [7]:
cross_val_results['svr'] = pd.DataFrame(cross_validate(
    svr_pipe, X_train, y_train,
    return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['svr']

Unnamed: 0,mean,std
fit_time,0.159,0.006
score_time,0.032,0.001
test_score,0.463,0.037
train_score,0.926,0.003


In [8]:
#hyperparameter tuning
preprocessor.fit(X_train, y_train)

len_vocab = len(preprocessor.named_transformers_['countvectorizer'].get_feature_names_out())

param_dist_svr = {'columntransformer__countvectorizer__max_features': randint(100, len_vocab),
            'svr__gamma' : loguniform(1e-5, 1e3),
              'svr__C' : loguniform(1e-3, 1e3),
              'svr__degree': randint(2, 5)          
}

In [9]:
random_search_svr = RandomizedSearchCV(
    svr_pipe, 
    param_dist_svr, 
    n_jobs=-1, 
    n_iter=20,
    random_state=522)

random_search_svr.fit(X_train, y_train)

In [10]:
# best hyperparameters
best_parameters_svr = random_search_svr.best_params_
best_parameters_svr

{'columntransformer__countvectorizer__max_features': 463,
 'svr__C': 279.738059368519,
 'svr__degree': 4,
 'svr__gamma': 0.11980535256275426}

In [11]:
# Report the best score of the svr
best_score_svr = random_search_svr.best_score_
best_score_svr

0.47399913524243187

In [12]:
# Add results to the cross_val_results dictionary

preprocessor_svr_tun = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    (CountVectorizer(max_features=276), text_features),
    ("drop", drop_features)
)

svr_pipe_tun = make_pipeline(preprocessor_svr_tun, SVR(C=279.738,degree=4,gamma=0.119)) 

cross_val_results['svr_tuning'] = pd.DataFrame(cross_validate(
    svr_pipe_tun, X_train, y_train, cv=5,
    return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['svr_tuning']

# Show the results of all models
pd.concat(cross_val_results, axis=1)


Unnamed: 0_level_0,dummy,dummy,svr,svr,svr_tuning,svr_tuning
Unnamed: 0_level_1,mean,std,mean,std,mean,std
fit_time,0.001,0.0,0.159,0.006,0.169,0.007
score_time,0.0,0.0,0.032,0.001,0.032,0.001
test_score,-0.011,0.012,0.463,0.037,0.47,0.039
train_score,0.0,0.0,0.926,0.003,0.96,0.001


In [13]:
svr_pipe_tun.fit(X_train, y_train)
svr_pipe_tun.score(X_test,y_test)

0.42489988723569294

In [14]:
#### Ridge Analysis

ridge_pipe = make_pipeline(preprocessor, Ridge())

In [15]:
cross_val_results['ridge'] = pd.DataFrame(cross_validate(
    ridge_pipe, X_train, y_train,
    return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['ridge']

Unnamed: 0,mean,std
fit_time,0.036,0.009
score_time,0.017,0.018
test_score,0.432,0.017
train_score,0.8,0.004


In [16]:
#hyperparameter tuning for ridge
preprocessor.fit(X_train, y_train)

len_vocab = len(preprocessor.named_transformers_['countvectorizer'].get_feature_names_out())

param_dist_ridge = {'columntransformer__countvectorizer__max_features': randint(100, len_vocab),
            'ridge__alpha': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]
}

In [17]:
random_search_ridge = RandomizedSearchCV(
    ridge_pipe, 
    param_dist_ridge, 
    n_jobs=-1, 
    n_iter=20,
    random_state=522)

random_search_ridge.fit(X_train, y_train)

In [18]:
# best hyperparameters ridge
best_parameters_ridge = random_search_ridge.best_params_
best_parameters_ridge

{'columntransformer__countvectorizer__max_features': 476, 'ridge__alpha': 5}

In [19]:
# Report the best score of ridge
best_score_ridge = random_search_ridge.best_score_
best_score_ridge

0.47849996154479235

In [20]:
# Add results to the cross_val_results dictionary

preprocessor_ridge_tun = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    (CountVectorizer(max_features=476), text_features),
    ("drop", drop_features)
)

ridge_pipe_tun = make_pipeline(preprocessor_ridge_tun, Ridge(alpha=5)) 

cross_val_results['ridge_tuning'] = pd.DataFrame(cross_validate(
    ridge_pipe_tun, X_train, y_train, cv=5,
    return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['ridge_tuning']

# Show the results of all models
pd.concat(cross_val_results, axis=1)

Unnamed: 0_level_0,dummy,dummy,svr,svr,svr_tuning,svr_tuning,ridge,ridge,ridge_tuning,ridge_tuning
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
fit_time,0.001,0.0,0.159,0.006,0.169,0.007,0.036,0.009,0.026,0.005
score_time,0.0,0.0,0.032,0.001,0.032,0.001,0.017,0.018,0.011,0.003
test_score,-0.011,0.012,0.463,0.037,0.47,0.039,0.432,0.017,0.478,0.02
train_score,0.0,0.0,0.926,0.003,0.96,0.001,0.8,0.004,0.699,0.003
