<a href="https://colab.research.google.com/github/rynedaniels/project_2/blob/main/Project2Part4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Import Packages/Load Data**##

In [1]:
# import the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, \
ConfusionMatrixDisplay, classification_report
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [2]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# load the dataset
df = pd.read_csv('/content/drive/MyDrive/my_files/spanish_wines.csv')
df.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


##**Data Cleaning**##

In [4]:
# previewing our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7498 non-null   object 
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 644.7+ KB


In [5]:
# drop duplicates
df = df.drop_duplicates()

In [6]:
# dropping the 'wine' column due to high cardinality
# dropping the 'country' column as all the values are the same
# also dropping the rows where 'type' is empty as this is a critical feature in our regression analytics
df = df.drop(columns=['country', 'wine'])
df = df.dropna(subset=['type'])

In [7]:
# handling error
df['year'] = df['year'].replace('N.V.', np.nan)
df.dropna(subset=['year'], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1877 entries, 0 to 6100
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       1877 non-null   object 
 1   year         1877 non-null   object 
 2   rating       1877 non-null   float64
 3   num_reviews  1877 non-null   int64  
 4   region       1877 non-null   object 
 5   price        1877 non-null   float64
 6   type         1877 non-null   object 
 7   body         1714 non-null   float64
 8   acidity      1714 non-null   float64
dtypes: float64(4), int64(1), object(4)
memory usage: 146.6+ KB


In [9]:
df['year'] = df['year'].astype(int)

###**Analyzing our Numerical and Categorical columns**###

In [10]:
# previewing numerical columns
df.describe()

Unnamed: 0,year,rating,num_reviews,price,body,acidity
count,1877.0,1877.0,1877.0,1877.0,1714.0,1714.0
mean,2010.869473,4.404688,579.998934,142.612586,4.263127,2.928238
std,11.300344,0.148552,1181.990378,282.560181,0.660949,0.313328
min,1910.0,4.2,25.0,6.26,2.0,1.0
25%,2010.0,4.3,60.0,32.55,4.0,3.0
50%,2015.0,4.4,151.0,55.14,4.0,3.0
75%,2017.0,4.5,513.0,117.3,5.0,3.0
max,2021.0,4.9,16505.0,3119.08,5.0,3.0


In [11]:
# previewing numerical columns
df.describe(exclude='number')

Unnamed: 0,winery,region,type
count,1877,1877,1877
unique,410,65,21
top,Vega Sicilia,Ribera del Duero,Ribera Del Duero Red
freq,96,538,534


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1877 entries, 0 to 6100
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       1877 non-null   object 
 1   year         1877 non-null   int64  
 2   rating       1877 non-null   float64
 3   num_reviews  1877 non-null   int64  
 4   region       1877 non-null   object 
 5   price        1877 non-null   float64
 6   type         1877 non-null   object 
 7   body         1714 non-null   float64
 8   acidity      1714 non-null   float64
dtypes: float64(4), int64(2), object(3)
memory usage: 146.6+ KB


###**Feature Engineering**###

In [13]:
# the results of the cell below shows the percentages of each vintage in our dataset
year_full = df['year'].value_counts(normalize=True).reset_index()


with pd.option_context('display.max_rows', None):
    print(year_full)

    index      year
0    2016  0.125733
1    2017  0.118807
2    2015  0.106553
3    2018  0.093767
4    2014  0.083644
5    2011  0.051678
6    2012  0.049014
7    2019  0.048482
8    2010  0.039425
9    2013  0.034630
10   2009  0.024507
11   2007  0.020245
12   2004  0.019180
13   2005  0.019180
14   2006  0.015983
15   2008  0.015983
16   2001  0.011188
17   2020  0.009590
18   2000  0.009057
19   2003  0.006926
20   1994  0.006926
21   1995  0.006926
22   1998  0.006393
23   2002  0.006393
24   1996  0.005860
25   1999  0.005328
26   1982  0.004262
27   1985  0.003729
28   1989  0.003729
29   1986  0.003197
30   1964  0.003197
31   2021  0.003197
32   1968  0.002664
33   1987  0.002664
34   1970  0.002664
35   1981  0.002131
36   1973  0.001598
37   1997  0.001598
38   1991  0.001598
39   1980  0.001066
40   1959  0.001066
41   1988  0.001066
42   1975  0.001066
43   1965  0.001066
44   1990  0.001066
45   1955  0.001066
46   1983  0.001066
47   1979  0.001066
48   1946  0.001066


In [14]:
# a new dataframe is created for the all the data after the year 2002 
df_filtered = df.loc[df['year'] >= 2002]
df_filtered.head()

Unnamed: 0,winery,year,rating,num_reviews,region,price,type,body,acidity
0,Teso La Monja,2013,4.9,58,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,2018,4.9,31,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,2009,4.8,1793,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
6,Vega Sicilia,2010,4.8,1201,Ribera del Duero,349.0,Ribera Del Duero Red,5.0,3.0
8,Vega Sicilia,2015,4.8,643,Ribera del Duero,345.0,Ribera Del Duero Red,5.0,3.0


In [15]:
df_filtered.isna().sum()

winery           0
year             0
rating           0
num_reviews      0
region           0
price            0
type             0
body           158
acidity        158
dtype: int64

##**Preprocessing**##

In [16]:
scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

num_pipe = make_pipeline(mean_imputer, scaler)
cat_pipe = make_pipeline(freq_imputer, ohe)

preprocessor = make_column_transformer((num_pipe, num_selector), (cat_pipe, cat_selector))

preprocessor

##**Initial Modeling and Evaluating**##

In [17]:
def eval_regression(true, pred, name='Model'):
    
    """Evaluates true and predicted values from a regression model.  
    Outputs a dataframe of metrics"""
    
    scores = pd.DataFrame()
    scores['Model Name'] = [name]
    scores['RMSE'] = [np.sqrt(mean_squared_error(true, pred))]
    scores['MAE'] = [mean_absolute_error(true, pred)]
    scores['R2'] = [r2_score(true, pred)]
    scores.set_index('Model Name', inplace=True)

    return scores

In [18]:
y = df_filtered['rating']
X = df_filtered.drop(columns='rating')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
lin_reg = LinearRegression()
lin_reg_pipe = make_pipeline(preprocessor, lin_reg)
lin_reg_pipe.fit(X_train, y_train)
lin_reg_y_pred =lin_reg_pipe.predict(X_test)

eval_regression(y_test, lin_reg_y_pred, name='Linear Regression Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Linear Regression Regressor:,242866700000.0,52845800000.0,-3.255144e+24


In [20]:
rf_reg = RandomForestRegressor(n_estimators=150, random_state=42)
rf_reg_pipe = make_pipeline(preprocessor, rf_reg)
rf_reg_pipe.fit(X_train, y_train)
rf_reg_pipe_y_pred = rf_reg_pipe.predict(X_test)

eval_regression(y_test, rf_reg_pipe_y_pred, name='RandomForest Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RandomForest Regressor:,0.092941,0.07266,0.523295


In [21]:
dec_tree = DecisionTreeRegressor(random_state = 42)
dec_tree_pipe = make_pipeline(preprocessor, dec_tree)
dec_tree_pipe.fit(X_train, y_train)
dec_tree_y_pred = dec_tree_pipe.predict(X_test)

eval_regression(y_test, dec_tree_y_pred, name='DecisionTree Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DecisionTree Regressor:,0.134057,0.091408,0.00822


In [22]:
bag_reg = BaggingRegressor(random_state = 42)
bag_reg_pipe = make_pipeline(preprocessor, bag_reg)
bag_reg_pipe.fit(X_train, y_train)
bag_reg_y_pred = bag_reg_pipe.predict(X_test)

eval_regression(y_test, bag_reg_y_pred, name='BaggedTree Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaggedTree Regressor:,0.09557,0.073365,0.495944


In [23]:
xgb_reg = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
xgb_reg_pipe = make_pipeline(preprocessor, xgb_reg)
xgb_reg_pipe.fit(X_train, y_train)
xgb_reg_pipe_y_pred = xgb_reg_pipe.predict(X_test)

eval_regression(y_test, xgb_reg_pipe_y_pred, name='XGB Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XGB Regressor:,0.092953,0.07326,0.523174


In [24]:
data = {'Model Name': ['Linear Regression Regressor', 'RandomForest Regressor', 'DecisionTree Regressor', 'BaggedTree Regressor', 'XGB Regressor'],
        'RMSE': [1.218042e+11, 0.093025, 0.124743, 0.097686, 0.090884],
        'MAE': [4.180978e+10, 0.072844, 0.085919, 0.075656, 0.072697],
        'R2': [-8.187646e+23, 0.522433, 0.141248, 0.473378, 0.544167]}

initial_metrics_df = pd.DataFrame(data)

print(initial_metrics_df)

                    Model Name          RMSE           MAE            R2
0  Linear Regression Regressor  1.218042e+11  4.180978e+10 -8.187646e+23
1       RandomForest Regressor  9.302500e-02  7.284400e-02  5.224330e-01
2       DecisionTree Regressor  1.247430e-01  8.591900e-02  1.412480e-01
3         BaggedTree Regressor  9.768600e-02  7.565600e-02  4.733780e-01
4                XGB Regressor  9.088400e-02  7.269700e-02  5.441670e-01


##**Further Modeling and Evaluation (Hyperparameter Tuning)**##

The RandomForest Regressor and XGB Regressor have emerged as the most suitable candidates for further optimization through hyperparameter tuning. The results above indicate a rather weak linear correlation, which explains why these two models, known for their ability to capture complex relationships in data, have outperformed other models. As we continue to refine these models using hyperparameter tuning, we can further enhance their capacity to model the underlying patterns in the dataset and ultimately improve their predictive accuracy.

In [25]:
rf_param_grid = {
    'randomforestregressor__n_estimators': [50, 100, 200, 300],
    'randomforestregressor__max_depth': [None, 10, 20, 30, 40],
    'randomforestregressor__min_samples_split': [2, 5, 10, 15],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
};

xgb_param_grid = {
    'xgbregressor__n_estimators': [100, 200, 300, 400],
    'xgbregressor__max_depth': [3, 6, 9, 12],
    'xgbregressor__learning_rate': [0.001, 0.01, 0.1, 0.2],
    'xgbregressor__subsample': [0.5, 0.75, 1],
    'xgbregressor__colsample_bytree': [0.5, 0.75, 1],
    'xgbregressor__gamma': [0, 0.25, 0.5, 1],
};

rf_pipeline = make_pipeline(preprocessor, rf_reg)
xgb_pipeline = make_pipeline(preprocessor, xgb_reg)

n_iter_search = 150
rf_random_search = RandomizedSearchCV(rf_pipeline, param_distributions=rf_param_grid, n_iter=n_iter_search)
xgb_random_search = RandomizedSearchCV(xgb_pipeline, param_distributions=xgb_param_grid, n_iter=n_iter_search)

rf_random_search.fit(X_train, y_train)
xgb_random_search.fit(X_train, y_train)

best_rf_model = rf_random_search.best_estimator_
best_xgb_model = xgb_random_search.best_estimator_

In [26]:
best_rf_y_pred = best_rf_model.predict(X_test)
best_xgb_y_pred = best_xgb_model.predict(X_test)

eval_regression(y_test, best_rf_y_pred, name='Best RandomForest Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Best RandomForest Regressor:,0.092137,0.073544,0.531505


In [27]:
eval_regression(y_test, best_xgb_y_pred, name='Best XGBoost Regressor:')

Unnamed: 0_level_0,RMSE,MAE,R2
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Best XGBoost Regressor:,0.088178,0.071941,0.5709


Although the XGBoost Regressor outperforms the RandomForest Regressor, its R2 score is still relatively low, indicating a limited ability to capture the underlying non-linear relationships in the data. To improve model performance, it is recommended to gather more data to help the machine learning algorithm better understand the complex relationships within the dataset. Nevertheless, considering the current results, the XGBoost Regressor should be implemented for its superior performance over the RandomForest Regressor.