In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import seaborn as sns

#read in data
wine = pd.read_csv('wine_reviews.csv', encoding = "ISO-8859-1")

# drop columns
wine = wine.drop(columns=['Unnamed: 0', 'region_1', 'region_2', 'taster_twitter_handle', 'designation'])
# drop duplicates
wine = wine.drop_duplicates('description')
wine.shape

(119955, 9)

In [0]:
# drop outliers for price
# change price column from float to integer
data_minus_outliers = wine[wine['price'] < 500]


In [0]:
data_minus_outliers = data_minus_outliers.astype({"price": int}) 
data_minus_outliers['price'].dtype

dtype('int64')

In [0]:
cleaned_data = data_minus_outliers

In [0]:
# add vintage column
cleaned_data[['vintage']] = cleaned_data.title.str.extract(r'(\d{4})', expand=True)
# drop missing values
cleaned_data.dropna(inplace=True)
cleaned_data.shape

(85201, 10)

In [0]:
cleaned_data.dtypes

country        object
description    object
points          int64
price           int64
province       object
taster_name    object
title          object
variety        object
winery         object
vintage        object
dtype: object

In [0]:
cleaned_data=cleaned_data.astype({'vintage': int})
cleaned_data.shape

(85201, 10)

In [0]:
# add description length
cleaned_data = cleaned_data.assign(description_length = cleaned_data['description'].apply(len))


<h4>Predict points using price, vintage, description length</h4>

In [0]:
X = cleaned_data[['price', 'vintage', 'description_length']].values
y = cleaned_data[['points']].values

<h4>Multiple Linear Regression</h4>

In [0]:
#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
                      
#Normalizing the features
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Fitting Multiple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
mlrObj = LinearRegression()
mlrObj.fit(X_train,y_train)

#Predicting on the Test Set
y_pred = mlrObj.predict(X_test)

In [0]:
#Model Accuracy
print(mlrObj.score(X_test,y_test))

0.426037705384378


<h4>Decision Tree Regression</h4>

In [0]:
#Create and Fit a Decision Tree
from sklearn.tree import DecisionTreeRegressor
dtreg = DecisionTreeRegressor(max_depth=8)
dtreg.fit(X_train,y_train) 


DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [0]:
print(dtreg.score(X_test, y_test))

0.5251188282922559


<h4>Random Forest Regression</h4>

In [0]:
#Create and Fit a Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train,y_train)

  after removing the cwd from sys.path.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
print(rf.score(X_test, y_test))

0.38891807762930963


<h4>Predict price using points, vintage, description length</h4>

In [0]:
X = cleaned_data[['points', 'vintage', 'description_length']].values
y = cleaned_data[['price']].values

<h4> Multiple Linear Regression</h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
                      
#Normalizing the features
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Fitting Multiple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
mlrObj = LinearRegression()
mlrObj.fit(X_train,y_train)

#Predicting on the Test Set
y_pred = mlrObj.predict(X_test)

In [0]:
print(mlrObj.score(X_test, y_test))

0.2506921314555375


<h4>Decision Tree Regression</h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
dtreg = DecisionTreeRegressor(max_depth=8)
dtreg.fit(X_train,y_train) 

DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [0]:
print(dtreg.score(X_test, y_test))

0.33506961939829394


<h4>Random Forest Regression</h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
print(rf.score(X_test, y_test))

0.19706453080194963


<h3> Add categorical variables to models </h3>

In [0]:
# label encode country, variety columns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
  
le = LabelEncoder() 
  
cleaned_data['country']= le.fit_transform(cleaned_data['country']) 
cleaned_data['variety']= le.fit_transform(cleaned_data['variety']) 
cleaned_data['taster_name']=le.fit_transform(cleaned_data['taster_name'])

In [0]:
# Predict points using country, price, vintage, description length
X = cleaned_data[['country', 'price', 'vintage', 'description_length' ]].values
y = cleaned_data['points'].values

In [0]:
# One hot encode for X
onehotencoder= OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


<h4> Multiple Linear Regression with Country </h4>

In [0]:
#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
                      
#Fitting Multiple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
mlrObj = LinearRegression()
mlrObj.fit(X_train,y_train)

#Model Accuracy
print(mlrObj.score(X_test, y_test))

0.4817182672994284


<h4> Decision Tree with Country </h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
dtr = DecisionTreeRegressor(max_depth=8)
dtr.fit(X_train,y_train)
#Model Accuracy
print(dtr.score(X_test, y_test))

0.5430999698579158


<h4> Random Forest with Country </h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
rf = RandomForestRegressor(n_estimators=50)
rf.fit(X_train,y_train)
#Model Accuracy
print(rf.score(X_test, y_test))

0.5023356688310994


<h4> Multiple Linear Regression with Country, Taster Name, and Variety </h4>

In [0]:
# Predict points using country, variety, price, vintage, description length
X = cleaned_data[['country', 'taster_name', 'variety', 'price', 'vintage', 'description_length' ]].values
y = cleaned_data['points'].values

In [0]:
# One hot encode for X
onehotencoder= OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
                      
#Fitting Multiple Linear Regression to Training Set
mlrObj = LinearRegression()
mlrObj.fit(X_train,y_train)

#Model Accuracy
print(mlrObj.score(X_test, y_test))

0.4768770455321631


<h4> Decision Tree </h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
dtr = DecisionTreeRegressor(max_depth=8)
dtr.fit(X_train,y_train)
#Model Accuracy
print(dtr.score(X_test, y_test))

0.5403708388463264


<h4> Random Forest </h4>

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
rf = RandomForestRegressor(n_estimators=50)
rf.fit(X_train,y_train)
#Model Accuracy
print(rf.score(X_test, y_test))

0.5781117412140075
