In [340]:
# Initial imports.
import pandas as pd
import numpy as np
from numpy import random
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [7]:
weatherfile = '/content/winery_weather.csv'
wineweather_df = pd.read_csv(weatherfile, index_col=0)
wineweather_df.head()

Unnamed: 0,Winery,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm)
0,St. Julian winery US,269.6375,1018.15,79.24,0.0725
1,Sweet Cheeks winery US,276.025,1019.8875,85.4525,0.0575
2,Kirkland Signature winery US,277.375,1020.0125,79.1975,0.235
3,Louis M. Martini winery US,281.0325,1018.5,79.155,0.0775
4,Richard Böcking winery Germany,275.71,1014.9425,92.215,0.135


In [6]:
wineweather_df.dtypes

Unnamed: 0                            int64
Winery                               object
Average Temperature (Kelvin)        float64
Average Air Pressure (hPa)          float64
Average Humidity (%)                float64
Average Daily Precipitation (mm)    float64
dtype: object

In [12]:
scorefile = '/content/winemag_top8score.csv'
readscore = pd.read_csv(scorefile)
score_df = readscore.drop(columns=['Unnamed: 0'])
score_df.head()

Unnamed: 0,country,points,price,title,variety,winery
0,US,87,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
1,US,87,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
2,US,87,19.0,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
3,US,87,34.0,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini
4,US,87,12.0,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou


In [13]:
score_df.dtypes

country     object
points       int64
price      float64
title       object
variety     object
winery      object
dtype: object

In [34]:
wineweather_df['score'] = score_df['points']
wineweather_df['variety'] = score_df['variety']
wineweather_df.head()

Unnamed: 0,Winery,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),score,variety
0,St. Julian winery US,269.6375,1018.15,79.24,0.0725,87,Riesling
1,Sweet Cheeks winery US,276.025,1019.8875,85.4525,0.0575,87,Pinot Noir
2,Kirkland Signature winery US,277.375,1020.0125,79.1975,0.235,87,Cabernet Sauvignon
3,Louis M. Martini winery US,281.0325,1018.5,79.155,0.0775,87,Cabernet Sauvignon
4,Richard Böcking winery Germany,275.71,1014.9425,92.215,0.135,87,Chardonnay


In [20]:
wineweather_df.dtypes

Winery                               object
Average Temperature (Kelvin)        float64
Average Air Pressure (hPa)          float64
Average Humidity (%)                float64
Average Daily Precipitation (mm)    float64
score                                 int64
variety                              object
dtype: object

In [135]:
wineweather_df.describe()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),score
count,7892.0,7892.0,7892.0,7892.0,7892.0
mean,280.023354,1016.093086,75.890336,0.107172,88.535859
std,6.47867,9.364375,9.738065,0.076782,3.165499
min,263.445,907.9875,35.68,0.0,80.0
25%,276.4775,1016.24,71.7725,0.055,86.0
50%,279.685,1018.895,78.3575,0.09375,88.0
75%,281.7425,1020.2575,81.31,0.145,91.0
max,300.575,1027.6075,92.215,0.9175,99.0


In [111]:
wineweather_df['score'].describe()

count    7892.000000
mean       88.535859
std         3.165499
min        80.000000
25%        86.000000
50%        88.000000
75%        91.000000
max        99.000000
Name: score, dtype: float64

In [112]:
wineweather_df['score'].unique()

array([87, 86, 85, 88, 92, 91, 90, 89, 83, 82, 81, 80, 97, 96, 95, 93, 94,
       84, 99, 98])

In [138]:
wineweather_df['variety'].value_counts()

Pinot Noir            1902
Chardonnay            1619
Cabernet Sauvignon    1418
Sauvignon Blanc        747
Riesling               744
Syrah                  588
Rosé                   441
Merlot                 433
Name: variety, dtype: int64

In [37]:
#Features of all
Xall = wineweather_df.copy()
Xall = Xall.drop(columns=['Winery','score','variety'])
Xall.head()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm)
0,269.6375,1018.15,79.24,0.0725
1,276.025,1019.8875,85.4525,0.0575
2,277.375,1020.0125,79.1975,0.235
3,281.0325,1018.5,79.155,0.0775
4,275.71,1014.9425,92.215,0.135


In [109]:
#Target
yall = wineweather_df['score'].ravel()
yall

array([87, 87, 87, ..., 91, 91, 91])

In [113]:
#Split
Xall_train, Xall_test, yall_train, yall_test = train_test_split(Xall,yall,random_state=99)

In [122]:
#Scaler made and fit
scaler = StandardScaler()
Xall_scaler = scaler.fit(Xall_train)

# Scaling the data
Xall_train_scaled = Xall_scaler.transform(X_train)
Xall_test_scaled = Xall_scaler.transform(X_test)

In [123]:
Xall_train_scaled

array([[-0.49956474,  0.05001811,  0.98858686,  0.16570393],
       [-1.65817618,  0.51801975,  0.34942681, -0.94122207],
       [ 0.03599032,  0.01403834,  0.01580638, -1.03889201],
       ...,
       [-1.73044291, -0.80899903,  0.80773625, -1.00633536],
       [ 0.04028726,  0.43759437, -0.96264177, -1.23423189],
       [ 0.28130656,  0.32409934, -0.57028355,  0.23081722]])

In [124]:
#Init model
rf_model = RandomForestClassifier(n_estimators=140, random_state=99) 

In [None]:
#https://courses.bootcampspot.com/courses/488/pages/17-dot-8-3-fit-the-model-make-predictions-and-evaluate-results?module_item_id=117963

In [140]:
# Fitting the model
rf_model_all = rf_model.fit(Xall_train, yall_train)

In [141]:
# Making predictions using the testing data.
predictions_all = rf_model_all.predict(Xall_test)
predictions_all

array([89, 87, 90, ..., 88, 87, 86])

In [90]:
#i dont think this is that useful
# Calculating the confusion matrix.
#cm_all = confusion_matrix(yall_test, predictions_all)
#cm_all
# Create a DataFrame from the confusion matrix.
#cmall_df = pd.DataFrame(
#    cm_all, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

#cmall_df

In [142]:
# Calculating the accuracy score.
acc_score_all = accuracy_score(yall_test, predictions_all)
acc_score_all

0.10745058286872783

In [143]:
# Calculate feature importance in the Random Forest model.
importances = rf_model_all.feature_importances_
importances

array([0.25711982, 0.25596895, 0.2614882 , 0.22542302])

In [144]:
# We can sort the features by their importance.
sorted(zip(rf_model_all.feature_importances_, Xall.columns), reverse=True)

[(0.26148820491838354, 'Average Humidity (%)'),
 (0.2571198236390153, 'Average Temperature (Kelvin)'),
 (0.2559689538869403, 'Average Air Pressure (hPa)'),
 (0.22542301755566085, 'Average Daily Precipitation (mm)')]

In [None]:
#none of these are particularly weighted comparitively high. let's boost

In [146]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=4,
   max_depth=25,
   random_state=99)
   classifier.fit(Xall_train, yall_train)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           Xall_train,
           yall_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           Xall_test,
           yall_test)))

Learning rate:  0.05
Accuracy score (training): 0.215
Accuracy score (validation): 0.110
Learning rate:  0.1
Accuracy score (training): 0.215
Accuracy score (validation): 0.109
Learning rate:  0.25
Accuracy score (training): 0.212
Accuracy score (validation): 0.106
Learning rate:  0.5
Accuracy score (training): 0.211
Accuracy score (validation): 0.098
Learning rate:  0.75
Accuracy score (training): 0.208
Accuracy score (validation): 0.100
Learning rate:  1
Accuracy score (training): 0.200
Accuracy score (validation): 0.101


In [189]:
#trying SMOTE first to get better groupings. after this I'm going to try by variety instead
#i don't know what this means. damn it. ok let's try isolating variety
#from imblearn.over_sampling import SMOTE
#Xall_resampled, yall_resampled = SMOTE(random_state=99,
#sampling_strategy='auto').fit_resample(
#   Xall_train, yall_train)

In [155]:
#Feature df
X = wineweather_df.copy()
X = X.drop(columns=['Winery'])
X.head()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),score,variety
0,269.6375,1018.15,79.24,0.0725,87,Riesling
1,276.025,1019.8875,85.4525,0.0575,87,Pinot Noir
2,277.375,1020.0125,79.1975,0.235,87,Cabernet Sauvignon
3,281.0325,1018.5,79.155,0.0775,87,Cabernet Sauvignon
4,275.71,1014.9425,92.215,0.135,87,Chardonnay


In [156]:
X.describe()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),score
count,7892.0,7892.0,7892.0,7892.0,7892.0
mean,280.023354,1016.093086,75.890336,0.107172,88.535859
std,6.47867,9.364375,9.738065,0.076782,3.165499
min,263.445,907.9875,35.68,0.0,80.0
25%,276.4775,1016.24,71.7725,0.055,86.0
50%,279.685,1018.895,78.3575,0.09375,88.0
75%,281.7425,1020.2575,81.31,0.145,91.0
max,300.575,1027.6075,92.215,0.9175,99.0


In [157]:
X['variety'].value_counts()

Pinot Noir            1902
Chardonnay            1619
Cabernet Sauvignon    1418
Sauvignon Blanc        747
Riesling               744
Syrah                  588
Rosé                   441
Merlot                 433
Name: variety, dtype: int64

In [158]:
#divide varieties
Xries = X[X['variety'] == 'Riesling']
Xpinot = X[X['variety'] == 'Pinot Noir']
Xcabsav = X[X['variety'] == 'Cabernet Sauvignon']
Xchard = X[X['variety'] == 'Chardonnay']
Xmerl = X[X['variety'] == 'Merlot']
Xsavyb = X[X['variety'] == 'Sauvignon Blanc']
Xrose = X[X['variety'] == 'Rosé']
Xsyrah = X[X['variety'] == 'Syrah']

In [161]:
Xpin = Xpinot.drop(columns=['score','variety'])
Xpin.head()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm)
1,276.025,1019.8875,85.4525,0.0575
6,281.0325,1018.5,79.155,0.0775
8,276.22,1020.7575,76.705,0.14
10,276.0775,1020.8975,74.8725,0.135
12,269.755,1013.96,58.3,0.01


In [160]:
#target dfs
ypin = Xpinot['score'].ravel()
ypin

array([87, 87, 87, ..., 91, 91, 91])

In [162]:
#Split
Xpin_train, Xpin_test, ypin_train, ypin_test = train_test_split(Xpin,ypin,random_state=99)

In [163]:
#init model again
rf_model = RandomForestClassifier(n_estimators=140, random_state=99) 

In [164]:
rf_model_pin = rf_model.fit(Xpin, ypin)

In [167]:
predictions_pin = rf_model_pin.predict(Xpin_test)
predictions_pin

array([91, 92, 92, 86, 86, 90, 87, 90, 85, 90, 82, 91, 89, 90, 88, 90, 93,
       85, 92, 90, 88, 88, 93, 92, 87, 90, 91, 85, 91, 85, 91, 93, 91, 92,
       90, 92, 91, 87, 91, 91, 90, 89, 91, 91, 90, 90, 91, 92, 86, 90, 92,
       91, 92, 86, 93, 87, 91, 90, 91, 90, 86, 86, 91, 86, 86, 90, 85, 92,
       90, 91, 90, 90, 86, 91, 92, 90, 90, 86, 85, 91, 92, 93, 91, 91, 89,
       92, 92, 86, 89, 89, 93, 89, 91, 87, 91, 92, 82, 82, 91, 91, 92, 91,
       91, 90, 92, 90, 91, 93, 91, 89, 91, 89, 89, 89, 92, 90, 90, 86, 87,
       87, 86, 87, 88, 91, 90, 87, 91, 92, 91, 91, 92, 90, 91, 87, 85, 91,
       89, 93, 85, 86, 92, 91, 90, 90, 92, 91, 87, 90, 88, 89, 87, 91, 87,
       89, 85, 85, 86, 90, 91, 91, 90, 92, 87, 92, 86, 89, 90, 87, 97, 90,
       90, 93, 88, 89, 90, 91, 94, 90, 90, 90, 88, 91, 94, 86, 91, 86, 89,
       90, 87, 86, 92, 90, 90, 91, 90, 91, 92, 90, 87, 87, 90, 91, 90, 90,
       92, 90, 90, 90, 91, 90, 92, 91, 90, 90, 91, 92, 92, 90, 91, 85, 86,
       91, 91, 91, 86, 90

In [168]:
# Calculating the accuracy score.
acc_score_pin = accuracy_score(ypin_test, predictions_pin)
acc_score_pin

0.28781512605042014

In [169]:
#let's try this
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=4,
   max_depth=25,
   random_state=99)
   classifier.fit(Xpin_train, ypin_train)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           Xpin_train,
           ypin_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           Xpin_test,
           ypin_test)))

Learning rate:  0.05
Accuracy score (training): 0.316
Accuracy score (validation): 0.113
Learning rate:  0.1
Accuracy score (training): 0.316
Accuracy score (validation): 0.109
Learning rate:  0.25
Accuracy score (training): 0.314
Accuracy score (validation): 0.111
Learning rate:  0.5
Accuracy score (training): 0.313
Accuracy score (validation): 0.097
Learning rate:  0.75
Accuracy score (training): 0.308
Accuracy score (validation): 0.105
Learning rate:  1
Accuracy score (training): 0.290
Accuracy score (validation): 0.086


In [171]:
Xpinot['score'].value_counts()

92    235
91    219
87    209
90    208
88    184
89    149
93    141
86    137
94    117
85    108
84     81
95     31
82     27
83     27
96     14
81      7
97      4
80      2
98      1
99      1
Name: score, dtype: int64

In [176]:
#let's try classification instead. I'll make success >= 90
Xpinot['scorestomp'] = (Xpinot['score'] >= 91).astype(int)
Xpinot['scorestomp'].value_counts()
print(Xpinot['scorestomp'].value_counts())
print(Xpinot.columns)

0    1139
1     763
Name: scorestomp, dtype: int64
Index(['Average Temperature (Kelvin)', 'Average Air Pressure (hPa)',
       'Average Humidity (%)', 'Average Daily Precipitation (mm)', 'score',
       'variety', 'scorestomp'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [177]:
Xpin = Xpinot.drop(columns=['score','variety','scorestomp'])
ypin = Xpinot['scorestomp'].ravel()

In [178]:
Xpin_test, Xpin_train, ypin_test, ypin_train = train_test_split(Xpin, ypin)

In [186]:
#init model again
rf_model = RandomForestClassifier(n_estimators=256, random_state=99) 
rf_model_pin = rf_model.fit(Xpin, ypin)

In [187]:
pin_predicts = rf_model_pin.predict(Xpin_test)
pin_predicts

array([0, 1, 0, ..., 0, 0, 1])

In [188]:
# Calculating the accuracy score.
acc_score_pin = accuracy_score(ypin_test, pin_predicts)
acc_score_pin

0.6725105189340813

In [None]:
#that's a hell of a lot better. let's try the whole dataset

In [192]:
#Features of all
Xall = wineweather_df.copy()
Xall = Xall.drop(columns=['Winery','variety'])
Xall.head()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),score
0,269.6375,1018.15,79.24,0.0725,87
1,276.025,1019.8875,85.4525,0.0575,87
2,277.375,1020.0125,79.1975,0.235,87
3,281.0325,1018.5,79.155,0.0775,87
4,275.71,1014.9425,92.215,0.135,87


In [309]:
Xall['scorestomp'] = (Xall['score'] >= 92).astype(int)
Xall['scorestomp'].value_counts()
print(Xall['scorestomp'].value_counts())
print(Xall.columns)

0    6306
1    1586
Name: scorestomp, dtype: int64
Index(['Average Temperature (Kelvin)', 'Average Air Pressure (hPa)',
       'Average Humidity (%)', 'Average Daily Precipitation (mm)', 'score',
       'scorestomp'],
      dtype='object')


In [310]:
Xtot = Xall.drop(columns=['score','scorestomp'])
ytot = Xall['scorestomp'].ravel()

In [311]:
Xtot_train, Xtot_test, ytot_train, ytot_test = train_test_split(Xtot,ytot, random_state=99)

In [349]:
Xtot_train['real_success'] = ytot_train
Xtot_train.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),real_success
count,5919.0,5919.0,5919.0,5919.0,5919.0
mean,279.942165,1016.089841,76.021613,0.107276,0.203075
std,6.400443,9.450552,9.704962,0.076796,0.402322
min,263.445,907.9875,35.68,0.0,0.0
25%,276.4575,1016.2875,71.7725,0.055,0.0
50%,279.46,1018.9125,78.3575,0.095,0.0
75%,281.7425,1020.2575,81.31,0.145,0.0
max,300.575,1027.6075,92.215,0.9175,1.0


In [350]:
Xtot_train['real_success'].value_counts()

0    4717
1    1202
Name: real_success, dtype: int64

In [351]:
Xtot_test['real_success'] = ytot_test
Xtot_test.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),predicted_success,real_success
count,1973.0,1973.0,1973.0,1973.0,1973.0,1973.0
mean,280.26692,1016.102822,75.496508,0.106859,0.010644,0.194627
std,6.703641,9.103296,9.828663,0.07676,0.102644,0.396014
min,263.445,942.7025,35.68,0.0,0.0,0.0
25%,276.745,1016.2225,71.7725,0.055,0.0,0.0
50%,280.0375,1018.895,77.8,0.0925,0.0,0.0
75%,281.9275,1020.2575,80.53,0.145,0.0,0.0
max,298.5275,1027.6075,92.215,0.9175,1.0,1.0


In [352]:
Xtot_test['real_success'].value_counts()

0    1589
1     384
Name: real_success, dtype: int64

In [312]:
#init model again
rf_model = RandomForestClassifier(n_estimators=256, random_state=99) 
rf_model_tot = rf_model.fit(Xtot, ytot)

In [313]:
tot_predicts = rf_model_tot.predict(Xtot_test)
tot_predicts

array([0, 0, 0, ..., 0, 0, 0])

In [314]:
# Calculating the accuracy score.
acc_score_tot = accuracy_score(ytot_test, tot_predicts)
acc_score_tot

0.8109477952356817

In [315]:
#.64267 accscore when success >=90 n_est=256
#.71312 accscore when success >=91
#.81246 accscore when success >=92
#.89102 accscore when success >=93
#.93765 accscore when success >=94

In [345]:
Xtot_test['predicted_success'] = tot_predicts
Xtot_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),predicted_success
4477,282.145,1019.67,71.3375,0.0475,0
4642,276.885,1020.4425,71.7725,0.19,0
5709,281.0325,1018.5,79.155,0.0775,0
1416,281.0325,1018.5,79.155,0.0775,0
5836,295.3925,1012.035,53.5275,0.0225,0


In [346]:
Xtot_test['predicted_success'].value_counts()

0    1952
1      21
Name: predicted_success, dtype: int64

In [348]:
Xtot_test.describe()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm),predicted_success
count,1973.0,1973.0,1973.0,1973.0,1973.0
mean,280.26692,1016.102822,75.496508,0.106859,0.010644
std,6.703641,9.103296,9.828663,0.07676,0.102644
min,263.445,942.7025,35.68,0.0,0.0
25%,276.745,1016.2225,71.7725,0.055,0.0
50%,280.0375,1018.895,77.8,0.0925,0.0
75%,281.9275,1020.2575,80.53,0.145,0.0
max,298.5275,1027.6075,92.215,0.9175,1.0


In [317]:
#let's try this for all data by classification
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=200,
   learning_rate=learning_rate,
   max_features=4,
   max_depth=25,
   random_state=99)
   classifier.fit(Xtot_train, ytot_train)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           Xtot_train,
           ytot_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           Xtot_test,
           ytot_test)))

Learning rate:  0.05
Accuracy score (training): 0.806
Accuracy score (validation): 0.798
Learning rate:  0.1
Accuracy score (training): 0.806
Accuracy score (validation): 0.800
Learning rate:  0.25
Accuracy score (training): 0.806
Accuracy score (validation): 0.798
Learning rate:  0.5
Accuracy score (training): 0.806
Accuracy score (validation): 0.800
Learning rate:  0.75
Accuracy score (training): 0.806
Accuracy score (validation): 0.800
Learning rate:  1
Accuracy score (training): 0.806
Accuracy score (validation): 0.801


In [None]:
#ok, that test is getting us nowhere. let's boost the all data and see what kind of model accuracy we get

In [319]:
from imblearn.over_sampling import SMOTE
Xtot_resampled, ytot_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   Xtot_train, ytot_train)



In [320]:
#init model again
rf_model = RandomForestClassifier(n_estimators=256, random_state=99) 
rf_model_totrs = rf_model.fit(Xtot_resampled, ytot_resampled)

In [321]:
totrs_predicts = rf_model_totrs.predict(Xtot_test)
totrs_predicts

array([1, 1, 0, ..., 0, 0, 0])

In [322]:
# Calculating the accuracy score.
acc_score_totrs = accuracy_score(ytot_test, tot_predicts)
acc_score_totrs

0.8109477952356817

In [323]:
#cool. let's try logreg

In [324]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=99)
model.fit(Xtot_resampled, ytot_resampled)

y_pred = model.predict(Xtot_test)

lracc_score_totrs = accuracy_score(ytot_test, y_pred)
lracc_score_totrs

0.5509376583882413

In [325]:
#ok so logreg is out. let's scale and go back to RandomForest, then try ROS

In [326]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
Xtot_scaler = scaler.fit(Xtot_train)

# Scaling the data.
Xtot_train_scaled = Xtot_scaler.transform(X_train)
Xtot_test_scaled = Xtot_scaler.transform(X_test)

In [327]:
#init model again
rf_model = RandomForestClassifier(n_estimators=256, random_state=99) 
scalrf_model_tot = rf_model.fit(Xtot_train_scaled, ytot_train)

In [328]:
scaltot_predicts = scalrf_model_tot.predict(Xtot_test_scaled)
scaltot_predicts

array([0, 0, 0, ..., 0, 0, 0])

In [329]:
# Calculating the confusion matrix.
cm = confusion_matrix(ytot_test, scaltot_predicts)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1573,16
Actual 1,381,3


In [331]:
# Calculating the accuracy score.
acc_score_scaltot = accuracy_score(ytot_test, scaltot_predicts)
acc_score_scaltot

0.7987835783071465

In [334]:
#scaling dropped acc but not by much. what happens if we feed this model a random location
Xtot.describe()

Unnamed: 0,Average Temperature (Kelvin),Average Air Pressure (hPa),Average Humidity (%),Average Daily Precipitation (mm)
count,7892.0,7892.0,7892.0,7892.0
mean,280.023354,1016.093086,75.890336,0.107172
std,6.47867,9.364375,9.738065,0.076782
min,263.445,907.9875,35.68,0.0
25%,276.4775,1016.24,71.7725,0.055
50%,279.685,1018.895,78.3575,0.09375
75%,281.7425,1020.2575,81.31,0.145
max,300.575,1027.6075,92.215,0.9175


In [342]:
#divide varieties
Xries = X[X['variety'] == 'Riesling']
Xpinot = X[X['variety'] == 'Pinot Noir']
Xcabsav = X[X['variety'] == 'Cabernet Sauvignon']
Xchard = X[X['variety'] == 'Chardonnay']
Xmerl = X[X['variety'] == 'Merlot']