In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor


In [2]:
df=pd.read_csv('../datasets/final_merged_data.csv')

## Testing Seasonal Variables

In [6]:
features=['season_var','season_tmax','season_tavg','season_precip','season_tmin']
X=df[features]
y=df['yield']

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [8]:
ss=StandardScaler()
ss.fit(X_train)
X_train_sc=ss.transform(X_train)
X_test_sc=ss.transform(X_test)

#### Linear Regression

In [9]:
# Non scaled data
lr=LinearRegression()
lr.fit(X_train,y_train)

cv_score=cross_val_score(lr,X_train,y_train).mean()
train_score=lr.score(X_train,y_train)
test_score=lr.score(X_test,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.5015345612872876.
Linear Regression Train Score: 0.5548197687267915.
Linear Regression Test Score: 0.6069095921153396.


In [10]:
# Scaled data
lr=LinearRegression()
lr.fit(X_train_sc,y_train)

cv_score=cross_val_score(lr,X_train_sc,y_train).mean()
train_score=lr.score(X_train_sc,y_train)
test_score=lr.score(X_test_sc,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.5015345612872878.
Linear Regression Train Score: 0.5548197687267917.
Linear Regression Test Score: 0.60690959211534.


#### KNN

In [11]:
knn=KNeighborsRegressor()
knn.fit(X_train_sc,y_train)

cv_score=cross_val_score(knn,X_train_sc,y_train).mean()
train_score=knn.score(X_train_sc,y_train)
test_score=knn.score(X_test_sc,y_test)

print(f'KNN Cross Val Score: {cv_score}.')
print(f'KNN Train Score: {train_score}.')
print(f'KNN Test Score: {test_score}.')

KNN Cross Val Score: 0.6338470907884661.
KNN Train Score: 0.7962225123717905.
KNN Test Score: 0.7638996310477819.


#### Random Forest

In [12]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)

cv_score=cross_val_score(rf,X_train,y_train).mean()
train_score=rf.score(X_train,y_train)
test_score=rf.score(X_test,y_test)

print(f'Random Forest Cross Val Score: {cv_score}.')
print(f'Random Forest Train Score: {train_score}.')
print(f'Random Forest Test Score: {test_score}.')

Random Forest Cross Val Score: 0.6212694850555656.
Random Forest Train Score: 0.9514934831407964.
Random Forest Test Score: 0.7271234138127234.


#### Adaboost

In [13]:
# Non scaled data
ada=AdaBoostRegressor()
ada.fit(X_train,y_train)

cv_score=cross_val_score(ada,X_train,y_train).mean()
train_score=ada.score(X_train,y_train)
test_score=ada.score(X_test,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.6519881983709063.
Adaboost Train Score: 0.8993498590099491.
Adaboost Test Score: 0.7921838285043102.


In [14]:
# Scaled data
ada=AdaBoostRegressor()
ada.fit(X_train_sc,y_train)

cv_score=cross_val_score(ada,X_train_sc,y_train).mean()
train_score=ada.score(X_train_sc,y_train)
test_score=ada.score(X_test_sc,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.6884446030871729.
Adaboost Train Score: 0.9074217947260853.
Adaboost Test Score: 0.7898113642090064.


## Annual Variables

In [15]:
features=['annual_var','annual_tmax','season_precip','annual_tavg','annual_tmin']
X=df[features]
y=df['yield']

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [17]:
ss=StandardScaler()
ss.fit(X_train)
X_train_sc=ss.transform(X_train)
X_test_sc=ss.transform(X_test)

#### Linear Regression

In [18]:
# Non scaled data
lr=LinearRegression()
lr.fit(X_train,y_train)

cv_score=cross_val_score(lr,X_train,y_train).mean()
train_score=lr.score(X_train,y_train)
test_score=lr.score(X_test,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.5603893312744276.
Linear Regression Train Score: 0.6045624743537759.
Linear Regression Test Score: 0.5441176194359933.


In [19]:
# Scaled data
lr=LinearRegression()
lr.fit(X_train_sc,y_train)

cv_score=cross_val_score(lr,X_train_sc,y_train).mean()
train_score=lr.score(X_train_sc,y_train)
test_score=lr.score(X_test_sc,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.5603893312744274.
Linear Regression Train Score: 0.6045624743537759.
Linear Regression Test Score: 0.5441176194359933.


#### KNN

In [20]:
knn=KNeighborsRegressor()
knn.fit(X_train_sc,y_train)

cv_score=cross_val_score(knn,X_train_sc,y_train).mean()
train_score=knn.score(X_train_sc,y_train)
test_score=knn.score(X_test_sc,y_test)

print(f'KNN Cross Val Score: {cv_score}.')
print(f'KNN Train Score: {train_score}.')
print(f'KNN Test Score: {test_score}.')

KNN Cross Val Score: 0.7954383866258506.
KNN Train Score: 0.8812308113834669.
KNN Test Score: 0.7433582599379942.


#### Random Forest

In [21]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)

cv_score=cross_val_score(rf,X_train,y_train).mean()
train_score=rf.score(X_train,y_train)
test_score=rf.score(X_test,y_test)

print(f'Random Forest Cross Val Score: {cv_score}.')
print(f'Random Forest Train Score: {train_score}.')
print(f'Random Forest Test Score: {test_score}.')

Random Forest Cross Val Score: 0.718523466006648.
Random Forest Train Score: 0.963314186380557.
Random Forest Test Score: 0.7162077891843764.


#### Adaboost

In [22]:
# Non scaled data
ada=AdaBoostRegressor()
ada.fit(X_train,y_train)

cv_score=cross_val_score(ada,X_train,y_train).mean()
train_score=ada.score(X_train,y_train)
test_score=ada.score(X_test,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.7404820198037253.
Adaboost Train Score: 0.9065554119762514.
Adaboost Test Score: 0.6432409222058195.


In [23]:
# Scaled data
ada=AdaBoostRegressor()
ada.fit(X_train_sc,y_train)

cv_score=cross_val_score(ada,X_train_sc,y_train).mean()
train_score=ada.score(X_train_sc,y_train)
test_score=ada.score(X_test_sc,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.7392700720940517.
Adaboost Train Score: 0.9117720967585925.
Adaboost Test Score: 0.6892117687225972.


## All Variables

In [24]:
features=['annual_var','annual_tmax','season_precip','annual_tavg','annual_tmin','season_var','season_tmax','season_tavg','season_precip','season_tmin']
X=df[features]
y=df['yield']

In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [26]:
ss=StandardScaler()
ss.fit(X_train)
X_train_sc=ss.transform(X_train)
X_test_sc=ss.transform(X_test)

#### Linear Regression

In [27]:
# Non scaled data
lr=LinearRegression()
lr.fit(X_train,y_train)

cv_score=cross_val_score(lr,X_train,y_train).mean()
train_score=lr.score(X_train,y_train)
test_score=lr.score(X_test,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.702692980822067.
Linear Regression Train Score: 0.7417912620548012.
Linear Regression Test Score: 0.7546505889284679.


In [28]:
# Scaled data
lr=LinearRegression()
lr.fit(X_train_sc,y_train)

cv_score=cross_val_score(lr,X_train_sc,y_train).mean()
train_score=lr.score(X_train_sc,y_train)
test_score=lr.score(X_test_sc,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.7026929808220682.
Linear Regression Train Score: 0.7417912620548013.
Linear Regression Test Score: 0.7546505889284676.


#### KNN

In [29]:
knn=KNeighborsRegressor()
knn.fit(X_train_sc,y_train)

cv_score=cross_val_score(knn,X_train_sc,y_train).mean()
train_score=knn.score(X_train_sc,y_train)
test_score=knn.score(X_test_sc,y_test)

print(f'KNN Cross Val Score: {cv_score}.')
print(f'KNN Train Score: {train_score}.')
print(f'KNN Test Score: {test_score}.')

KNN Cross Val Score: 0.8072432619450793.
KNN Train Score: 0.9037245260145841.
KNN Test Score: 0.7955130048055834.


#### Random Forest

In [30]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)

cv_score=cross_val_score(rf,X_train,y_train).mean()
train_score=rf.score(X_train,y_train)
test_score=rf.score(X_test,y_test)

print(f'Random Forest Cross Val Score: {cv_score}.')
print(f'Random Forest Train Score: {train_score}.')
print(f'Random Forest Test Score: {test_score}.')

Random Forest Cross Val Score: 0.7498103043854301.
Random Forest Train Score: 0.9673754983182293.
Random Forest Test Score: 0.7019499906420636.


#### Adaboost

In [31]:
# Non scaled data
ada=AdaBoostRegressor()
ada.fit(X_train,y_train)

cv_score=cross_val_score(ada,X_train,y_train).mean()
train_score=ada.score(X_train,y_train)
test_score=ada.score(X_test,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.7605722046153891.
Adaboost Train Score: 0.9205450915487601.
Adaboost Test Score: 0.669188756752189.


In [32]:
# Scaled data
ada=AdaBoostRegressor()
ada.fit(X_train_sc,y_train)

cv_score=cross_val_score(ada,X_train_sc,y_train).mean()
train_score=ada.score(X_train_sc,y_train)
test_score=ada.score(X_test_sc,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.761244504269014.
Adaboost Train Score: 0.9166505673386353.
Adaboost Test Score: 0.6713101983245834.


## Seasonal Variables w/ County Dummies

In [33]:
df_dummies=pd.get_dummies(df, columns=['county'], drop_first=True)

In [34]:
df_dummies.head()

Unnamed: 0,year,county code,harvested acres,yield,production,price p/u,value,annual_precip,annual_tavg,annual_tmin,...,season_var,county_Madera,county_Monterey,county_Napa,county_Sacramento,county_San Joaquin,county_San Luis Obispo,county_Santa Barbara,county_Sonoma,county_Yolo
0,2000,19,75139.0,11.12,835310.0,177.0,147550000,20.72,55.708333,31.4,...,17.7,0,0,0,0,0,0,0,0,0
1,2000,39,56410.0,9.92,559563.0,186.0,103830000,25.89,56.516667,31.8,...,18.2,1,0,0,0,0,0,0,0,0
2,2000,55,32365.0,4.23,136962.0,2464.0,337469000,31.24,59.266667,38.1,...,12.2,0,0,1,0,0,0,0,0,0
3,2000,67,25024.0,7.1,177670.0,536.0,95231000,23.98,61.741667,38.8,...,14.8,0,0,0,1,0,0,0,0,0
4,2000,77,80711.0,7.74,624800.0,473.0,295793000,17.57,61.633333,38.1,...,13.8,0,0,0,0,1,0,0,0,0


In [35]:
correlation=df_dummies.drop(columns=['year','county code','harvested acres','production','price p/u','value']).corr()


In [48]:
season_cols=[col for col in df_dummies.columns if col.startswith('season')]
count_cols=[col for col in df_dummies.columns if col.startswith('county_')]

In [51]:
season_cols.remove('season_var')

In [53]:
count_cols

['county_Madera',
 'county_Monterey',
 'county_Napa',
 'county_Sacramento',
 'county_San Joaquin',
 'county_San Luis Obispo',
 'county_Santa Barbara',
 'county_Sonoma',
 'county_Yolo']

In [54]:
features=season_cols+count_cols
features
X=df_dummies[features]
y=df_dummies['yield']

In [55]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [56]:
ss=StandardScaler()
ss.fit(X_train)
X_train_sc=ss.transform(X_train)
X_test_sc=ss.transform(X_test)

#### Linear Regression

In [57]:
# Non scaled data
lr=LinearRegression()
lr.fit(X_train,y_train)

cv_score=cross_val_score(lr,X_train,y_train).mean()
train_score=lr.score(X_train,y_train)
test_score=lr.score(X_test,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.8479786035277804.
Linear Regression Train Score: 0.887608444520364.
Linear Regression Test Score: 0.8879845749881056.


In [58]:
# Scaled data
lr=LinearRegression()
lr.fit(X_train_sc,y_train)

cv_score=cross_val_score(lr,X_train_sc,y_train).mean()
train_score=lr.score(X_train_sc,y_train)
test_score=lr.score(X_test_sc,y_test)

print(f'Linear Regression Cross Val Score: {cv_score}.')
print(f'Linear Regression Train Score: {train_score}.')
print(f'Linear Regression Test Score: {test_score}.')

Linear Regression Cross Val Score: 0.8479786035277795.
Linear Regression Train Score: 0.8876084445203641.
Linear Regression Test Score: 0.8879845749881049.


In [47]:
lr.coef_

array([-0.14463257, -0.59272421,  0.30448915,  0.3878548 ,  0.13014379,
       -0.33577696, -2.0323009 , -2.37248629, -1.36386094, -1.40966093,
       -2.25703011, -2.18220478, -1.71237081, -1.29446122])

#### KNN

In [43]:
knn=KNeighborsRegressor()
knn.fit(X_train_sc,y_train)

cv_score=cross_val_score(knn,X_train_sc,y_train).mean()
train_score=knn.score(X_train_sc,y_train)
test_score=knn.score(X_test_sc,y_test)

print(f'KNN Cross Val Score: {cv_score}.')
print(f'KNN Train Score: {train_score}.')
print(f'KNN Test Score: {test_score}.')

KNN Cross Val Score: 0.8422098950091765.
KNN Train Score: 0.9002674187342772.
KNN Test Score: 0.8629037027630089.


#### Random Forest

In [44]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)

cv_score=cross_val_score(rf,X_train,y_train).mean()
train_score=rf.score(X_train,y_train)
test_score=rf.score(X_test,y_test)

print(f'Random Forest Cross Val Score: {cv_score}.')
print(f'Random Forest Train Score: {train_score}.')
print(f'Random Forest Test Score: {test_score}.')

Random Forest Cross Val Score: 0.6448964753387656.
Random Forest Train Score: 0.954739103436734.
Random Forest Test Score: 0.7162105509399455.


#### Adaboost

In [45]:
# Non scaled data
ada=AdaBoostRegressor()
ada.fit(X_train,y_train)

cv_score=cross_val_score(ada,X_train,y_train).mean()
train_score=ada.score(X_train,y_train)
test_score=ada.score(X_test,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.7079059951319016.
Adaboost Train Score: 0.8990802654462605.
Adaboost Test Score: 0.7837991192573996.


In [46]:
# Scaled data
ada=AdaBoostRegressor()
ada.fit(X_train_sc,y_train)

cv_score=cross_val_score(ada,X_train_sc,y_train).mean()
train_score=ada.score(X_train_sc,y_train)
test_score=ada.score(X_test_sc,y_test)

print(f'Adaboost Cross Val Score: {cv_score}.')
print(f'Adaboost Train Score: {train_score}.')
print(f'Adaboost Test Score: {test_score}.')

Adaboost Cross Val Score: 0.6739652364249307.
Adaboost Train Score: 0.9042270523041877.
Adaboost Test Score: 0.7797384282388009.
