# Avocado Prices in the United States (2015-18)

Analysing avocado prices by region and type and how they changed over the period, then training a regression model to forecast future avocado prices.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Report completion once, after the walk, rather than once per listed file
# (and so that it is still reported when the directory holds no files).
print("Import complete")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the data

In [None]:
# Read the avocado CSV into a DataFrame, parsing the 'Date' column as datetimes.
avocado_filepath = '../input/avocado-prices/avocado.csv'
df = pd.read_csv(
    filepath_or_buffer=avocado_filepath,
    parse_dates=['Date'],
)

In [None]:
# Preview the first five rows (the default of DataFrame.head).
df.head(n=5)

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

# Preprocessing the data

In [None]:
# Give the numeric-code columns descriptive avocado-size names.
size_column_names = {
    '4046': 'Small/Med Hass',
    '4225': 'Large Hass',
    '4770': 'Extra Large Hass',
}
df = df.rename(columns=size_column_names)

In [None]:
# Show the column labels after the rename.
df.columns

In [None]:
# Discard the 'Unnamed: 0' column.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Per column: is there at least one missing value? (isna is the same check as isnull.)
df.isna().any()

In [None]:
# Distinct values of the 'type' column, then how many rows carry each value.
avocado_types = df['type']
print(avocado_types.unique(), "\n")
print(avocado_types.value_counts())

In [None]:
# Distinct region labels present in the data.
pd.unique(df['region'])

# Exploratory Data Analysis (EDA)

In [None]:
import plotly.express as px

# Distribution of average price, coloured by avocado type, with a rug plot in the margin.
fig = px.histogram(
    df,
    x='AveragePrice',
    color='type',
    marginal='rug',
    hover_data=df.columns,
)
fig.show()

In [None]:
# Horizontal bars of average price per region, coloured by avocado type.
fig = px.bar(
    df,
    x='AveragePrice',
    y='region',
    color='type',
    hover_data=df.columns,
    height=1000,
)
fig.show()

In [None]:
# Order the rows chronologically, then separate them by avocado type.
df_date_sorted = df.sort_values('Date')

df3 = df_date_sorted[df_date_sorted['type'] == 'conventional']
df4 = df_date_sorted[df_date_sorted['type'] == 'organic']

In [None]:
# Totals per date for each avocado type.
# numeric_only=True restricts the aggregation to numeric columns: pandas >= 2.0
# no longer does this by default and would instead concatenate the text columns
# ('type', 'region') within every date group.
df3_alt = df3.groupby('Date').sum(numeric_only=True)
df4_alt = df4.groupby('Date').sum(numeric_only=True)

In [None]:
# 'Total Bags' over time for conventional avocados (index of df3_alt is the date).
fig = px.line(
    df3_alt,
    x=df3_alt.index,
    y='Total Bags',
    title='Sales of conventional avocados across the United States (2015-2018)',
)
fig.show()

In [None]:
# 'Total Bags' over time for organic avocados (index of df4_alt is the date).
fig = px.line(
    df4_alt,
    x=df4_alt.index,
    y='Total Bags',
    title='Sales of organic avocados across the United States (2015-2018)',
)
fig.show()

In [None]:
# Rows for three individual regions, taken from the date-sorted frame.
df_sf = df_date_sorted.loc[df_date_sorted.region == 'SanFrancisco']
df_htsprng = df_date_sorted.loc[df_date_sorted.region == 'HartfordSpringfield']
df_ny = df_date_sorted.loc[df_date_sorted.region == 'NewYork']

# Totals per date within each region.
# numeric_only=True restricts the aggregation to numeric columns: pandas >= 2.0
# no longer does this by default and would instead concatenate the text columns
# ('type', 'region') within every date group.
df_sf = df_sf.groupby('Date').sum(numeric_only=True)
df_htsprng = df_htsprng.groupby('Date').sum(numeric_only=True)
df_ny = df_ny.groupby('Date').sum(numeric_only=True)

In [None]:
# 'Total Bags' over time for the SanFrancisco region.
fig = px.line(
    df_sf,
    x=df_sf.index,
    y='Total Bags',
    title='Total sales of avocados in San Francisco, CA, US (2015-2018)',
)
fig.show()

In [None]:
# 'Total Bags' over time for the HartfordSpringfield region.
fig = px.line(
    df_htsprng,
    x=df_htsprng.index,
    y='Total Bags',
    title='Total sales of avocados in the Hartford-Springfield area, CT/MA, US (2015-2018)',
)
fig.show()

In [None]:
# 'Total Bags' over time for the NewYork region.
fig = px.line(
    df_ny,
    x=df_ny.index,
    y='Total Bags',
    title='Total sales of avocados in New York, US (2015-2018)',
)
fig.show()

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# The frame also holds text columns ('type', 'region'); pandas >= 2.0 no longer
# drops those silently in DataFrame.corr() and raises instead, so select the
# numeric columns explicitly (this form works on older pandas as well).
fig = px.imshow(df.select_dtypes(include='number').corr())
fig.show()

# Feature selection and engineering

In [None]:
# Work on a copy so the original frame is left untouched.
df_copy = df.copy()

# Target (df2) is the average price; the features (df1) are every remaining
# column except the date.
df2 = df_copy.pop('AveragePrice')
df1 = df_copy.drop(columns=['Date'])

# Replace each text column with integer codes.
for colname in df1.select_dtypes('object').columns:
    df1[colname] = pd.factorize(df1[colname])[0]

# Boolean mask over the feature columns: True where the dtype is integer.
discrete_features = df1.dtypes == int
print(discrete_features)

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    """Return the mutual-information score of every feature in X against y.

    Parameters
    ----------
    X : feature table; when it has a ``columns`` attribute those labels
        become the index of the result.
    y : target values.
    discrete_features : forwarded unchanged to ``mutual_info_regression``.

    Returns
    -------
    pandas.Series named 'MI Scores', sorted from highest to lowest score.
    """
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    # Label the scores with the columns of the X that was passed in, not with
    # the notebook-level df1, so the function is correct for any feature table.
    mi_scores = pd.Series(mi_scores, name='MI Scores', index=getattr(X, 'columns', None))
    return mi_scores.sort_values(ascending=False)

# Score the prepared features (df1) against the price target (df2) and display the ranking.
mi_scores = make_mi_scores(X=df1, y=df2, discrete_features=discrete_features)
mi_scores

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode 'type' as integer labels.
label_encoder = LabelEncoder()
df1['type'] = label_encoder.fit_transform(df1['type'])

# This encoder is not used by the visible code (pd.get_dummies below does the
# one-hot encoding), but constructing it must not break the cell:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One indicator column per distinct 'region' value.
df_ohe = pd.get_dummies(data=df1, columns=['region'])


In [None]:
# Display the feature table after one-hot encoding 'region'.
df_ohe

In [None]:
# Display the target series (the 'AveragePrice' column popped earlier).
df2

# Training the model

In [None]:
# NOTE(review): feature_names is not referenced by any code visible in this
# notebook; X is built by dropping columns_to_drop, so every other column of
# df_ohe stays in X whether or not it is listed here.
feature_names = [
    'Total Volume',
    'Total Bags',
    'type',
    'region',
]

# Per-size and per-bag-size breakdown columns excluded from the model inputs.
columns_to_drop = [
    'Small/Med Hass',
    'Large Hass',
    'Extra Large Hass',
    'Small Bags',
    'Large Bags',
    'XLarge Bags',
]

X = df_ohe.drop(columns=columns_to_drop)
y = df2

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation set using train_test_split's default proportions;
# the fixed seed makes the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree limited to ``max_leaf_nodes`` leaves and score it.

    The tree is trained on (train_X, train_y) with random_state=0 and the
    mean absolute error of its predictions on (val_X, val_y) is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Compare validation error across several tree sizes.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    print("Max leaf nodes: {}  \t\t Mean Absolute Error:  {}".format(max_leaf_nodes, my_mae))

In [None]:
# Decision tree capped at 500 leaf nodes (one of the sizes tried in the sweep).
# NOTE(review): random_state is 1 here but 0 inside get_mae, so this tree is
# not guaranteed to match the one scored in the sweep -- confirm the intent.
avocado_model = DecisionTreeRegressor(max_leaf_nodes=500, random_state=1)

# Fit on the training split only; val_X / val_y stay held out.
avocado_model.fit(train_X, train_y)

In [None]:
# Show the first five validation rows next to the model's predictions for
# exactly those rows (previously the whole validation set was predicted, which
# did not match the "first five entries" heading).
print("Prediction for the first five entries:")
display(val_X[:5])
print("The predictions are:")
print(avocado_model.predict(val_X[:5]))

In [None]:
# NOTE: this is not an out-of-sample score. X holds every row, including the
# train_X rows avocado_model was fitted on, so the error below mixes training
# rows with the held-out validation rows.
predicted_avocado_prices = avocado_model.predict(X)
mean_absolute_error(y, predicted_avocado_prices)

In [None]:
# Held-out evaluation: val_X / val_y were split off before avocado_model was
# fitted, so these are the out-of-sample metrics for the decision tree.
val_avocado_prices = avocado_model.predict(val_X)
tree_mae = mean_absolute_error(val_y, val_avocado_prices)
tree_r2 = r2_score(val_y, val_avocado_prices)
print("DecisionTreeRegressor")
print("Mean Absolute Error: {}".format(tree_mae))
print("r2_score: {}".format(tree_r2))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings, fitted on the training split and
# scored on the held-out validation split.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
avocado_preds = forest_model.predict(val_X)
forest_r2 = r2_score(val_y, avocado_preds)
forest_mae = mean_absolute_error(val_y, avocado_preds)
print("RandomForestRegressor")
print("r2_score: {}".format(forest_r2))
print("Mean Absolute Error: {}".format(forest_mae))

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with default settings, fitted on the training split
# and scored on the held-out validation split.
xgb_model = XGBRegressor()
xgb_model.fit(train_X, train_y)

xgb_preds = xgb_model.predict(val_X)

print("XGBoost")
print("r2_score: {}".format(r2_score(val_y, xgb_preds)))
# Arguments given as (y_true, y_pred) like the other cells; MAE is symmetric,
# so the printed value is the same either way.
print("Mean Absolute Error: {}".format(mean_absolute_error(val_y, xgb_preds)))


In [None]:
# 5-fold cross-validation of the random forest over the full X / y.
# The scorer returns negated MAE, so flip the sign to get positive errors.
# NOTE(review): an integer cv presumably yields unshuffled folds in row order --
# confirm the CSV ordering does not bias the folds.
neg_mae_per_fold = cross_val_score(
    forest_model,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -neg_mae_per_fold

print('MAE scores:', scores)

In [None]:
# Average the per-fold MAE values into a single figure.
print("Average MAE score (across experiments):")
print(np.mean(scores))