## Common imports
Import the common Python libraries used throughout the notebook.

In [None]:
import numpy as np
import pandas as pd

## Load data

We load the data for avocado pricing.

In [None]:
# Read the avocado price dataset from the notebook's input directory,
# then display the first rows as the cell output for a quick sanity check.
avocado_csv = "../input/avocado.csv"
data = pd.read_csv(avocado_csv)
data.head()

## Selecting feature and target columns

In [None]:
# Make sure there aren't any missing values; if there are, we need to use an
# imputer before modelling. The failure message lists the offending columns
# so the problem can be located without re-inspecting the frame by hand.
columns_with_nulls = data.columns[data.isnull().any()].tolist()
assert not columns_with_nulls, f"columns with missing values: {columns_with_nulls}"

We will first create a model using the numerical data. Later we will add the categorical data to a different model.

In [None]:
# Derive numeric calendar features from the 'Date' strings.
# Parsing with pandas (instead of splitting each string by hand) is vectorized,
# rejects malformed dates, and keeps the new columns aligned with `data`'s
# index whatever that index is.
# NOTE(review): assumes 'Date' is ISO 'YYYY-MM-DD', consistent with the previous
# split-on-'-' parsing that read month and day from the last two fields.
parsed_dates = pd.to_datetime(data['Date'], format='%Y-%m-%d')

data['month'] = parsed_dates.dt.month
data['day'] = parsed_dates.dt.day

# Numeric predictors only; categorical columns are handled in a later model.
features_num = ['Total Volume', '4046', '4225', '4770', 'Total Bags',
                'Small Bags', 'Large Bags', 'XLarge Bags', 'year',
                'month', 'day']
target = ['AveragePrice']

# X is a 2-D feature matrix; y is flattened to 1-D because `target` is a
# one-element list and selecting with it yields a single-column frame.
X = data[features_num].values
y = data[target].values.ravel()


## Time to split train and test data
`Xtrain` will hold 80% of the data and `Xtest` the remaining 20%.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Train a Random Forest Regressor model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Scale the features, then fit a random forest with a fixed seed.
clf = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=1, n_estimators=150))

# No `scoring` is passed, so cross_val_score falls back to the estimator's own
# score method, which for a regressor is R^2 (not classification accuracy).
# Format with a fixed precision instead of round(...)*100, which can print
# floating-point noise such as 89.60000000000001.
scores = cross_val_score(clf, Xtrain, ytrain, cv=3, n_jobs=-1)
print(f"Mean cross-validated R^2: {np.mean(scores):.3f}")

# Refit on the full training split and evaluate on the held-out test split.
clf.fit(Xtrain, ytrain)

print(f"Test MSE: {mean_squared_error(y_true=ytest, y_pred=clf.predict(Xtest)):.4f}")

Our RandomForestRegressor has an MSE of 0.0214, and this is without using the categorical part of the data.

## Train a model with XGBoost

In [None]:
from xgboost import XGBRegressor

# `early_stopping_rounds` was dropped: early stopping needs a validation set
# (`eval_set`) at fit time, and neither cross_val_score nor the plain
# `clf.fit` below provides one. Recent xgboost releases raise on fit in that
# situation, while older ones ignore the argument, so it never had an effect.
clf = make_pipeline(StandardScaler(), XGBRegressor(n_estimators=1000, learning_rate=0.2))

# The default scorer for a regressor is R^2, not accuracy; print it with a
# fixed precision to avoid floating-point noise from round(...)*100.
scores = cross_val_score(clf, Xtrain, ytrain, cv=3, n_jobs=-1)
print(f"Mean cross-validated R^2: {np.mean(scores):.3f}")

# Refit on the full training split and evaluate on the held-out test split.
clf.fit(Xtrain, ytrain)

print(f"Test MSE: {mean_squared_error(y_true=ytest, y_pred=clf.predict(Xtest)):.4f}")

This XGBoost model performs worse than the random forest: its MSE is 0.0286.

## We will now use the categorical data
Let's see if we get better performance

In [None]:
# One-hot encode the remaining string columns. `dtype=float` keeps every
# column numeric: on pandas versions where dummies default to bool, mixing
# them with numeric columns would make `.values` return an object array.
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
data_with_categorical = pd.get_dummies(data.drop(columns=['Unnamed: 0', 'Date']), dtype=float)
X = data_with_categorical.drop(columns='AveragePrice').values
y = data_with_categorical['AveragePrice'].values

# Same split parameters as before so the results stay comparable.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

# NOTE(review): this forest uses n_estimators=100 while the numeric-only forest
# used 150 — confirm whether the difference is intentional before comparing.
clf = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=1, n_estimators=100))

# The default scorer for a regressor is R^2, not accuracy; print it with a
# fixed precision to avoid floating-point noise from round(...)*100.
scores = cross_val_score(clf, Xtrain, ytrain, cv=3, n_jobs=-1)
print(f"Mean cross-validated R^2: {np.mean(scores):.3f}")

# Refit on the full training split and evaluate on the held-out test split.
clf.fit(Xtrain, ytrain)

print(f"Test MSE: {mean_squared_error(y_true=ytest, y_pred=clf.predict(Xtest)):.4f}")

Our new random forest regressor has an MSE of 0.0153, a 29% drop from the 0.0214 of the numeric-only model.

## What happens with XGBoost now?

In [None]:
# `early_stopping_rounds` was dropped: early stopping needs a validation set
# (`eval_set`) at fit time, and neither cross_val_score nor the plain
# `clf.fit` below provides one. Recent xgboost releases raise on fit in that
# situation, while older ones ignore the argument, so it never had an effect.
clf = make_pipeline(StandardScaler(), XGBRegressor(n_estimators=1000, learning_rate=0.5))

# The default scorer for a regressor is R^2, not accuracy; print it with a
# fixed precision to avoid floating-point noise from round(...)*100.
scores = cross_val_score(clf, Xtrain, ytrain, cv=3, n_jobs=-1)
print(f"Mean cross-validated R^2: {np.mean(scores):.3f}")

# Refit on the full training split and evaluate on the held-out test split.
clf.fit(Xtrain, ytrain)

print(f"Test MSE: {mean_squared_error(y_true=ytest, y_pred=clf.predict(Xtest)):.4f}")

Still, our random forest model performs better (MSE of 0.0153). This XGBoost model now has an MSE of 0.0159, a 44% drop from 0.0286.
