In [None]:
# import packages
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.svm import SVR

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# Load the raw avocado price table from CSV into `df`.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset mount — adjust it when running elsewhere.
df = pd.read_csv('../input/avocado-prices/avocado.csv')

In [None]:
# Work on a copy so the raw frame `df` stays untouched; `df` is read again for the time-series plot.
data = df.copy()

In [None]:
# Peek at the first rows of the working copy.
data.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Cast the volume and bag-count columns to integers in one pass
# (astype('int') discards any fractional part).
columns = ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']

data = data.astype({column: 'int' for column in columns})

In [None]:
# Re-check the dtypes after the integer cast.
data.info()

In [None]:
# Parse the Date column into datetimes so the .dt accessor can be used on it.
data['Date'] = pd.to_datetime(data['Date'])

In [None]:
# Calendar month of each observation, taken from the parsed Date.
data['month'] = data['Date'].dt.month

In [None]:
# Day of the month of each observation, taken from the parsed Date.
data['day'] = data['Date'].dt.day

In [None]:
# Date has been decomposed into month/day above; 'Unnamed: 0' is removed with it
# (presumably a row index written out with the CSV — verify against the file).
data = data.drop(columns = ['Date', 'Unnamed: 0'])

In [None]:
# Inspect the frame after adding month/day and removing the raw columns.
data

In [None]:
# Normalise the column labels: lower-case first, then spaces -> underscores
rename_columns = [label.lower() for label in data.columns]
data.columns = [label.replace(' ', '_') for label in rename_columns]

In [None]:
def label_encoder_pre(data):
    """Integer-encode a column when it holds objects; otherwise hand it back as is.

    `data` is a single column (anything exposing ``.dtype``). When its dtype
    compares equal to 'object', a fresh LabelEncoder is fitted on it and the
    encoded values are returned; any other dtype is returned unchanged.
    """
    return LabelEncoder().fit_transform(data) if data.dtype == 'object' else data

In [None]:
# Mean price per date over every row of the raw frame.
# Only AveragePrice is aggregated: calling .mean() on the whole grouped frame
# also hits the text columns (e.g. region), which raises TypeError on
# pandas >= 2.0. The original also ran that full aggregation twice.
daily_mean_price = df.groupby('Date')['AveragePrice'].mean()

scatter = go.Scatter(x = daily_mean_price.index, y = daily_mean_price)
layout = go.Layout(title = 'Time series plot for mean daily prices for all regions', xaxis ={'title':'Date'}, yaxis = {'title':'Prices'})
figure = go.Figure(data = [scatter], layout = layout)
iplot(figure)

In [None]:
# Apply the seaborn theme before the axes exist: an axes keeps the styling that
# was active when it was created, so setting the theme afterwards left this
# figure in the default matplotlib look on a fresh kernel.
sns.set()
fig = plt.figure(figsize = (20, 10))
ax = fig.subplots(2, 2)

# (column, subplot position, title) for each of the four panels
volume_panels = [
    ('4046', (0, 0), 'The distribution of the 4046 volume'),
    ('4225', (1, 0), 'The distribution of the 4225 volume'),
    ('4770', (0, 1), 'The distribution of the 4770 volume'),
    ('total_volume', (1, 1), 'The distribution of the total volume'),
]
for panel_column, panel_position, panel_title in volume_panels:
    sns.distplot(data[panel_column], ax = ax[panel_position])
    ax[panel_position].set_title(panel_title)

In [None]:
# Apply the seaborn theme before the axes exist so the cell renders the same
# whether or not an earlier cell already set the theme.
sns.set()
fig1 = plt.figure(figsize = (20, 10))
ax1 = fig1.subplots(2, 2)

# (column, subplot position, title) for each of the four panels
bag_panels = [
    ('small_bags', (0, 0), 'The distribution of the small bags'),
    ('large_bags', (1, 0), 'The distribution of the large bags'),
    ('xlarge_bags', (0, 1), 'The distribution of the xlarge bags'),
    ('total_bags', (1, 1), 'The distribution of the total bags'),
]
for panel_column, panel_position, panel_title in bag_panels:
    sns.distplot(data[panel_column], ax = ax1[panel_position])
    ax1[panel_position].set_title(panel_title)

In [None]:
# Distribution of the target variable (average price)
plt.figure(figsize = (12, 8))
sns.set()
price_ax = sns.distplot(data['averageprice'], color = 'lightcoral')
price_ax.set_xlabel('average_price')
price_ax.set_ylabel('frequency')
price_ax.set_title('The distribution of the avocado prices')

In [None]:
# Number of rows per year, split by avocado type
plt.figure(figsize = (12, 8))
sns.set()
sns.countplot(y = data['year'], hue = data['type'])
# Ask for the grid explicitly: a bare plt.grid() toggles the current
# visibility, so with the seaborn theme active it can switch the grid off.
plt.grid(True)
plt.title('The part of avocado who conventional depending of the year')

In [None]:
# Number of rows per month, split by avocado type
plt.figure(figsize = (14, 10))
sns.set()
sns.countplot(y = data['month'], hue = data['type'])
# Ask for the grid explicitly: a bare plt.grid() toggles the current
# visibility, so with the seaborn theme active it can switch the grid off.
plt.grid(True)
plt.title('The part of avocado who conventional depending of the month')

In [None]:
# Price spread per region, one horizontal box per region
plt.figure(figsize = (12, 18))
ax = sns.boxplot(data = data, x = 'averageprice', y = 'region')
ax.set_title('The boxplot of the region', fontsize = 16)

In [None]:
plt.figure(figsize = (12, 8))
# 'type' and 'region' are still text here (they are label-encoded in a later
# cell), and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# Restricting to numeric columns works on every pandas version and yields the
# matrix that older versions produced by silently skipping the text columns.
numeric_data = data.select_dtypes(include = 'number')
sns.heatmap(numeric_data.corr(), annot = True)
plt.title('The correlation matrix', fontsize = 16)

In [None]:
# Integer-encode every object column; all other columns pass through unchanged
data = data.apply(label_encoder_pre)

In [None]:
# One-hot encode the categorical columns; the first level of each is dropped
categorical_columns = ['year', 'type', 'region']
data = pd.get_dummies(data, columns = categorical_columns, drop_first = True)

In [None]:
# add polynomial features
def add_polynomial_features(frame, poly_degree=2, interaction=False):
    """Return ``frame`` extended with polynomial terms of total_volume and total_bags.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'total_volume' and 'total_bags'.
    poly_degree : int, default 2
        Passed to PolynomialFeatures as ``degree``.
    interaction : bool, default False
        Passed to PolynomialFeatures as ``interaction_only``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``frame`` followed by the generated
        terms above degree 1, named after their source columns
        (e.g. 'total_volume^2'). ``frame`` itself is not modified.
    """
    base_columns = ['total_volume', 'total_bags']
    poly = PolynomialFeatures(degree = poly_degree, interaction_only = interaction, include_bias = False)
    poly_features = poly.fit_transform(frame[base_columns])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old method on older releases.
    if hasattr(poly, 'get_feature_names_out'):
        feature_names = list(poly.get_feature_names_out(base_columns))
    else:
        feature_names = list(poly.get_feature_names(base_columns))

    # Reuse frame's index so concat lines the rows up even when the index is
    # not the default 0..n-1 range (otherwise concat would introduce NaN rows).
    df_poly = pd.DataFrame(poly_features, columns = feature_names, index = frame.index)

    # The degree-1 terms are the two input columns, which frame already holds;
    # dropping both avoids duplicating them in the result.
    return pd.concat([frame, df_poly.drop(columns = base_columns)], axis = 1)
#data = add_polynomial_features(data, 2, False)

In [None]:
# Separate the target (average price) from the predictors
y = data['averageprice']
X = data.drop(columns = ['averageprice'])

# Hold out a quarter of the rows for validation; the seed fixes the split
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 12345,
)

In [None]:
# Columns whose dtype is int32 or int64 are the ones to standardise
standard_scaler_pre = [
    name for name in data.columns
    if data[name].dtype == 'int32' or data[name].dtype == 'int64'
]

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
ss = StandardScaler()
ss.fit(X_train[standard_scaler_pre])
X_train[standard_scaler_pre] = ss.transform(X_train[standard_scaler_pre])
X_valid[standard_scaler_pre] = ss.transform(X_valid[standard_scaler_pre])

In [None]:
# Candidate models as [display name, estimator] pairs
regressors = [['ElasticNet', ElasticNetCV(random_state = 12345, cv = 10)],
             ['SVR', SVR()],
             ['XGBRegressor', XGBRegressor(random_state = 12345)],
             ['LGBMRegressor', LGBMRegressor(random_state = 12345)],
             ['CatBoostRegressor', CatBoostRegressor(random_state = 12345, verbose = 1000)]]

print('Mean absolute error results:')
result_regressors = dict()
for name, reg in regressors:
    reg.fit(X_train, y_train)
    # Score the model that was just trained, on the held-out validation rows.
    # The previous cross_val_score(reg, X_valid, y_valid, ...) call cloned the
    # estimator and refitted it on folds of the validation set, so the reported
    # error came from models that never saw the training data and the fit
    # above was thrown away.
    mae = mean_absolute_error(y_valid, reg.predict(X_valid))
    print(name, mae)
    result_regressors[name] = mae

In [None]:
# Tabulate the scores: one row per model
result_regressors_df = pd.DataFrame({
    'model': list(result_regressors.keys()),
    'mean_absolute_error': list(result_regressors.values()),
})

In [None]:
# Show the per-model error table.
result_regressors_df

In [None]:
# Plot each model's mean absolute error for a side-by-side comparison
scatter = go.Scatter(
    x = result_regressors_df['model'],
    y = result_regressors_df['mean_absolute_error'],
)
layout = go.Layout(
    title = 'Mean absolute error for all models',
    xaxis = dict(title = 'model'),
    yaxis = dict(title = 'mean_absolute_error'),
)
figure = go.Figure(data = [scatter], layout = layout)
iplot(figure)