In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# use pandas and import the avocado.csv dataset
import pandas as pd
avocado = pd.read_csv('../input/avocado-prices/avocado.csv')

In [None]:
# use the head() method to show your dataset
avocado.head()

In [None]:
avocado = avocado.drop('Unnamed: 0', axis=1)

In [None]:
# Feature extraction: derive the calendar month and a season code from Date.
import numpy as np
avocado['Date'] = pd.to_datetime(avocado['Date'])
avocado['month'] = avocado['Date'].dt.month

# Series.between() includes both endpoints by default. The boolean form
# `inclusive=True` is deprecated since pandas 1.3 and rejected by pandas 2.x,
# so the argument is simply omitted to stay valid on every version.
conditions = [
    avocado['month'].between(3, 5),    # March - May
    avocado['month'].between(6, 8),    # June - August
    avocado['month'].between(9, 11),   # September - November
    avocado['month'].between(1, 2),    # January - February
    avocado['month'].between(12, 12),  # December
]

# One code per condition above; the last two entries both map to winter.
# spring = 0, summer = 1, fall = 2, winter = 3
values = [0, 1, 2, 3, 3]
avocado['seasons'] = np.select(conditions, values)

In [None]:
avocado

In [None]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
avocado.describe()

In [None]:
avocado["seasons"].value_counts()

In [None]:
avocado["region"].value_counts()

In [None]:
# Histogram of the "Total Volume" column (50 bins) rendered inline with matplotlib.
%matplotlib inline
import matplotlib.pyplot as plt
avocado["Total Volume"].hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Interactive plotly histogram of AveragePrice, coloured by avocado type,
# with a marginal box plot; every column is exposed in the hover tooltip.
import plotly.express as px
#conventional = avocado[avocado['type'] == 'conventional']
#organic = avocado[avocado['type'] == 'organic']

fig = px.histogram(avocado, x='AveragePrice', color='type',
                   marginal='box', # or violin, rug
                   hover_data=avocado.columns)


fig.show()

In [None]:
# Box plot of AveragePrice for each region.
fig = px.box(avocado, x='region', y='AveragePrice')
fig.show()

In [None]:
import seaborn as sns

# Correlate numeric columns only. Recent pandas versions no longer drop
# non-numeric columns silently, so calling corr() on the full frame (which
# still holds the string columns 'type' and 'region') raises there.
corr = avocado.select_dtypes(include='number').corr()

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
ax.set_title('Correlation Matrix', fontsize=16)

# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes".
sns.heatmap(corr, vmin=-1, vmax=1, cmap='viridis', annot=True, ax=ax)
plt.show()

In [None]:
avocado.isnull().any()

In [None]:
avocado.duplicated().any()

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
# use the scikit-learn library and split your dataset into train_set and test_set
# IMPORTANT --> consider 25% of your dataset as the test_set (random_state=42)
from sklearn.model_selection import train_test_split
# test_size now matches the 25% requirement stated above (it was 0.33).
train_set, test_set = train_test_split(avocado, test_size=0.25, random_state=42)

In [None]:
train_set

In [None]:
# split inputs and output (AveragePrice)
avocado_labels = train_set['AveragePrice'].copy()
avocado_tr = train_set.drop('AveragePrice', axis=1)

In [None]:
avocado_tr

In [None]:
# split numerical and categorical columns
avocado_num = avocado_tr.drop(['type','Date','region'], axis=1)
avocado_cat = avocado_tr[['type']]
avocado_region = avocado_tr[['region']]

In [None]:
avocado_num

In [None]:
avocado_cat

In [None]:
avocado_region

In [None]:
# generate numerical pipeline to take care of missing values and scale the dataset
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

In [None]:
# generate full pipeline to take care of numerical and categorical data (use OneHotEncoder)
num_attribs = list(avocado_num)
cat_attribs = ["type"]
region_attrib = list(avocado_region)

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
        ("matrix", OneHotEncoder(), region_attrib),
    ])

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
# apply full pipeline to training set and prepare the data for training ML model
# (fit_transform: the pipeline's parameters are learned from the training predictors here)
avocado_tr_prepared = full_pipeline.fit_transform(avocado_tr)

In [None]:
# Show the prepared features as a dense array.
# NOTE(review): .toarray() assumes the ColumnTransformer returned a sparse matrix -- confirm.
avocado_tr_prepared.toarray()

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
# use prepared data and output and train a linear regression model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(avocado_tr_prepared, avocado_labels)

In [None]:
# use prepared data and output and train a second degree polynomial regression model
from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-2 feature expansion on the prepared training features;
# include_bias=False leaves out the constant column.
poly_2_features = PolynomialFeatures(degree=2, include_bias=False)
avocado_prepared_poly_2 = poly_2_features.fit_transform(avocado_tr_prepared)

# Ordinary least squares on the expanded features.
poly_reg_2 = LinearRegression()
poly_reg_2.fit(avocado_prepared_poly_2, avocado_labels)

In [None]:
##################### use prepared data and output and train a third degree polynomial regression model#################

#poly_3_features = PolynomialFeatures(degree=3, include_bias=False)

#avocado_prepared_poly_3 = poly_3_features.fit_transform(avocado_tr_prepared)

#poly_reg_3 = LinearRegression()
#poly_reg_3.fit(avocado_prepared_poly_3, avocado_labels)

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
avocado_tr_predictions = lin_reg.predict(avocado_tr_prepared)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

lin_mse = mean_squared_error(avocado_labels, avocado_tr_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# prepare the test set to test the trained model
avocado_tr_prepared_poly_2 = poly_2_features.fit_transform(avocado_tr_prepared)

In [None]:
# use trained second degree regression model and perform prediction on the prepared test set
avocado_tr_predictions_2 = poly_reg_2.predict(avocado_tr_prepared_poly_2)

In [None]:
# calculate rmse for tested model
poly_2_mse = mean_squared_error(avocado_labels, avocado_tr_predictions_2)
poly_2_rmse = np.sqrt(poly_2_mse)
poly_2_rmse

In [None]:
#################### prepare the test set to test the trained model#####################################################

#avocado_tr_prepared_poly_3 = poly_3_features.fit_transform(avocado_tr_prepared)

In [None]:
#################### use trained third degree regression model and perform prediction on the prepared test set##########

#avocado_tr_predictions_3 = poly_reg_3.predict(avocado_tr_prepared_poly_3)

In [None]:
#################### calculate rmse for tested model####################################################################
#poly_3_mse = mean_squared_error(avocado_labels, avocado_tr_predictions_3)
#poly_3_rmse = np.sqrt(poly_3_mse)
#poly_3_rmse

In [None]:
# train a second degree ridge regression on prepared data
# (alpha=1.2; fit_intercept=False, so no separate intercept term is fitted)
from sklearn.linear_model import Ridge
ridge_reg_2 = Ridge(alpha=1.2, solver="cholesky", fit_intercept=False)
ridge_reg_2.fit(avocado_prepared_poly_2, avocado_labels)

In [None]:
# use second degree ridge regression and do prediction on the prepared TRAINING set
# (avocado_tr_prepared_poly_2 holds the expanded training features, not test data)
avocado_tr_ridge_predictions_2 = ridge_reg_2.predict(avocado_tr_prepared_poly_2)

In [None]:
# calculate the training RMSE
# if the RMSE is not satisfying, go back, use a different alpha and try to find the best alpha (the alpha which results in the smallest RMSE)
ridge_poly_2_mse = mean_squared_error(avocado_labels, avocado_tr_ridge_predictions_2)
ridge_poly_2_rmse = np.sqrt(ridge_poly_2_mse)
ridge_poly_2_rmse

In [None]:
# train a second degree elastic net on prepared data
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.001, l1_ratio=0.1)
elastic_net.fit(avocado_prepared_poly_2, avocado_labels)

In [None]:
# use second degree elastic net and do prediction on the prepared TRAINING set
avocado_tr_elastic_predictions_2 = elastic_net.predict(avocado_tr_prepared_poly_2)

In [None]:
# calculate the training RMSE
# if the RMSE is not satisfying, go back, use a different alpha (between 0 and 1) and l1_ratio (between 0 and 1) and try to find the best alpha/l1_ratio (the pair which results in the smallest RMSE)
elastic_poly_2_mse = mean_squared_error(avocado_labels, avocado_tr_elastic_predictions_2)
elastic_poly_2_rmse = np.sqrt(elastic_poly_2_mse)
elastic_poly_2_rmse

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
# prepare the test set to test the trained model
avocado_test = test_set.drop("AveragePrice", axis=1)
avocado_test_labels = test_set["AveragePrice"].copy()

avocado_test_prepared = full_pipeline.fit_transform(avocado_test)

In [None]:
# use trained linear regression model and perform prediction on the prepared test set
avocado_test_predictions = lin_reg.predict(avocado_test_prepared)

In [None]:
# calculate rmse for tested model
from sklearn.metrics import mean_squared_error
import numpy as np

lin_mse = mean_squared_error(avocado_test_labels, avocado_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# prepare the test set to test the trained model
avocado_test_prepared_poly_2 = poly_2_features.fit_transform(avocado_test_prepared)

In [None]:
# use trained second degree regression model and perform prediction on the prepared test set
avocado_test_predictions_2 = poly_reg_2.predict(avocado_test_prepared_poly_2)

In [None]:
# calculate rmse for tested model
poly_2_mse = mean_squared_error(avocado_test_labels, avocado_test_predictions_2)
poly_2_rmse = np.sqrt(poly_2_mse)
poly_2_rmse

In [None]:
############ prepare the test set to test the trained model#############################################################

#avocado_test_prepared_poly_3 = poly_3_features.fit_transform(avocado_test_prepared)

In [None]:
################# use trained third degree regression model and perform prediction on the prepared test set#############

#avocado_test_predictions_3 = poly_reg_3.predict(avocado_test_prepared_poly_3)

In [None]:
################# calculate rmse for tested model#######################################################################

#poly_3_mse = mean_squared_error(avocado_test_labels, avocado_test_predictions_3)
#poly_3_rmse = np.sqrt(poly_3_mse)
#poly_3_rmse

In [None]:
# train a second degree ridge regression on prepared data
# (this rebinds ridge_reg_2 to a new model with alpha=0.8, fitted on the expanded training features)
from sklearn.linear_model import Ridge
ridge_reg_2 = Ridge(alpha=0.8, solver="cholesky", fit_intercept=False)
ridge_reg_2.fit(avocado_prepared_poly_2, avocado_labels)

In [None]:
# use second degree ridge regression and do prediction on prepared test set
avocado_test_ridge_predictions_2 = ridge_reg_2.predict(avocado_test_prepared_poly_2)

In [None]:
# calculate the test RMSE
# if the RMSE is not satisfying, go back, use different alpha (between 0 and 1) and try to find the best alpha (alpha which result in smallest RMSE)
# NOTE(review): picking alpha by looking at this test RMSE would tune the model on the test set -- consider cross-validation on the training data instead.
ridge_poly_2_mse = mean_squared_error(avocado_test_labels, avocado_test_ridge_predictions_2)
ridge_poly_2_rmse = np.sqrt(ridge_poly_2_mse)
ridge_poly_2_rmse

In [None]:
# train a second degree elastic net on prepared data
# (rebinds elastic_net to a new model, again fitted on the expanded training features)
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.001, l1_ratio=0.1)
elastic_net.fit(avocado_prepared_poly_2, avocado_labels)

In [None]:
# use second degree elastic net and do prediction on prepared test set
avocado_test_elastic_predictions_2 = elastic_net.predict(avocado_test_prepared_poly_2)

In [None]:
# calculate the test RMSE
# if the RMSE is not satisfying, go back, use different alpha (between 0 and 1) and l1_ratio (between 0 and 1) and try to find the best alpha/l1_ratio (alpha/l1_ratio which result in smallest RMSE)
elastic_poly_2_mse = mean_squared_error(avocado_test_labels, avocado_test_elastic_predictions_2)
elastic_poly_2_rmse = np.sqrt(elastic_poly_2_mse)
elastic_poly_2_rmse

In [None]:
# Train a 500-tree random forest regressor on the prepared training features.
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=500, random_state=42)
forest_reg.fit(avocado_tr_prepared, avocado_labels)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Training RMSE: predictions are made on the same data the forest was fitted on.
avocado_predictions = forest_reg.predict(avocado_tr_prepared)
forest_mse = mean_squared_error(avocado_labels, avocado_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# Test RMSE. This reuses (overwrites) avocado_test_predictions, forest_mse and forest_rmse.
avocado_test_predictions = forest_reg.predict(avocado_test_prepared)
forest_mse = mean_squared_error(avocado_test_labels, avocado_test_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    `scores` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    Nothing is returned; the three lines are written to standard output.
    """
    print('Scores:', scores)
    average = scores.mean()
    print('Mean:', average)
    spread = scores.std()
    print('Standard Deviation:', spread)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the random forest on the prepared training data.
# The scorer returns negative MSE, so the sign is flipped before the square root.
forest_scores = cross_val_score(forest_reg, avocado_tr_prepared, avocado_labels,
                                scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    #### disabled: a first grid of 12 (3×4) combinations of hyperparameters
    #{'n_estimators': [10, 100, 300], 'max_features': [2, 4, 6, 8]},
    #### active grid: 6 (2×3) combinations with bootstrap set as False
    #### NOTE(review): each max_features value must not exceed the number of columns in avocado_tr_prepared -- confirm that 70 is valid
    {'bootstrap': [False], 'n_estimators': [10, 100], 'max_features': [30,40,70]},
  ]

# Note: this rebinds forest_reg to a fresh, unfitted estimator.
forest_reg = RandomForestRegressor(random_state=42)
#### train across 3 folds (cv=3): 6 candidates * 3 folds = 18 cross-validation fits
grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(avocado_tr_prepared, avocado_labels)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# The estimator configured with those best parameters.
grid_search.best_estimator_

In [None]:
# Print the cross-validated RMSE for every candidate; the stored scores are
# negative MSE, so the sign is flipped before taking the square root.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Per-feature importances of the best forest, in the column order of avocado_tr_prepared.
feature_importance = grid_search.best_estimator_.feature_importances_

In [None]:
feature_importance

In [None]:
# I was hoping that the regions that encompass the cities would have a greater correlation with the data.
# This might have helped me extract a new feature from the region, as I imagine this may be the cause of
# the Random Forest overfitting...
##########
#cat_encoder = full_pipeline.named_transformers_["cat"]
#cat_one_hot_attribs = list(cat_encoder.categories_[0])
#matrix_encoder = full_pipeline.named_transformers_["matrix"]
#matrix_one_hot_attribs = list(matrix_encoder.categories_[0])
#attributes = num_attribs + cat_one_hot_attribs + matrix_one_hot_attribs
#sorted(zip(feature_importance, attributes), reverse=True)

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Same definition as the `display_scores` declared earlier in this notebook;
    `scores` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    Nothing is returned; the three lines are written to standard output.
    """
    print('Scores:', scores)
    average = scores.mean()
    print('Mean:', average)
    spread = scores.std()
    print('Standard Deviation:', spread)

In [None]:
from sklearn.model_selection import cross_val_score

# 20-fold cross-validation of the ridge model; negative MSE is converted to RMSE.
# NOTE(review): ridge_reg_2 was fitted on the degree-2 features (avocado_prepared_poly_2),
# but it is cross-validated here on the non-expanded avocado_tr_prepared -- confirm this is intended.
ridge_scores = cross_val_score(ridge_reg_2, avocado_tr_prepared, avocado_labels,
                                scoring="neg_mean_squared_error", cv=20)
ridge_rmse_scores = np.sqrt(-ridge_scores)
display_scores(ridge_rmse_scores)

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
# Subset used for the yearly price trend: organic avocados of the 'TotalUS'
# region, taken from the already-prepared `avocado` frame (the raw CSV does
# not need to be read again -- that result was overwritten straight away).
# Both filters are built from `avocado` and combined into a single mask, so
# the mask always has the same index as the frame it selects from.
avo = avocado[
    (avocado['region'] == 'TotalUS') & (avocado['type'] == 'organic')
].drop(['Date', 'region'], axis=1)

In [None]:
from scipy import stats

# Predictor (year) and response (average price) with a clean 0..n-1 index.
X_lin = avo['year'].reset_index(drop=True)
y_lin = avo['AveragePrice'].reset_index(drop=True)


# Ordinary least-squares line: price = slope * year + intercept.
slope, intercept, r, p, std_err = stats.linregress(X_lin, y_lin) # scipy

def prediction(x):
    """Return the price on the fitted regression line for year ``x``.

    Uses the module-level ``slope`` and ``intercept`` computed by
    ``stats.linregress`` above.
    """
    return slope * x + intercept

name = 'Avg. Avocado price (organic) in 2019'
md = list(map(prediction, X_lin)) # fitted values for the observed years

X_pred_lin = 2019
y_pred_lin = prediction(X_pred_lin)

print('Predicted avocado price in Entire US in 2019 is: %f USD' % y_pred_lin)

# Series.append() was removed in pandas 2.0; pd.concat gives the same result
# (observed values followed by the single extrapolated point).
X_lin2 = pd.concat([X_lin, pd.Series(X_pred_lin)])
y_lin2 = pd.concat([y_lin, pd.Series(y_pred_lin)])
md2 = list(map(prediction, X_lin2))

plt.scatter(X_lin2, y_lin2) # Scatter Plot
plt.plot(X_lin2, md2, color='green') # fitted line, extended to 2019
plt.xticks(np.arange(min(X_lin2), max(X_lin2+1), 1.0)) # one tick per year
plt.show()

In [None]:
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################
#########################################################################################################################

In [None]:
#from sklearn.ensemble import RandomForestClassifier

#rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
#rnd_clf.fit(avocado_tr_prepared, avocado_labels)

In [None]:
##from sklearn.model_selection import cross_val_predict
#from sklearn.metrics import confusion_matrix

##y_prediction = cross_val_predict(rnd_clf, X, y, cv=3)
##conf_mx = confusion_matrix(y, y_prediction)
##conf_mx