In [None]:
#import the packages and load the data

import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2

# Read the avocado price data set into a DataFrame.
avocado_csv_path = '../input/avocado-prices/avocado.csv'
avocado = pd.read_csv(avocado_csv_path)





In [None]:
# Preview the first rows of the data.
# The 'Unnamed: 0' and 'Date' columns are not essential for this analysis - we can drop them.
avocado.head(n=5)

In [None]:
# Remove the two columns that are not used in the rest of the analysis.
avocado = avocado.drop(columns=['Unnamed: 0', 'Date'])

In [None]:
# The response (target) variable is the price. Let's look at the distribution of AveragePrice.

import matplotlib.pyplot as plt

# Draw on an explicit figure/axes so the histogram can be titled and labelled
# and is readable on its own when the notebook is skimmed.
fig, ax = plt.subplots()
avocado['AveragePrice'].hist(ax=ax)
ax.set(title='Distribution of AveragePrice', xlabel='AveragePrice', ylabel='Number of observations')
plt.show()

In [None]:
# There are organic and conventional avocados - compare the distributions of the
# other (numeric) variables between the two types.

# One grid of histograms per avocado type. Each figure gets the type as its title;
# a plain groupby('type').hist() draws the grids without saying which type each shows.
for avocado_type, type_rows in avocado.groupby('type'):
    axes = type_rows.hist(figsize=(20,10))
    axes.flat[0].get_figure().suptitle(str(avocado_type))
plt.show()

In [None]:
# Count the missing (NA) values in every column.
# Observation recorded by the author: there are no NA values.
avocado.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = avocado.describe()
numeric_summary

In [None]:
#results: organic avocados correspond to a higher price than conventional ones
#next: do a likelihood-ratio test (LRT) for the different regions

In [None]:
# Distribution of the regions: number of rows recorded for each region.

# Series.value_counts() is used because the top-level pd.value_counts() is
# deprecated in recent pandas releases.
region_counts = avocado['region'].value_counts()
region_ax = region_counts.plot(kind="bar")
region_ax.set(title='Observations per region', xlabel='region', ylabel='Number of observations')
plt.show()
# Original note: "there is an even number of regions" - presumably meaning that the
# regions are evenly represented (bars of equal height); confirm against the plot.

In [None]:
# Q: Is there a difference between avocado prices across regions?
# To answer, we will perform a likelihood-ratio test (LRT).
# A mixed model with the observations grouped by region is used (the original note
# gives "accounting for potential outliers" as the reason).

import warnings

import statsmodels.api as sm
import statsmodels.formula.api as smf

# The fit emits a convergence warning which we deliberately ignore. The filter has to
# be installed BEFORE the model is fitted, otherwise the warning is still displayed.
warnings.filterwarnings('ignore')

# NOTE(review): region is used both as a fixed effect and as the grouping factor, and
# the model is fitted with mixedlm's default criterion (REML) - confirm both choices
# before the log-likelihood of this model is used in a likelihood-ratio test.
model2 = smf.mixedlm(formula="AveragePrice ~ region", groups = avocado['region'], data=avocado).fit()

print(model2.summary())




In [None]:
# Intercept-only model - the null model.
# `groups` is a mixed-model argument, not an OLS one, so it is not passed to ols();
# an ordinary least-squares fit has no grouping structure.

model1 = smf.ols(formula="AveragePrice ~ 1", data=avocado).fit()

print(model1.summary())


def lrt(null, alt):
    """Return the likelihood-ratio test statistic for two nested models.

    Parameters
    ----------
    null : float
        Log-likelihood of the null (restricted) model.
    alt : float
        Log-likelihood of the alternative (larger) model.

    Returns
    -------
    float
        2 * (alt - null). This is non-negative whenever the alternative model
        fits at least as well as the null model, which is what the chi-square
        reference distribution expects.
    """
    # The statistic is -2 * (ll_null - ll_alt); the operands were previously
    # swapped, which produced the negative of the statistic.
    return 2 * (alt - null)

# Likelihood-ratio statistic 2 * (llf_alternative - llf_null), written out explicitly so
# the sign convention is visible here. The log-likelihoods are read from the fitted
# results instead of being copied by hand from the printed summaries.
# NOTE(review): model2 comes from mixedlm with its default REML criterion - confirm the
# two log-likelihoods are comparable (ML fits, reml=False, are the usual basis for an LRT).
LR = 2 * (model2.llf - model1.llf)
# Degrees of freedom: number of additional estimated parameters in the alternative model.
LR_df = len(model2.params) - len(model1.params)
# The p-value is the upper-tail probability (survival function) of the chi-square
# distribution; the density (pdf) is not a probability.
stats.chi2.sf(LR, LR_df)


In [None]:
# Formulate the likelihood-ratio test (LRT).

def lrt(null, alt):
    """Return the likelihood-ratio statistic 2 * (alt - null).

    `null` is the log-likelihood of the null (restricted) model and `alt` the
    log-likelihood of the alternative (larger) model; the result is non-negative
    whenever the alternative fits at least as well as the null.
    """
    return 2 * (alt - null)

# Log-likelihoods are taken from the fitted results rather than typed in by hand.
# NOTE(review): model2 comes from mixedlm with its default REML criterion - confirm the
# two log-likelihoods are comparable (ML fits, reml=False, are the usual basis for an LRT).
LR = lrt(model1.llf, model2.llf)
# Degrees of freedom: number of additional estimated parameters in the alternative model.
LR_df = len(model2.params) - len(model1.params)
# p-value = upper tail (survival function) of the chi-square distribution, not its density.
print("The p-value for the LRT between the region mixed model and the intercept-only model is "  + str(stats.chi2.sf(LR, LR_df)))



#there is a statistically significant difference between the region model and the intercept-only model,
#suggesting that region does contribute to the price

In [None]:
# Next - does the model with region plus covariates differ from the region-only model?

# Give the multi-word columns one-word names so they can be referenced in a formula.
single_word_names = {
    'Total Volume': 'Volume',
    'Total Bags': 'TotalBags',
    'Small Bags': 'SmallBags',
    'Large Bags': 'LargeBags',
    'XLarge Bags': 'XLargeBags',
}
avocado = avocado.rename(columns=single_word_names)

# Mixed model with region and all covariates as fixed effects, grouped by region.
covariate_formula = "AveragePrice ~ region + type + Volume + year + TotalBags + SmallBags + LargeBags + XLargeBags"
covariate_model = smf.mixedlm(formula=covariate_formula, groups=avocado['region'], data=avocado)
model3 = covariate_model.fit()

print(model3.summary())



In [None]:
# Likelihood-ratio statistic 2 * (llf_alternative - llf_null) for the region-only model
# (null) against the region+covariates model (alternative), read from the fitted results
# instead of log-likelihoods copied by hand from the printed summaries.
# NOTE(review): both models come from mixedlm with its default REML criterion - confirm
# the log-likelihoods are comparable (ML fits, reml=False, are the usual basis for an LRT
# between models with different fixed effects).
LR2 = 2 * (model3.llf - model2.llf)
# Degrees of freedom: number of additional estimated parameters in the larger model.
LR2_df = len(model3.params) - len(model2.params)
# p-value = upper tail (survival function) of the chi-square distribution, not its density.
print("The p-value for the LRT between the region mixed model and the region+covariates mixed model is "  + str(stats.chi2.sf(LR2, LR2_df)))





In [None]:
# A random forest needs numeric inputs, so the string columns are turned into dummy variables.

organic_conv = pd.get_dummies(avocado['type'])
region = pd.get_dummies(avocado['region'])
# Append both sets of indicator columns, then remove the original string columns.
avocado = (
    avocado
    .join(organic_conv)
    .join(region)
    .drop(columns=['region', 'type'])
)
print(avocado.shape)
avocado.columns.tolist()

In [None]:
# Convert the data to NumPy arrays: y is the target (AveragePrice), x the feature matrix.
y = np.array(avocado['AveragePrice'])
features = avocado.drop(columns='AveragePrice')
# Keep the feature names, since the array itself carries no column labels.
x_names = list(features.columns)
# Request a float array explicitly: the dummy columns can be boolean (the default in
# pandas >= 2.0), and a frame mixing boolean and numeric columns would otherwise be
# converted to an object array. With integer dummies the result is unchanged.
x = np.array(features, dtype=float)

In [None]:
# Split the data into training and testing sets.

from sklearn.model_selection import train_test_split

# 25% of the data is held out for testing, the remaining 75% is used for training.
# random_state is fixed (same seed as the random forest) so that the split - and with it
# every metric reported below - is reproducible from one run to the next.
train_features, test_features, train_labels, test_labels = train_test_split(
    x, y, test_size = 0.25, random_state = 42
)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Build a forest of 10 decision trees with a fixed seed and train it on the training
# data (fit() returns the fitted estimator itself).
rf = RandomForestRegressor(n_estimators=10, random_state=42).fit(train_features, train_labels)

In [None]:
# Predict the prices of the held-out test set.
predictions = rf.predict(test_features)

# Absolute prediction error of every test observation.
errors = np.abs(predictions - test_labels)

# Report the mean absolute error (MAE).
mean_abs_error = np.mean(errors)
print('Mean Absolute Error:', round(mean_abs_error, 2) ,'$')


In [None]:
# Absolute percentage error of every test prediction.
relative_errors = errors / test_labels
mape = 100 * relative_errors
# "Accuracy" is defined here as 100 minus the mean absolute percentage error.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
#results - the random forest predicted avocado prices on the test set with 91.95% accuracy (defined as 100 minus the mean absolute percentage error).