In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the avocado price dataset on the Kaggle input mount.
path = '/kaggle/input/avocado-prices/avocado.csv'

# Read the CSV into the DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Drop the 'Unnamed: 0' column.
# The column is named directly via `columns=` (instead of passing a DataFrame as
# the labels), and the result is reassigned rather than mutated in place.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
#import other packages
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Summary statistics of the DataFrame (pandas default settings).
df.describe()

In [None]:
# List the distinct values found in the 'region' column.
df['region'].unique()

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

There are no missing values.

# Data Distribution

In [None]:
# Distribution of the average price, one box per avocado type.
sns.boxplot(data=df, x='type', y='AveragePrice')

In [None]:
# Distribution of the average price, one box per year.
sns.boxplot(data=df, x='year', y='AveragePrice')

In [None]:
# Price versus sales volume.
# Drawn on an explicit Axes so the figure can be labelled: a scatter built from
# `data=` string keys does not label its axes on its own.
fig, ax = plt.subplots()
ax.scatter(x='Total Volume', y='AveragePrice', data=df, color='red')
ax.set_xlabel('Total Volume')
ax.set_ylabel('AveragePrice')
ax.set_title('Average Price vs. Total Volume')
plt.show()

# Avocado Sales Volume

*Distribution of sales volume by PLU type and distribution of sales volume by bag size*

In [None]:
# Legend labels for the two pie charts drawn in the following cells.
label1 = ('PLU4046', 'PLU4225', 'PLU4770')
label2 = ('Small Bags', 'Large Bags', 'XL Bags')

# Column totals, truncated to integers, for each PLU code ...
PLU4046, PLU4225, PLU4770 = (
    int(df[column].sum()) for column in ('4046', '4225', '4770')
)

# ... and for each bag size.
small, large, xl = (
    int(df[column].sum()) for column in ('Small Bags', 'Large Bags', 'XLarge Bags')
)

# Values in the same order as label1 / label2.
plu_count = [PLU4046, PLU4225, PLU4770]
bags = [small, large, xl]

In [None]:
# Pie chart of the summed volume per PLU code.
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(plu_count, autopct='%1.1f%%')
ax.legend(label1, loc="upper right")
fig.tight_layout()
ax.set_title("Avocado Total Avocados sold PLU")

# Render the figure.
plt.show()

Avocados of PLU4770 type have a small sales volume.

In [None]:
# Pie chart of the summed volume per bag size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(bags, autopct='%1.1f%%')
ax.legend(label2, loc="upper right")
fig.tight_layout()
ax.set_title("Avocado Total Avocados sold by Bag Size")

# Render the figure.
plt.show()

Small bags account for a large share of the bags sold.

# Avocado Revenue 

In [None]:
# Per-row revenue: average price times the total volume of that row.
df['revenue_daily'] = df['AveragePrice'] * df['Total Volume']

# Per-row revenue for each PLU code, priced at the row's overall average price.
for plu_code in ('4046', '4225', '4770'):
    df['revenue_plu' + plu_code] = df['AveragePrice'] * df[plu_code]

In [None]:
# Preview the first five rows, now including the revenue columns.
df.head(n=5)

In [None]:
# Revenue summed per year (estimator=sum aggregates the rows of each year).
sns.barplot(data=df, x='year', y='revenue_daily', estimator=sum)

In [None]:
# Revenue summed per year, with one bar per avocado type.
sns.barplot(data=df, x='year', y='revenue_daily', hue='type', estimator=sum)

In [None]:
# Yearly revenue totals as a two-column frame (year, revenue_daily) for plotting.
revenue_year = (
    df.groupby(['year'])['revenue_daily']
    .sum()
    .reset_index()
)
revenue_year

In [None]:
# Bar chart of the pre-aggregated yearly revenue totals.
ax = sns.barplot(data=revenue_year, x='year', y='revenue_daily', color='blue')
ax.set_title("Total Revenue Avocado Sales")
ax.set_ylabel("Revenue")
ax.set_xlabel("Year")


In [None]:
# Yearly revenue per PLU code, stacked into one long-format frame
# with the columns year / revenue / PLU.
frames = []
for plu_code in ('4046', '4225', '4770'):
    revenue_col = 'revenue_plu' + plu_code
    yearly = df.groupby(['year'])[revenue_col].sum().reset_index()
    yearly['PLU'] = plu_code
    frames.append(yearly.rename(columns={revenue_col: 'revenue'}))

# Keep the individual per-PLU frames available under their own names.
revenue_year_plu4046, revenue_year_plu4225, revenue_year_plu4770 = frames

results = pd.concat(frames)
results

In [None]:
# Grouped bar chart: yearly revenue with one bar per PLU code.
ax = sns.barplot(data=results, x='year', y='revenue', hue='PLU')
ax.set_title("Total Revenue Avocado Sales")
ax.set_ylabel("Revenue")
ax.set_xlabel("Year")

# Polynomial Regression

In [None]:
# Correlation of variables: correlation heatmap.
fig, ax = plt.subplots(figsize=(10, 10))

# Correlate the numeric columns only. The frame also holds string columns
# (e.g. 'type', 'region'), and recent pandas versions raise on those instead of
# silently skipping them. select_dtypes works on old and new pandas alike.
corr = df.select_dtypes(include='number').corr()

# Draw onto the Axes created above (explicit ax= instead of relying on the
# "current axes" state); colour scale fixed to the full [-1, 1] range.
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,
)

# Tilt the column labels so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_title("Correlation between numeric columns")
plt.show()

In [None]:
#polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

import sklearn.pipeline as pp

from sklearn import linear_model

In [None]:
# Keep only the two columns used by the regression: price (target) and volume (feature).
avocado_new = df.loc[:, ['AveragePrice', 'Total Volume']]
avocado_new.head(n=5)

How are average price and volume associated?

In [None]:
# Distribution of the average price.
# The Series is passed explicitly as x=: a bare positional argument is read as
# `x` by older seaborn releases and as `data` by newer ones, which changes the
# orientation of the plot depending on the installed version.
sns.boxplot(x=avocado_new['AveragePrice'])

In [None]:
# Scatter plot with a fitted regression line: average price against total volume.
sns.regplot(data=avocado_new, x='Total Volume', y='AveragePrice')

You would expect a linear relationship between volume and price, but the data points fit a linear model poorly.

In [None]:
# (rows, columns) of the two-column modelling frame.
avocado_new.shape

In [None]:
# Target: the average price as a Series.
y = avocado_new['AveragePrice']

# Feature matrix: total volume kept as a one-column DataFrame (double brackets).
x = feature = avocado_new[['Total Volume']]

In [None]:
# Split the data into a train and a test set (70% / 30%).
# random_state is fixed so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=4)

# Report the shapes of both partitions (typo "lenth" in the message fixed).
print('The length of the train set is:', x_train.shape, y_train.shape)
print('The length of the test set equals:', x_test.shape, y_test.shape)

In [None]:
# Polynomial transformation to the second order.
# Note: despite its name, y_poly is the feature transformer, not a target.
y_poly = PolynomialFeatures(degree=2)

# Fit the transformer on the training data only, then expand it.
x_train_poly = y_poly.fit_transform(x_train)

# Apply the already-fitted transformer to the test data. Calling fit_transform
# here would re-fit the transformer on the test set, which a held-out set must
# never be used for.
x_test_poly = y_poly.transform(x_test)

In [None]:
# Ordinary least squares on the degree-2 polynomial features of the training set.
# fit() returns the estimator itself, so construction and fitting are chained.
model_avocado_price = LinearRegression().fit(x_train_poly, y_train)
model_avocado_price


In [None]:
# Model evaluation: coefficient of determination (R-squared) on the held-out test data.
print("The R-squared equals:")
r_squared_polynomial = model_avocado_price.score(X=x_test_poly, y=y_test)
r_squared_polynomial

In [None]:
# The model: statsmodels OLS summary of price regressed on volume.
# NOTE(review): this fits the raw (degree-1) training feature, not the
# polynomial features used for the scikit-learn model -- confirm that a purely
# linear summary is what is intended here.

# statsmodels does not add an intercept on its own, so add a constant column.
x2 = sm.add_constant(x_train)

pg_stats_model = sm.OLS(y_train, x2)

# Fit the model. The fit is stored under its own name so that the `results`
# DataFrame built earlier for the revenue-by-PLU chart is not overwritten,
# which would break that chart cell when it is re-run.
ols_results = pg_stats_model.fit()

print(ols_results.summary())

The model fits the data poorly.

**This is the end of this analysis.**