In [None]:
#importing libraries
import numpy as np
import pandas as pd

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# for Datetime 
from datetime import datetime

# Set the theme for visualization.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use whichever name this
# installation actually ships.
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
     if name in plt.style.available),
    None,
)
if whitegrid_style is not None:
    plt.style.use(whitegrid_style)
else:
    # Neither style sheet is present: fall back to seaborn's own white grid.
    sns.set_style("whitegrid")

# Colours used, in order, by every seaborn plot in the notebook.
colors = ["#54436B", "#50CB93", "#ED8E7C", "#FFD523", '#00C1D4', '#F8485E']

# Register them as the default seaborn palette.
sns.set_palette(sns.color_palette(colors))

In [None]:
# Load the avocado dataset; the first CSV column becomes the index.
csv_path = "../input/avocado-prices/avocado.csv"
df = pd.read_csv(csv_path, index_col=[0])

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Peek at the last five rows.
df.tail(n=5)

**The index of the last five rows looks odd; let's examine the DataFrame's index.**

In [None]:
# Shape of the dataset as (rows, columns).
df.shape

In [None]:
# The number of index values must match the number of rows in the data.
df.index

In [None]:
# Look up index label 0; getting several rows back means the label is repeated.
df.loc[0]

**The index contains repeated values, so let's reset it.**

In [None]:
# Replace the repeated index labels with a fresh 0..n-1 range,
# discarding the old labels.
df = df.reset_index(drop=True)

print("Index Range: ", df.index)
df.loc[0]  # label 0 should now match exactly one row.

## Column Descriptions
* Date - The date of the observation
* AveragePrice - the average price of a single avocado
* type - conventional or organic
* year - the year
* region - the city or region of the observation
* Total Volume - Total number of avocados sold
* 4046 - Total number of avocados with PLU 4046 sold
* 4225 - Total number of avocados with PLU 4225 sold
* 4770 - Total number of avocados with PLU 4770 sold



In [None]:
# Give the volume, PLU and bag columns space-free, identifier-friendly names.
renamed_columns = {
    "Total Volume": "TotalSales",
    "4046": "A4046Sales",
    "4225": "A4225Sales",
    "4770": "A4770Sales",
    "Total Bags": "TotalBags",
    "Small Bags": "SmallBags",
    "Large Bags": "LargeBags",
    "XLarge Bags": "XLargeBags",
}
df = df.rename(columns=renamed_columns)
df.head()

In [None]:
# Descriptive statistics (pandas' default `describe` summary) of the dataset.
df.describe()

In [None]:
# Print the per-column overview reported by DataFrame.info().
df.info()

In [None]:
# Data type of each column.
df.dtypes

**The `Date` column has dtype `object`; let's convert it to datetime.**

In [None]:
# Parse the 'Date' column into datetime values, then preview the result.
df = df.assign(Date=pd.to_datetime(df['Date']))
df['Date'].head()

In [None]:
# Count missing values per column.
df.isna().sum()

**No missing values**

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

**There are no duplicate rows.**

In [None]:
# Check outliers in average price.
fig, ax = plt.subplots(figsize=(12, 8))

sns.boxplot(x='AveragePrice', data=df, ax=ax)

# The box plot is horizontal, so price runs along the x-axis; the label
# therefore goes on x rather than on the (categorical, empty) y-axis.
ax.set_xlabel("Price")
ax.set_title("Outliers in Average Price")

plt.show()

In [None]:
# Check outliers in total sales.
# (The figure and axes are created explicitly so the plot is drawn on a known
# axes instead of binding a Figure to a variable called `ax`.)
fig, ax = plt.subplots(figsize=(12, 8))

sns.boxplot(x='TotalSales', data=df, ax=ax)

ax.set_title("Outliers")
plt.show()

**There are some Outliers in Average Price**

In [None]:
# Distribution of each numeric column, one panel per column on a 3x3 grid.
columns = ['AveragePrice', 'TotalSales', 'A4046Sales', 'A4225Sales', 'A4770Sales', 'TotalBags', 'SmallBags', 'LargeBags', 'XLargeBags']

fig, axs = plt.subplots(3, 3, figsize=(18, 12))

# sns.distplot is deprecated and absent from recent seaborn releases;
# histplot with a KDE overlay on a density scale draws the same kind of
# figure.  A fixed bin count keeps heavy-tailed columns from being split
# into a very large number of bins.
# `axs.flat` walks the grid row by row, pairing each panel with one column.
for column, panel in zip(columns, axs.flat):
    sns.histplot(df[column], bins=50, kde=True, stat="density", ax=panel)

plt.show()

In [None]:
# Add YearMonth ("YYYY-MM") and Month ("MM") columns, both as strings.
# The vectorised .dt accessor formats the whole column at once instead of
# calling a Python lambda per row; it requires 'Date' to be a datetime column.
df['YearMonth'] = df['Date'].dt.strftime('%Y-%m')
df['Month'] = df['Date'].dt.strftime('%m')

In [None]:
# Display the DataFrame (pandas truncates the rendered output for large frames).
df

## Avocado Price Trends

* Yearly trend of average price of Avocado per Unit.
* Monthly trend of Price of Avocado.

In [None]:
# Yearly average price of avocado.
# NOTE(review): 2018 is excluded here - presumably because the data covers
# only part of that year; confirm.
# Selecting 'AveragePrice' before aggregating keeps the mean away from the
# string/date columns, which pandas >= 2.0 refuses to average (TypeError).
yearly_price = (
    df.loc[df.year != 2018]
      .groupby('year')[['AveragePrice']]
      .mean()
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.pointplot(x=yearly_price.index, y='AveragePrice', data=yearly_price, ax=ax)

ax.set_ylim(0, 1.75)
ax.set_title("Average Price of Avocado per Unit over Years.")
plt.show()

**Avocado price increases per year**

In [None]:
# Monthly observation of avocado price for each year.
def _monthly_mean_price(year):
    """Return the mean AveragePrice per Month for `year`, rounded to 2 decimals."""
    in_year = df.loc[df.year == year, ['Month', 'AveragePrice']]
    return in_year.groupby(by=['Month']).mean().round(2)


df_2015 = _monthly_mean_price(2015)
df_2016 = _monthly_mean_price(2016)
df_2017 = _monthly_mean_price(2017)
df_2018 = _monthly_mean_price(2018)

In [None]:
# Visualization of the monthly average price of avocado, one line per year.
fig, ax = plt.subplots(figsize=(14, 9))

# Each frame holds the monthly mean price for one year, indexed by Month.
for year, monthly_price in ((2015, df_2015), (2016, df_2016),
                            (2017, df_2017), (2018, df_2018)):
    ax.plot(monthly_price, linewidth=2.2, label=f'{year} Avocado Price')

ax.set_xlabel("Months")
ax.set_ylabel("Avocado Price")  # was misspelled "Avacodo" on the rendered figure
ax.set_title("Monthly Average Price per Unit of Avocado.")

ax.legend()
plt.show()

**In each year, the average price of a single avocado appears higher from July to October than in other months, and lower from January to March.**

## Avocado Sales Trends

* Yearly trends for Total Sales and Sales with PLU types.
* Monthly Trends for Total Sales and Sales with PLU types.
* Relation between AveragePrice and Total Sales

In [None]:
# One full-row subset of `df` per calendar year.
# (These names previously held the monthly price tables; they are rebound here.)
df_2015, df_2016, df_2017, df_2018 = (
    df[df['year'] == year] for year in (2015, 2016, 2017, 2018)
)

In [None]:
# Yearly sales of avocados against their unit price, one panel per year.

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(18, 12))
fig.suptitle("Total Sales based on Avocado Price")

yearly_frames = {2015: df_2015, 2016: df_2016, 2017: df_2017, 2018: df_2018}

# axs.flat visits the panels row by row: top-left, top-right, bottom-left, bottom-right.
for panel, (year, frame) in zip(axs.flat, yearly_frames.items()):
    sns.scatterplot(ax=panel, x='AveragePrice', y='TotalSales', data=frame, hue='type')
    panel.set_title(f"Total Sales in {year}")

plt.show()

**Customers tend to buy more when the price of an avocado is below 1.5 dollars per unit, but this holds only for conventional avocados. For organic avocados, sales stay at about the same level regardless of price in each year.**

In [None]:
# Re-check the first five rows of the full dataset.
df.head(n=5)

In [None]:
# Per-year mean of every numeric column.
# numeric_only=True skips the string/date columns, which pandas >= 2.0 would
# otherwise try to average and fail on with a TypeError.
data = df.groupby('year').mean(numeric_only=True)

In [None]:
# Yearly mean sales for each PLU code, drawn as grouped bars.
plu_columns = ['A4046Sales', 'A4225Sales', 'A4770Sales']
data = data[plu_columns]

ax = data.plot.bar(figsize=(14, 8))
ax.set_ylabel("Sales")
ax.set_title("Yearly Observation of Sales.")

plt.show()

In [None]:
# Mean of every numeric column for each (Month, year) pair.
# numeric_only=True skips the string/date columns, which pandas >= 2.0 would
# otherwise try to average and fail on with a TypeError.
data = df.groupby(by=['Month', 'year']).mean(numeric_only=True)
data.head()

In [None]:
# Monthly observation of total sales, one line per year.
# Unstacking moves the 'year' index level into columns so each year is a line.
monthly_total_sales = data['TotalSales'].unstack()

ax = monthly_total_sales.plot.line(figsize=(14, 8))
ax.set_ylabel("Total Sales")
ax.set_title("Monthly Observation for Total Sales.")

plt.show()

**Sales are higher from January to March and in May, since avocado prices are lowest in those months compared with the rest of the year.**

In [None]:
# Monthly observation of sales per PLU code, one panel per code and one line per year.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(14, 15))

# Move the 'year' index level into columns once and reuse it for every panel.
monthly_by_year = data.unstack()

for panel, plu in zip(axs, ('4046', '4225', '4770')):
    monthly_by_year.plot(kind='line', y=f'A{plu}Sales', ax=panel)
    panel.set_title(f"Monthly Observations for {plu} Sales")


plt.show()

**Sales of PLU 4225 avocados are higher in February and May of each year than in other months.**

In [None]:
# Yearly total bags sold, and monthly bags sold by bag size (2018 excluded).
bag_sizes = ['SmallBags', 'LargeBags', 'XLargeBags']

data = df.loc[df.year != 2018]
# Pick the needed columns *before* summing: summing the whole frame also tries
# to add up the date column, which pandas >= 2.0 rejects with a TypeError.
yrSales = data.groupby(by=['year'])[['TotalBags']].sum()
moSales = data.groupby(by=['Month'])[bag_sizes].sum()

fig, (yearly_ax, monthly_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

sns.pointplot(x=yrSales.index, y='TotalBags', data=yrSales, ax=yearly_ax)
yearly_ax.set_title("Year sales observation for Total Bags")

# One line per bag size on the shared monthly panel.
for size in bag_sizes:
    sns.lineplot(x=moSales.index, y=size, data=moSales, ax=monthly_ax, label=size)
monthly_ax.set_ylabel("Sales")
monthly_ax.set_title("Monthly observation for Total Sales based on bag sizes.")

plt.show()


## Region wise trends in Sales and Price

In [None]:
# Region-wise trends in sales and price.

# Share of total sales (in percent, 2 decimals) contributed by each region.
# 'TotalSales' is selected before summing so the aggregation never touches the
# date/string columns, which pandas >= 2.0 cannot sum across a whole frame.
# NOTE(review): if 'region' contains aggregate rows (e.g. a national total),
# the shares double-count sales - verify against the data.
region_sales = df.groupby(by=['region'])['TotalSales'].sum()
region_grpData = (
    (region_sales / df['TotalSales'].sum() * 100)
    .round(2)
    .to_frame()
    .sort_values(by='TotalSales', ascending=False)
)

# Largest and smallest contributors.
print(region_grpData.head())
print("---"*10)
print(region_grpData.tail())

In [None]:
# Regions of the rows holding the minimum and the maximum average price.
price = df['AveragePrice']
at_lowest = price == price.min()
at_highest = price == price.max()

print("Lowest Avocado Price")
print(df.loc[at_lowest, 'region'])

print("--"*10)

print("Highest Avocado Price")
df.loc[at_highest, 'region']

In [None]:
# Let's examine the lowest price data.
data = df.loc[df.region == 'CincinnatiDayton']
data.head()

In [None]:
# Shape (rows, columns) of the subset currently held in `data`.
data.shape

In [None]:
# Reset the index of the subset to 0..n-1, discarding the old labels.
# `data` is a filtered slice of another frame, so it is rebound to a new frame
# rather than mutated in place (in-place edits on a slice can raise
# SettingWithCopyWarning).
data = data.reset_index(drop=True)
data.head()

In [None]:
# Distribution of each numeric column in `data`, one panel per column on a 3x3 grid.
columns = ['AveragePrice', 'TotalSales', 'A4046Sales', 'A4225Sales', 'A4770Sales', 'TotalBags', 'SmallBags', 'LargeBags', 'XLargeBags']

fig, axs = plt.subplots(3, 3, figsize=(18, 12))

# sns.distplot is deprecated and absent from recent seaborn releases;
# histplot with a KDE overlay on a density scale draws the same kind of
# figure.  A fixed bin count keeps heavy-tailed columns from being split
# into a very large number of bins.
# `axs.flat` walks the grid row by row, pairing each panel with one column.
for column, panel in zip(columns, axs.flat):
    sns.histplot(data[column], bins=50, kde=True, stat="density", ax=panel)

plt.show()

In [None]:
# Total sales against average price for the rows in `data`, coloured by type.
fig, ax = plt.subplots(figsize=(12, 8))

sns.scatterplot(data=data, x='AveragePrice', y='TotalSales', hue='type', ax=ax)

plt.show()

In [None]:
# Total sales per month for each avocado type in `data`.
# 'TotalSales' is selected before summing: summing the whole frame also tries
# to add up the date column, which pandas >= 2.0 rejects with a TypeError.
# The double brackets keep a one-column DataFrame so the unstacked columns
# (and therefore the legend entries) are unchanged.
a = data.groupby(by=['Month', 'type'])[['TotalSales']].sum()

a.unstack().plot(kind='line', figsize=(14, 8))
plt.show()

In [None]:
# Mean average price per month for each avocado type in `data`.
# 'AveragePrice' is selected before averaging so the string columns are never
# aggregated; pandas >= 2.0 raises a TypeError when asked to average them.
# The double brackets keep a one-column DataFrame so the unstacked columns
# (and therefore the legend entries) are unchanged.
a = data.groupby(by=['Month', 'type'])[['AveragePrice']].mean()

a.unstack().plot(kind='line', figsize=(14, 8))
plt.show()

**Sales and price follow the same trend for the CincinnatiDayton region.**