In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploring the Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Render floats with thousands separators and two decimals. This is set before
# the preview so that the first displayed table already uses the format.
pd.options.display.float_format = '{:,.2f}'.format

data = pd.read_csv("../input/avocado-prices/avocado.csv")

# Only the last expression of a cell is rendered, so the preview must come last.
data.head()

- Date - The date of the observation
- AveragePrice - the average price of a single avocado
- type - conventional or organic
- year - the year
- region - the city or region of the observation
- Total Volume - Total number of avocados sold
- 4046 - Total number of avocados with PLU 4046 sold
- 4225 - Total number of avocados with PLU 4225 sold
- 4770 - Total number of avocados with PLU 4770 sold

In [None]:
# Overview of the loaded frame: column names, dtypes and non-null counts
data.info()

In [None]:
# (number of rows, number of columns) of the dataset
data.shape

In [None]:
# Looking at the different regions: list every distinct value of the "region" column

data["region"].unique()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

# General avocado trends
1. Look at average price over time.
2. Look at average volume sold over time.
3. Is there a correlation between price and volume?

In [None]:
# Derive the calendar month of each observation from its "Date" value and
# store it as a new "month" column on the dataset
observation_dates = pd.to_datetime(data["Date"])
data["month"] = observation_dates.dt.month
data.head()

In [None]:
# Month-by-year table of AveragePrice (pivot_table's default aggregation is the mean)
prices = data.pivot_table(index="month", columns="year", values="AveragePrice")

In [None]:
# One line per year, months on the x axis
prices.plot(kind="line", title="Average Avocado Price", xlabel="Month", ylabel="Price")

Avocado prices tend to increase from July to November each year.

In [None]:
# Month-by-year table of Total Volume (pivot_table's default aggregation is the mean)
volume = data.pivot_table(index="month", columns="year", values="Total Volume")

In [None]:
# One line per year, months on the x axis
volume.plot(kind="line", title="Average Volume Sold", xlabel="Month", ylabel="Volume")

Volume of avocados sold tends to spike in February and May before decreasing gradually until November.

In [None]:
def _price_volume_table(monthly_price, monthly_volume):
    """Place one year's monthly price and volume series side by side.

    Both inputs are columns taken from the month-indexed `prices` / `volume`
    pivots; the result has the columns "Price" and "Volume".
    """
    combined = pd.concat([monthly_price, monthly_volume], axis=1)
    combined.columns = ["Price", "Volume"]
    return combined


p2015, v2015 = prices[2015], volume[2015]
table2015 = _price_volume_table(p2015, v2015)

p2016, v2016 = prices[2016], volume[2016]
table2016 = _price_volume_table(p2016, v2016)

p2017, v2017 = prices[2017], volume[2017]
table2017 = _price_volume_table(p2017, v2017)

p2018, v2018 = prices[2018], volume[2018]
table2018 = _price_volume_table(p2018, v2018)

In [None]:
fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
fig.suptitle("Price vs Volume Sold")
fig.set_size_inches(15, 9)

# Grid position -> (panel title, table with "Price" and "Volume" columns)
panels = {
    (0, 0): ('2015', table2015),
    (0, 1): ('2016', table2016),
    (1, 0): ('2017', table2017),
    (1, 1): ('2018', table2018),
}

for (row, col), (panel_title, table) in panels.items():
    panel = axs[row, col]
    panel.scatter(table["Volume"], table["Price"])
    panel.set_title(panel_title)
    # Only the left column carries a y label and only the bottom row an x label
    if col == 0:
        panel.set(ylabel="Price")
    if row == 1:
        panel.set(xlabel="Volume")

# Plain (non-scientific, no offset) tick labels
axs[1, 1].ticklabel_format(useOffset=False, style='plain')



- In 2015, price changes did not have much effect on the total volume of avocado sold.
- In 2016, 2017, 2018, price and volume sold were negatively correlated, obeying the law of demand.
- The overall demand for avocados increased from year to year, indicating their increasing popularity.

# Avocado Popularity by Region
- Which 5 regions consume the most avocados, and which 5 consume the fewest?
- Has the demand for avocados across these regions been increasing from year to year?

In [None]:
# Total volume sold per region across the whole dataset.
# The string alias "sum" replaces np.sum: passing NumPy callables as aggfunc
# raises a FutureWarning in recent pandas, while "sum" gives the same result.
regions = pd.pivot_table(data, values="Total Volume", index="region", aggfunc="sum")
regions.sort_values("Total Volume", ascending=False).head()

Some of the regions are cities, some are states, and some cover several states / areas. Because of this, we will look at the top and bottom cities only.

Top 5:
- Los Angeles
- New York
- DallasFtWorth (Dallas Fort Worth)
- Houston
- PhoenixTucson (Phoenix / Tucson)

Bottom 5:
- Louisville
- Albany
- Spokane
- Boise
- Syracuse


In [None]:
# Rows belonging to the five highest-volume cities
top_city_names = ["LosAngeles", "NewYork", "DallasFtWorth", "Houston", "PhoenixTucson"]
topcities = data[data["region"].isin(top_city_names)]

topcities

In [None]:
# Rows belonging to the five lowest-volume cities
bottom_city_names = ["Louisville", "Albany", "Spokane", "Boise", "Syracuse"]
bottomcities = data[data["region"].isin(bottom_city_names)]

bottomcities

In [None]:
# Latest observation date in the dataset.
# NOTE(review): "Date" is read from the CSV without date parsing, so this is
# presumably a string maximum; that equals the latest date only if the values
# are ISO formatted (YYYY-MM-DD) -- TODO confirm.
data["Date"].max()

In [None]:
# Yearly total volume for each of the top cities (years as rows, cities as columns).
# "sum" is used instead of np.sum, which recent pandas deprecates as an aggfunc.
t_citiestable = pd.pivot_table(
    topcities, values="Total Volume", index="year", columns="region", aggfunc="sum"
)

# Remove the 2018 row: data only goes up to March 2018, so its total is not
# comparable with the full years.
t_citiestable = t_citiestable.drop(2018, axis=0)

t_citiestable

In [None]:
# One line per city, with a tick on every year present in the table
ax = t_citiestable.plot.line(xticks=t_citiestable.index)
ax.set_title('Avocado Sales in Cities with Highest Demand')
ylab = ax.set_ylabel('Avocados Sold')

# Show full numbers on the axes instead of offset / scientific notation
ax.ticklabel_format(style='plain', useOffset=False)

# Legend sits outside the plot area, to the right of the axis
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

In [None]:
# Yearly total volume for each of the bottom cities (years as rows, cities as columns).
# "sum" is used instead of np.sum, which recent pandas deprecates as an aggfunc.
b_citiestable = pd.pivot_table(
    bottomcities, values="Total Volume", index="year", columns="region", aggfunc="sum"
)

# Remove the 2018 row: data only goes up to March 2018, so its total is not
# comparable with the full years.
b_citiestable = b_citiestable.drop(2018, axis=0)

b_citiestable

In [None]:
# One line per city, with a tick on every year present in the table
ax = b_citiestable.plot.line(xticks=b_citiestable.index)
ax.set_title('Avocado Sales in Cities with Lowest Demand')
ylab = ax.set_ylabel('Avocados Sold')

# Show full numbers on the axes instead of offset / scientific notation
ax.ticklabel_format(style='plain', useOffset=False)

# Legend sits outside the plot area, to the right of the axis
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

- Avocado sales increased in cities with highest and lowest demand between 2015 & 2016. There was only a slight increase in 2017 from 2016.

- Price vs volume sold graph shows a drastic increase in avocado sales in 2018 in months Jan, Feb & March compared to other years.

# Sales & Popularity of Different Avocado Types

- Look at demand for different avocado types over the years.
- Which avocado types are more / less popular in different regions?

In [None]:
# Total PLU 4046 avocados sold per region.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
avo_4046 = pd.pivot_table(data, values="4046", index="region", aggfunc="sum")
avo_4046.sort_values("4046", ascending=False).head()

Top Cities for 4046:
- LosAngeles
- PhoenixTucson
- DallasFtWorth
- Houston
- MiamiFtLauderdale

Bottom Cities for 4046:
- HartfordSpringfield
- Louisville
- Albany
- BuffaloRochester
- Syracuse

In [None]:
# Total PLU 4225 avocados sold per region.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
avo_4225 = pd.pivot_table(data, values="4225", index="region", aggfunc="sum")
avo_4225.sort_values("4225", ascending=False).head()

Top Cities for 4225:
- NewYork
- LosAngeles
- Chicago
- SanFrancisco
- BaltimoreWashington

Bottom Cities for 4225:
- NewOrleansMobile
- Nashville
- StLouis
- Jacksonville
- Boise

In [None]:
# Total PLU 4770 avocados sold per region.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
avo_4770 = pd.pivot_table(data, values="4770", index="region", aggfunc="sum")
avo_4770.sort_values("4770", ascending=False).head()

Top Cities for 4770:
- Chicago
- LosAngeles
- Detroit
- Houston
- DallasFtWorth

Bottom Cities for 4770:
- Orlando
- Tampa
- Syracuse
- Roanoke
- StLouis

In [None]:
# Yearly totals for each PLU code and for the overall volume.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
avo_types = pd.pivot_table(
    data, values=["4046", "4225", "4770", "Total Volume"], index="year", aggfunc="sum"
)
avo_types

In [None]:
# Leave out the 2018 row before comparing yearly totals
avo_types = avo_types.drop(index=2018)

In [None]:
# Keep only the per-PLU columns
avo_types = avo_types.drop(columns="Total Volume")

In [None]:
# One line per PLU code, with a tick on every year present in the table
ax = avo_types.plot.line(xticks=avo_types.index)
ax.set_title('Sales of different types of avocados')
ylab = ax.set_ylabel('Avocados Sold')

# Show full numbers on the axes instead of offset / scientific notation
ax.ticklabel_format(style='plain', useOffset=False)

# Legend sits outside the plot area, to the right of the axis
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

- 4770 variety decreased in demand by 42.9% between 2016 and 2017.
- 4225 variety decreased in demand by 5% and 7% (compared to the previous year) in 2016 and 2017 respectively.
- All varieties decreased in demand in 2017 compared to 2015.

- Total volume of avocados sold has increased YoY, suggesting other varieties are becoming more popular.

In [None]:
# Rebuild the yearly totals from the full dataset (all years, including the
# "Total Volume" column) so that the share of other varieties can be derived.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
avo_types = pd.pivot_table(
    data, values=["4046", "4225", "4770", "Total Volume"], index="year", aggfunc="sum"
)

# Whatever is not one of the three listed PLU codes counts as "Other"
avo_types["Other"] = avo_types["Total Volume"] - avo_types["4046"] - avo_types["4225"] - avo_types["4770"]
avo_pie = avo_types.drop("Total Volume", axis=1)

avo_pie

In [None]:
fig, axs = plt.subplots(2, 2)
fig.suptitle("Avocado Sales Composition")
fig.set_size_inches(15, 9)

# One pie per row of avo_pie, filling the 2x2 grid row by row. Each title is
# taken from the row's year label, so a panel cannot be labelled with the wrong year.
for ax, (year, shares) in zip(axs.flat, avo_pie.iterrows()):
    ax.pie(shares, autopct='%.0f%%')
    ax.set_title(str(year))

# A single legend, attached to the top-right panel, serves all four pies.
# The trailing semicolon keeps the Legend repr out of the cell output.
axs[0, 1].legend(avo_pie.columns, bbox_to_anchor=(1.5, 0));


# Sales by Bag Type
- Which bags are more popular?
- What are the trends?


In [None]:
# Yearly totals of each bag size.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
bags = pd.pivot_table(
    data, index="year", values=["Large Bags", "Small Bags", "XLarge Bags"], aggfunc="sum"
)
bags

In [None]:
fig, axs = plt.subplots(2, 2)
fig.suptitle("Bags Sold Composition")
fig.set_size_inches(15, 9)

# One pie per row of bags, filling the 2x2 grid row by row. Each title is
# taken from the row's year label, so a panel cannot be labelled with the wrong year.
for ax, (year, shares) in zip(axs.flat, bags.iterrows()):
    ax.pie(shares, autopct='%.0f%%')
    ax.set_title(str(year))

# A single legend, attached to the top-right panel, serves all four pies.
# The trailing semicolon keeps the Legend repr out of the cell output.
axs[0, 1].legend(bags.columns, bbox_to_anchor=(1.5, 0));


An increasingly greater proportion of large avocado bags are being sold instead of small bags. This could be due to the rising popularity of avocados and eating them more regularly.

In [None]:
# Leave out the 2018 row before plotting the yearly trend
bags = bags.drop(index=2018)

In [None]:
# One line per bag size, with a tick on every year present in the table
ax = bags.plot.line(xticks=bags.index)
ylab = ax.set_ylabel('Bags Sold')

# Show full numbers on the axes instead of offset / scientific notation
ax.ticklabel_format(style='plain', useOffset=False)

# Legend sits outside the plot area, to the right of the axis
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

More bags of each type are being sold. There was a more drastic increase in 2016 than 2017.

# Conventional or Organic?

In [None]:
# Split data by year: one frame per value of the "year" column

data_2015, data_2016, data_2017, data_2018 = (
    data[data["year"].eq(wanted_year)] for wanted_year in (2015, 2016, 2017, 2018)
)

In [None]:
# Year labels, the per-year frames, and empty accumulators for the
# conventional / organic totals
years = list(range(2015, 2018))
dfs = [data_2015, data_2016, data_2017, data_2018]
con_tot, org_tot = [], []

In [None]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every total.
con_tot = []
org_tot = []

# dfs holds the 2015, 2016, 2017 and 2018 frames in that order, so counting
# from 2015 yields the matching year label for each frame.
for year, df in enumerate(dfs, start=2015):

    c_year = df[df["type"] == "conventional"]
    o_year = df[df["type"] == "organic"]

    c_year_tot = np.round(c_year["Total Volume"].sum())
    o_year_tot = np.round(o_year["Total Volume"].sum())

    # "Total Volume" is the number of avocados sold, not a number of bags
    print(c_year_tot, "conventional avocados sold in", year)
    print(o_year_tot, "organic avocados sold in", year)

    con_tot.append(c_year_tot)
    org_tot.append(o_year_tot)

In [None]:
# Add 2018 to the year labels. The guard makes the cell safe to re-run: an
# unguarded append would add 2018 again and leave `years` longer than the totals.
if 2018 not in years:
    years.append(2018)

In [None]:
# Year-indexed table with one column per avocado type
con_org_dic = dict(Conventional=con_tot, Organic=org_tot)
con_org = pd.DataFrame(data=con_org_dic, index=years)
con_org

In [None]:
fig, axs = plt.subplots(2, 2)
fig.suptitle("Organic vs Conventional Sales")
fig.set_size_inches(15, 9)

# One pie per row of con_org, filling the 2x2 grid row by row. Each title is
# taken from the row's year label, so a panel cannot be labelled with the wrong year.
for ax, (year, shares) in zip(axs.flat, con_org.iterrows()):
    ax.pie(shares, autopct='%.0f%%')
    ax.set_title(str(year))

# A single legend, attached to the top-right panel, serves all four pies.
# The trailing semicolon keeps the Legend repr out of the cell output.
axs[0, 1].legend(con_org.columns, bbox_to_anchor=(1.5, 0));

- Proportion of organic avocados being sold is increasing every year.
- Suggests a shift in health awareness and perhaps wealth.

In [None]:
# Leave out the 2018 row before plotting the yearly trend
con_org = con_org.drop(index=2018)

In [None]:
ax = con_org.plot(xticks=con_org.index)

# con_org is built from "Total Volume" sums, which count avocados rather than
# bags, so the axis is labelled accordingly.
ylab = ax.set_ylabel('Avocados Sold')
ax.ticklabel_format(useOffset=False, style='plain')
ax.set_title('Organic vs Conventional Sales')

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Conventional to Organic Ratio by Region

In [None]:
# Separate the observations by avocado type
conventional = data.loc[data["type"].eq("conventional")]
organic = data.loc[data["type"].eq("organic")]

In [None]:
# Quick look at the first rows of the full dataset
data.head()

In [None]:
# Total volume per region for each avocado type. Each input frame holds a
# single type, so each pivot has exactly one column, named after that type.
# "sum" replaces np.sum, which recent pandas deprecates as an aggfunc.
con_ratio = pd.pivot_table(conventional, index="region", columns="type", values="Total Volume", aggfunc="sum")
org_ratio = pd.pivot_table(organic, index="region", columns="type", values="Total Volume", aggfunc="sum")

In [None]:
# Side-by-side conventional / organic volume per region
con_org_ratio = pd.concat([con_ratio, org_ratio], axis=1)

conventional_volume = con_org_ratio["conventional"]
organic_volume = con_org_ratio["organic"]
con_org_ratio["Total"] = conventional_volume.add(organic_volume)

# Organic share of each region's total volume, in percent
con_org_ratio["% Organic"] = con_org_ratio["organic"].div(con_org_ratio["Total"]).mul(100)

# Regions with the highest organic share
con_org_ratio.sort_values(by="% Organic", ascending=False).head(8)

In [None]:
# Regions with the lowest organic share (last eight rows of the descending sort)
con_org_ratio.sort_values(by="% Organic", ascending=False).iloc[-8:]

In [None]:
# Distribution of the organic share across regions
con_org_ratio.loc[:, "% Organic"].describe()

% Organic avocados sold can vary a bit across regions. This could be due to income, culture, and other lifestyle habits / influences.

# Conclusion

- Price fluctuates a lot between 1.2 and 1.9 over 4 years.
- Price is highest in September, lowest in Feb.
- Volume sold is highest in February and May, lowest in October.
- Avocados are increasing in popularity every year across most regions.
- Highest and lowest demand for various avocado SKUs vary across cities but are generally consistent.
- Other SKUs of avocado are becoming more popular than the ones listed in the dataset.
- Big bags are becoming more popular every year, however small bags still dominate at 70%+.
- Organic varieties are increasing in popularity, rising from 2% to 4% of sales over 4 years.
- % organic avocados sold by region ranges from 0.68% to 6.74%.
