In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt # for dates manipulation
import seaborn as sns # for cool graphics if needed
import warnings # to get rid of anoying warnings
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the session.
# Note: this also hides deprecation warnings raised by pandas/seaborn.
warnings.filterwarnings ("ignore")

# Input data files live in the read-only "../input/" directory.
# Walk that tree and print the full path of every file found.

import os
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the avocado price CSV from the Kaggle input directory and preview its first rows.
csv_path = '/kaggle/input/avocado-prices-2020/avocado-updated-2020.csv'
df = pd.read_csv(csv_path)
df.head()

# The Hass varietal avocado was patented by the U.S. postman Rudolph Hass in 1935
#
# Following Size & Product Look Up (PLU) Codes:
# 
# Small/Medium Hass Avocado (~3-5oz avocado) | #4046
# Large Hass Avocado (~8-10oz avocado) | #4225
# Extra Large Hass Avocado (~10-15oz avocado) | #4770

In [None]:
# The PLU-code column names are replaced by readable size labels
# (small, large, xlarge), since we are talking about the Hass varietal.
plu_to_size = {'4046': 'small', '4225': 'large', '4770': 'xlarge'}
df = df.rename(columns=plu_to_size)
df.head()

In [None]:
# Summarise the frame (column names, dtypes, non-null counts) after the rename.
df.info()

In [None]:
# Parse the 'date' strings (YYYY-MM-DD) into a proper datetime column
df['date_time'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# and derive some calendar features from it
df['date_month'] = df['date_time'].dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the supported way to get the ISO week number.
# Note: the resulting column has dtype UInt32 instead of int64.
df['date_week'] = df['date_time'].dt.isocalendar().week

# To avoid scientific notation when frames are displayed
pd.options.display.float_format = '{:.2f}'.format

# Check that the new columns are present with the expected dtypes
df.info()

In [None]:
# Count how many distinct values the 'geography' column holds.
df['geography'].nunique()

In [None]:
# List the distinct values of the 'geography' column.
df['geography'].unique()

In [None]:
# Let's work with sub-totals first: geographies treated as aggregated areas.
# NOTE(review): 'Grand Rapids' looks like a city rather than a regional sub-total — confirm against the dataset.
these = [
    'West',
    'Southeast',
    'South Central',
    'Plains',
    'Northeast',
    'Midsouth',
    'Great Lakes',
    'Grand Rapids',
]

# Sanity check: which of the requested names actually occur in the data
is_subtotal = df['geography'].isin(these)
df.loc[is_subtotal, 'geography'].unique()

In [None]:
# Quick EDA table: mean of the selected measures for every
# (date, geography, type) combination among the sub-totals.
eda_columns = [
    'geography', 'type', 'date', 'date_month', 'date_week',
    'average_price', 'total_volume', 'small', 'large', 'xlarge',
]
(
    df.loc[df['geography'].isin(these), eda_columns]
    .groupby(['date', 'geography', 'type'])
    .mean()
)

In [None]:
# Keep only the rows whose geography is listed in `these`, as a new dataframe
subtotal_mask = df['geography'].isin(these)
dfg = df[subtotal_mask]
dfg.head()

In [None]:
# Set the style
sns.set_style('whitegrid')

# Describe the average_price per month per geography and type.
# One figure per geography: each boxplot is drawn from that geography's rows
# only. Passing the whole of `dfg` would render the same pooled figure under
# every title.
for g in dfg['geography'].unique():
    rows_for_g = dfg[dfg['geography'] == g]
    # Prepare the canvas size
    plt.figure(figsize=(12,8))
    plt.title('average_price per month in {}'.format(g))
    sns.boxplot(x='date_month', y='average_price', hue='type', data=rows_for_g, palette='pastel')
    plt.show()

In [None]:
# Describe the average_price per year per geography and type.
# One figure per geography: each boxplot is drawn from that geography's rows
# only. Passing the whole of `dfg` would render the same pooled figure under
# every title.
for g in dfg['geography'].unique():
    rows_for_g = dfg[dfg['geography'] == g]
    # Prepare the canvas size
    plt.figure(figsize=(12,8))
    plt.title('average_price per year in {}'.format(g))
    sns.boxplot(x='year', y='average_price', hue='type', data=rows_for_g, palette='pastel')
    plt.show()

It is clear that in the US the prices of organic avocados go up between June and October every year. However, 2017 was the year in which prices peaked for both types (organic and conventional).

In [None]:
# Rebuild the sub-total subset from `these`. This repeats the filter used when
# `dfg` was first created, so it is redundant unless `df` or `these` changed in between.
dfg = df[df['geography'].isin(these)]

In [None]:
# Yearly volume totals per geography and type.
# The columns to aggregate are passed as a list: selecting several columns
# from a groupby with a bare tuple raises in pandas >= 2.0. The grouping key
# 'geography' is left out of the selection, because it already comes back
# through reset_index() and must not be summed.
volume_columns = ['total_volume', 'small', 'large', 'xlarge']
dfyg = dfg.groupby(['year', 'geography', 'type'])[volume_columns].sum().reset_index()
dfyg

In [None]:
# Yearly volume of small avocados per geography, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='small', col='type', hue='year', data=dfyg, kind='bar', height=5, aspect=2).set_xticklabels(rotation=45)

Regarding small size consumption, we can observe:
* The conventional type is by far the most sold across all states
* Consumption is more or less similar across the years in the South Central, Southeast and Plains states
* It is decreasing across the years in the West states
* It is increasing across the years in the Midsouth, Great Lakes and Northeast states
* Grand Rapids is where they are consumed the least

In [None]:
# Yearly volume of large avocados per geography, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='large', col='type', hue='year', data=dfyg, kind='bar', height=5, aspect=2).set_xticklabels(rotation=45)

Regarding large size consumption, we can observe:
* The conventional type is by far the most sold across all states
* Consumption is more or less similar across the years in the Northeast and Plains states
* It is decreasing across the years in the West, South Central, Midsouth and Great Lakes states
* Grand Rapids is where they are consumed the least

In [None]:
# Yearly volume of extra-large avocados per geography, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='xlarge', col='type', hue='year', data=dfyg, kind='bar', height=5, aspect=2).set_xticklabels(rotation=45)

Regarding extra-large size consumption, we can observe:
* The conventional type is by far the most sold across all states
* Consumption is more or less similar across the years in the West states, with a drop in 2020
* It is decreasing across the years in the Midsouth and Great Lakes states
* Grand Rapids is where the least is consumed, but consumption there is increasing across the years
* In the South Central, Southeast and Northeast states there is no clear trend, but consumption seems to be decreasing

In [None]:
# Yearly total volume per geography, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='total_volume', col='type', hue='year', data=dfyg, kind='bar', height=5, aspect=2).set_xticklabels(rotation=45)

Regarding overall consumption, we can observe:
* The conventional type is by far the most sold across all states
* Consumption is more or less similar across the years in Grand Rapids (it seems to have stagnated there)
* There appears to be a steady increase in consumption across the years in almost all regions except Grand Rapids.
* Although the organic type is the least sold, its consumption seems to be gaining popularity in the Great Lakes, Midsouth, Northeast, South Central and West states

It is also clear that in the US the prices of organic avocados go up between June and October every year. However, 2017 was the year in which prices peaked for both types (organic and conventional).

In [None]:
# Let's work now at city level.
# `these` is rebound here: from this cell on it holds the city names,
# no longer the sub-totals.
these = [
    'Albany', 'Atlanta', 'Baltimore/Washington', 'Boise', 'Boston',
    'Buffalo/Rochester', 'Charlotte', 'Chicago', 'Cincinnati/Dayton',
    'Columbus', 'Dallas/Ft. Worth', 'Denver', 'Harrisburg/Scranton',
    'Hartford/Springfield', 'Houston', 'Indianapolis', 'Jacksonville',
    'Las Vegas', 'Los Angeles', 'Louisville', 'Miami/Ft. Lauderdale',
    'Nashville', 'New Orleans/Mobile', 'New York', 'Northern New England',
    'Orlando', 'Philadelphia', 'Phoenix/Tucson', 'Pittsburgh', 'Portland',
    'Raleigh/Greensboro', 'Richmond/Norfolk', 'Roanoke', 'Sacramento',
    'San Diego', 'San Francisco', 'Seattle', 'Spokane', 'St. Louis',
    'Syracuse', 'Tampa',
]

# Sanity check: which of the requested names actually occur in the data
is_city = df['geography'].isin(these)
df.loc[is_city, 'geography'].unique()

In [None]:
# Keep only the rows whose geography is listed in `these`, as a new dataframe
city_mask = df['geography'].isin(these)
dfc = df[city_mask]
dfc.head()

In [None]:
# Set the style
sns.set_style('whitegrid')

# Describe the average_price per month per geography/city and type.
# One figure per city: each boxplot is drawn from that city's rows only.
# Passing the whole of `dfc` would render the same pooled figure under
# every title.
for g in dfc['geography'].unique():
    rows_for_g = dfc[dfc['geography'] == g]
    # Prepare the canvas size
    plt.figure(figsize=(12,8))
    plt.title('average_price per month in {}'.format(g))
    sns.boxplot(x='date_month', y='average_price', hue='type', data=rows_for_g, palette='pastel')
    plt.show()

In [None]:
# Describe the average_price per year per geography/city and type.
# One figure per city: each boxplot is drawn from that city's rows only.
# Passing the whole of `dfc` would render the same pooled figure under
# every title.
for g in dfc['geography'].unique():
    rows_for_g = dfc[dfc['geography'] == g]
    # Prepare the canvas size
    plt.figure(figsize=(12,8))
    plt.title('average_price per year in {}'.format(g))
    sns.boxplot(x='year', y='average_price', hue='type', data=rows_for_g, palette='pastel')
    plt.show()

In [None]:
# Yearly volume totals per city and type.
# The columns to aggregate are passed as a list: selecting several columns
# from a groupby with a bare tuple raises in pandas >= 2.0. The grouping key
# 'geography' is left out of the selection, because it already comes back
# through reset_index() and must not be summed.
volume_columns = ['total_volume', 'small', 'large', 'xlarge']
dfyc = dfc.groupby(['year', 'geography', 'type'])[volume_columns].sum().reset_index()
dfyc

In [None]:
# Yearly volume of small avocados per city, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='small', col='type', hue='year', data=dfyc, kind='bar', height=5, aspect=2).set_xticklabels(rotation=90)

In [None]:
# Yearly volume of large avocados per city, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='large', col='type', hue='year', data=dfyc, kind='bar', height=5, aspect=2).set_xticklabels(rotation=90)

In [None]:
# Yearly volume of extra-large avocados per city, one panel per type.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn.
sns.catplot(x='geography', y='xlarge', col='type', hue='year', data=dfyc, kind='bar', height=5, aspect=2).set_xticklabels(rotation=90)

There are several conclusions, but to me the most relevant are:
* The shift in Los Angeles from small and large avocados towards extra-large ones across the years.
* The shift in Chicago from large and extra-large avocados towards small ones across the years.

It would be interesting to know why that happened; I guess it could be associated with population characteristics or migration between cities.

In [None]:
# Stacked bar chart of the 'median' weekly consumption per city across all
# observations, split by avocado size.
median_by_city = dfc.pivot_table(
    index='geography',
    values=['small', 'large', 'xlarge'],
    aggfunc='median',
)
median_by_city.plot.bar(figsize=(20,8), title='median weekly consumption per city', stacked=True)

In [None]:
# Table of the mean average_price: one row per (city, type), one column per year
dfc.pivot_table(
    index=['geography', 'type'],
    columns='year',
    values=['average_price'],
    aggfunc=[np.mean],
)

Since the organic type does not sell well let's try to simplify the graph to get a glimpse of a heatmap of prices across years by city

In [None]:
# Mean average_price per city (rows) and year (columns), both types pooled
pt = dfc.pivot_table(
    index=['geography'],
    columns='year',
    values=['average_price'],
    aggfunc=[np.mean],
)
pt

In [None]:
# Heatmap of the city-by-year mean price table `pt`
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(pt, cmap="coolwarm", ax=ax).set_xlabel('average_price')

In [None]:
# Mean average_price per city over all observations (one row per city)
a = dfc.pivot_table(
    index=['geography'],
    values=['average_price'],
    aggfunc=[np.mean],
)
a

In [None]:
# Index label (geography) of the row with the lowest mean average_price in `a`.
a.idxmin()

In [None]:
# Index label (geography) of the row with the highest mean average_price in `a`.
a.idxmax()

Across the years one might encounter the best prices for avocados in 'Houston' and the worst prices in 'Hartford/Springfield' ('San Francisco' and 'New York' follow)

In [None]:
# Let's repeat the graph but for the states grouping (`dfg`).
# Only the heatmap is rendered by this cell; put `ptg` last in a cell of its
# own to inspect the table itself.
ptg = dfg.pivot_table(
    index=['geography'],
    columns='year',
    values=['average_price'],
    aggfunc=[np.mean],
)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(ptg, cmap="coolwarm", ax=ax).set_xlabel('average_price')

In [None]:
# Mean average_price per geography in `dfg` over all observations
a = dfg.pivot_table(
    index=['geography'],
    values=['average_price'],
    aggfunc=[np.mean],
)
a

In [None]:
# Index label (geography) of the row with the lowest mean average_price in `a`.
a.idxmin()

In [None]:
# Index label (geography) of the row with the highest mean average_price in `a`.
a.idxmax()

Across the years one might encounter the best prices for Avocados in 'South Central' and the worst prices in 'Northeast' ('Grand Rapids' and 'Plains' follow)