# Pandas and ggplot/plotnine for Visualization

In [1]:
# Import all of the needed libraries
import pandas as pd
import matplotlib.pyplot as plt
# The star import brings plotnine's public names (ggplot, aes, geom_point, ...)
# into the notebook namespace; the plotting cells below rely on them.
from plotnine import *
import matplotlib as mpl
# Always draw an edge around patches (e.g. bars) so neighbouring ones stay distinguishable.
mpl.rcParams['patch.force_edgecolor'] = True
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
# Load the avocado CSV: the first column becomes the index and 'Date' is
# parsed into datetimes. Then preview the first rows.
data = pd.read_csv(
    "../input/avocado.csv",
    index_col=0,
    parse_dates=['Date'],
)
data.head()

In [3]:
# Inspect column dtypes and non-null counts: are any data points missing, and
# do any columns need to be manipulated, dropped, or transformed?
data.info()

In [4]:
# Does the data look evenly distributed? Count the rows per avocado type and
# the non-null 'Total Volume' entries per region.
type_counts = data['type'].value_counts()
print(type_counts)
rows_per_region = data.groupby('region')['Total Volume'].count()
print(rows_per_region)

In [5]:
# Count how many rows fall in each value of the 'year' column.
data['year'].value_counts()

#### Let's create new tables for US cities, regions, and the country itself

In [6]:
# Split the dataset into three tables by the granularity of the 'region'
# column: the national aggregate, the multi-state regions, and everything else
# (treated as city-level markets).
#
# The region names are listed once and reused for both filters so the two
# cannot drift apart. Previously 'California' was excluded from the cities
# table as a region but was missing from the regions list, so its rows ended
# up in no table at all.
# NOTE(review): other aggregate labels may exist in the data (e.g. 'Plains')
# — confirm against data['region'].unique().
region_names = ['GreatLakes', 'Southeast', 'Midsouth', 'Northeast',
                'SouthCentral', 'California', 'West',
                'WestTexNewMexico', 'NorthernNewEngland']
is_total = data['region'] == 'TotalUS'
is_region = data['region'].isin(region_names)

us_total = data.loc[is_total, :]
# select all of the regions
us_regions = data.loc[is_region, :]
# select everything that is neither the national total nor a listed region
us_cities = data.loc[~(is_total | is_region), :]

#### Some cities are missing from our dataset; therefore, the city-level values differ from the national (TotalUS) values

In [7]:
# Mean price per avocado type: the national aggregate first, then the
# city-level rows.
for table in (us_total, us_cities):
    print(table.groupby('type')['AveragePrice'].mean())

In [8]:
# Overall volume: the national aggregate versus the sum over the city-level rows.
national_volume = us_total['Total Volume'].sum()
city_volume = us_cities['Total Volume'].sum()
print(national_volume)
print(city_volume)

## Visualizations

### Which cities demand avocados the most, and do they pay higher prices than others?
### Does Total Volume have a negative relationship with AveragePrice?

In [9]:
# Mean volume and mean price of conventional avocados per city, sorted from
# the cheapest to the most expensive city.
# The two columns are selected with a list: indexing a groupby with a bare
# tuple (['Total Volume', 'AveragePrice']) was deprecated and raises on
# pandas >= 2.0.
conventional_cities = us_cities[us_cities['type'] == 'conventional']
tot_vol = (
    conventional_cities
    .groupby('region')[['Total Volume', 'AveragePrice']]
    .mean()
    .sort_values(by='AveragePrice')
)
# Log-scaled x axis so the price bars remain visible next to the much larger
# volume bars.
tot_vol.plot(kind='barh', figsize=(12,18), logx=True)

In [10]:
# Scatter of mean volume against mean price for conventional avocados, one
# point per city, with a log-scaled volume axis.
# DataFrame.plot creates its own figure (sized by `figsize`), so the earlier
# stand-alone plt.figure(...) call only produced an extra empty figure and is
# gone. The columns are selected with a list because tuple indexing on a
# groupby raises on pandas >= 2.0.
tot_vol = (
    us_cities[us_cities['type'] == 'conventional']
    .groupby('region')[['Total Volume', 'AveragePrice']]
    .mean()
    .sort_values(by='AveragePrice')
)
tot_vol.plot(kind='scatter', x='Total Volume', y='AveragePrice', figsize=(10,6), logx=True)

In [11]:
# National (TotalUS) series for conventional avocados, grouped by Date:
# summed volume on the top axes, mean price on the bottom axes.
fig, (ax1, ax2) = plt.subplots(2, 1)
fig.set_size_inches(14, 8)  # width, height
conventional_total = us_total[us_total['type']=='conventional']
by_date = conventional_total.groupby('Date')
by_date['Total Volume'].sum().plot(kind='line', ax=ax1, subplots=True)
by_date['AveragePrice'].mean().plot(kind='line', ax=ax2, subplots=True)

#### Finally, through this chart we can observe some relationship between volume and price (as the volume/supply of avocados shrinks, the price should go up, and vice versa)

In [12]:
# Seasonality of the national (TotalUS) rows, all avocado types together:
# volume summed per calendar month (top) and mean price per calendar month (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1)
fig.set_size_inches(14, 10)  # width, height
month_of_year = us_total['Date'].dt.month
monthly = us_total.groupby(month_of_year)
# NOTE(review): these y-limits are hard-coded and look tuned to one snapshot
# of the data — confirm they still fit if the input file changes.
volume_ylim = (308992617, 685625110)
price_ylim = (1, 1.6)
# NOTE(review): summing per calendar month favours months that are covered by
# more years if the date range is not a whole number of years — confirm.
monthly['Total Volume'].sum().plot(kind='bar', ax=ax1, ylim=volume_ylim, subplots=True)
monthly['AveragePrice'].mean().plot(kind='bar', ax=ax2, ylim=price_ylim, subplots=True)

#### Total Volume for avocados peaks in Feb, which also has the lowest average prices of the year, while Sep has the lowest volume and the highest price

In [13]:
import warnings

# Suppress every warning from here on so the plot output stays uncluttered.
warnings.filterwarnings('ignore')

# Price against volume (log-scaled x) for the regional rows, coloured and
# shaped by region, with one stacked panel per avocado type.
region_scatter = (
    ggplot(
        data=us_regions,
        mapping=aes(y='AveragePrice', x='Total Volume', color='region', shape='region'),
    )
    + geom_point(alpha=0.5)
    + scale_x_log10()
    + coord_fixed(ratio=3/4)
    + facet_wrap('~type', nrow=2, ncol=1)
    + theme_classic()
)
region_scatter

#### If you want to play with an interactive chart, use this one (it uses the `%matplotlib notebook` command)

In [14]:
import warnings
warnings.filterwarnings('ignore')
(
    ggplot(data=us_regions)
      + aes(y='AveragePrice', x='Total Volume')
      + aes(color='region', shape='region')
      + geom_point(alpha=0.5)
      + scale_x_log10()
      + coord_fixed(ratio=3/4)
      + facet_wrap('~type', nrow=2, ncol=1)
      + theme_classic()
)
%matplotlib notebook