<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
from datetime import datetime
from plotly.offline import init_notebook_mode, iplot
import plotly.io as pio
import plotly.express as px

# Initialise plotly's offline notebook mode so figures render inline.
init_notebook_mode(connected=True)

# Every figure created below defaults to the 'simple_white' template.
pio.templates.default = 'simple_white'

In [2]:
# Load the avocado dataset and prepare it in one pass:
#   1. drop the 'Unnamed: 0' column when the file contains it,
#   2. parse `Date` (ISO 'YYYY-MM-DD' strings) into datetimes,
#   3. order the rows chronologically and renumber the index from 0.
df = (
    pd.read_csv('app/static/data/avocado.csv')
    .drop(columns='Unnamed: 0', errors='ignore')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'))
    .sort_values('Date', ascending=True)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-01-04,1.75,27365.89,9307.34,3844.81,615.28,13598.46,13061.1,537.36,0.0,organic,2015,Southeast
1,2015-01-04,1.49,17723.17,1189.35,15628.27,0.0,905.55,905.55,0.0,0.0,organic,2015,Chicago
2,2015-01-04,1.68,2896.72,161.68,206.96,0.0,2528.08,2528.08,0.0,0.0,organic,2015,HarrisburgScranton
3,2015-01-04,1.52,54956.8,3013.04,35456.88,1561.7,14925.18,11264.8,3660.38,0.0,conventional,2015,Pittsburgh
4,2015-01-04,1.64,1505.12,1.27,1129.5,0.0,374.35,186.67,187.68,0.0,organic,2015,Boise


In [3]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          18249 non-null  object 
 1   AveragePrice  18249 non-null  float64
 2   Total Volume  18249 non-null  float64
 3   4046          18249 non-null  float64
 4   4225          18249 non-null  float64
 5   4770          18249 non-null  float64
 6   Total Bags    18249 non-null  float64
 7   Small Bags    18249 non-null  float64
 8   Large Bags    18249 non-null  float64
 9   XLarge Bags   18249 non-null  float64
 10  type          18249 non-null  object 
 11  year          18249 non-null  int64  
 12  region        18249 non-null  object 
dtypes: float64(9), int64(1), object(3)
memory usage: 1.8+ MB


In [4]:
# Date format: re-inspect the schema to check the dtype of the `Date` column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          18249 non-null  datetime64[ns]
 1   AveragePrice  18249 non-null  float64       
 2   Total Volume  18249 non-null  float64       
 3   4046          18249 non-null  float64       
 4   4225          18249 non-null  float64       
 5   4770          18249 non-null  float64       
 6   Total Bags    18249 non-null  float64       
 7   Small Bags    18249 non-null  float64       
 8   Large Bags    18249 non-null  float64       
 9   XLarge Bags   18249 non-null  float64       
 10  type          18249 non-null  object        
 11  year          18249 non-null  int64         
 12  region        18249 non-null  object        
dtypes: datetime64[ns](1), float64(9), int64(1), object(2)
memory usage: 1.8+ MB


In [5]:
# Filters: keep conventional avocados from the first three regions (in order
# of appearance in `df`) whose date lies between 2015-01-01 and 2016-01-01
# (`between` includes both endpoints by default).
selected_regions = df.region.unique()[:3]

filter_region = df['region'].isin(selected_regions)
filter_type = df['type'].eq('conventional')
filter_date = df['Date'].between(datetime(2015, 1, 1), datetime(2016, 1, 1))

df_line = df.loc[filter_region & filter_type & filter_date]

In [6]:
# Preview the first rows of the filtered frame used for the line chart.
df_line.head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
40,2015-01-04,1.11,783068.03,30270.26,550752.19,124506.1,77539.48,72888.46,4651.02,0.0,conventional,2015,Chicago
58,2015-01-04,0.98,3204112.16,2296069.27,320373.63,18938.42,568730.84,287820.14,280910.7,0.0,conventional,2015,Southeast
104,2015-01-04,1.05,203939.14,32679.84,121020.54,1286.19,48952.57,47583.64,1368.93,0.0,conventional,2015,HarrisburgScranton
112,2015-01-11,1.18,2433295.61,1667012.92,269027.02,21918.53,475337.14,260164.0,215173.14,0.0,conventional,2015,Southeast
175,2015-01-11,1.26,178735.25,34535.57,78778.96,582.3,64838.42,63840.09,998.33,0.0,conventional,2015,HarrisburgScranton


In [7]:
# One line per region showing the average price over time.
# Both axis titles are blanked out; the chart title carries the meaning.
axis_labels = {'AveragePrice': '', 'Date': ''}
fig = px.line(
    df_line,
    x='Date',
    y='AveragePrice',
    color='region',
    title='Average Price of Avocados per Region',
    labels=axis_labels,
)

# Show prices as dollars with two decimals, both in the hover box and on the y axis.
fig.update_traces(hovertemplate='$%{y:.2f}')
fig.update_yaxes(tickprefix='$')

# Thin grey vertical spike line plus a single hover box shared by all traces at the same x.
fig.update_xaxes(
    showspikes=True,
    spikemode='across',
    spikethickness=1,
    spikecolor='#999999',
)
fig.update_layout(
    hovermode='x unified',
    hoverdistance=200,
    spikedistance=100,
)

fig.show()


In [8]:
# Row selection: conventional avocados from the first three regions (in order
# of appearance in `df`) dated between 2015-01-01 and 2016-01-01, inclusive.
selected_regions = df.region.unique()[:3]
filter_region = df['region'].isin(selected_regions)
filter_type = df['type'].eq('conventional')
filter_date = df['Date'].between(datetime(2015, 1, 1), datetime(2016, 1, 1))

# Keep only the columns needed for the chart, then total the volume per region.
columns = ['region', 'Total Volume']
df_donut = (
    df.loc[filter_region & filter_type & filter_date, columns]
    .groupby('region')
    .sum()
    .reset_index()
)
df_donut.head()

Unnamed: 0,region,Total Volume
0,Chicago,40321610.0
1,HarrisburgScranton,11485390.0
2,Southeast,158471500.0


In [9]:
# Donut chart (a pie with a 50% hole): each region's share of the total volume.
fig = px.pie(
    df_donut,
    values='Total Volume',
    names='region',
    title='Total Volume per Region',
    hole=0.5,
)
fig.show()