In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import math

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Location of the dataset inside the Kaggle input directory.
AVOCADO_CSV = '/kaggle/input/avocado-prices-2020/avocado-updated-2020.csv'

# Load the CSV into a DataFrame and preview the first rows.
avocado = pd.read_csv(AVOCADO_CSV)
avocado.head()

In [None]:
# True if any cell in the frame is missing (null / NaN)
avocado.isna().to_numpy().any()

In [None]:
# True if any row is an exact duplicate of an earlier row
avocado.duplicated().any()

In [None]:
# Dimensions of the loaded data as (rows, columns)
avocado.shape

In [None]:
# Number of distinct values in the 'geography' column
avocado['geography'].nunique()

In [None]:
# List the distinct 'geography' values themselves
avocado['geography'].unique()

In [None]:
# Filter out the subtotal rows (Total U.S., West, Midsouth, Northeast,
# South Central and Southeast) so only the remaining locations are analysed.
SUBTOTAL_REGIONS = [
    'Total U.S.',
    'Northeast',
    'Midsouth',
    'South Central',
    'Southeast',
    'West',
]

# One boolean mask replaces six chained filters. .copy() detaches the result
# from the original frame, so adding or replacing columns on it later does not
# trigger pandas' SettingWithCopyWarning.
avocado = avocado.loc[~avocado['geography'].isin(SUBTOTAL_REGIONS)].copy()

In [None]:
# Dimensions of the data after removing the subtotal rows
avocado.shape

In [None]:
# Count the distinct locations left after the subtotal filter
# (the author's run reported 48 unique locations)
avocado['geography'].nunique()

In [None]:
# Check the data type of every column
avocado.dtypes

#### From the output above, it's clear the date column has the wrong data type (object instead of datetime), so let's correct this.

In [None]:
# Convert the 'date' column from object to datetime.
# assign() builds a new frame rather than writing a column into a frame that
# may be a filtered slice, which avoids pandas' SettingWithCopyWarning.
avocado = avocado.assign(date=pd.to_datetime(avocado['date']))


In [None]:
# Re-check the dtypes to confirm the 'date' conversion took effect
avocado.dtypes

In [None]:
# The '4046', '4225' and '4770' columns are not used in this EDA, so drop them.
# Reassigning (instead of inplace=True) keeps the step free of in-place
# mutation, and errors='ignore' lets the cell be re-run after the columns
# are already gone without raising KeyError.
avocado = avocado.drop(columns=['4046', '4225', '4770'], errors='ignore')

In [None]:
# Preview two rows to confirm the dropped columns are gone
avocado.head(2)

In [None]:
# Descriptive statistics; rounding to 0 decimals is used here to suppress
# scientific notation in the displayed table.
summary_stats = avocado.describe()
summary_stats.round(0)

### Some basic insights here
We can immediately see that the data covers 2015-2020,
the lowest average price is about 1, and the highest is about 3 (we'll visualize this later).

In [None]:
# Which locations had the highest / lowest total volume (sorted largest first).
# 'total_volume' is selected BEFORE aggregating: summing the whole frame also
# tries to sum non-numeric columns (e.g. the datetime 'date' column), which
# raises TypeError on pandas >= 2.0.
(
    avocado.groupby('geography')['total_volume']
    .sum()
    .astype(int)
    .sort_values(ascending=False)
)

In [None]:
# Ten locations with the highest total volume (keep='all' retains ties at the
# cut-off). The column is selected before sum() so non-numeric columns such as
# the datetime 'date' are never aggregated (that raises on pandas >= 2.0).
top_10 = (
    avocado.groupby('geography')[['total_volume']]
    .sum()
    .astype('int')
    .nlargest(10, 'total_volume', keep='all')
)
top_10

In [None]:
# Ten locations with the lowest total volume (keep='all' retains ties at the
# cut-off). The column is selected before sum() so non-numeric columns such as
# the datetime 'date' are never aggregated (that raises on pandas >= 2.0).
lowest_10 = (
    avocado.groupby('geography')[['total_volume']]
    .sum()
    .astype('int')
    .nsmallest(10, 'total_volume', keep='all')
)
lowest_10

#### Why do some cities have large sales, and others don't? There are many ways we can check this, one way is to look at population data, but since we don't have that and it's not within the scope of this EDA, I'll look at the average prices by geography...

In [None]:
# Average price by location, sorted highest to lowest.
# 'average_price' is selected before mean(): averaging the whole frame would
# include the object-dtype 'type' column, which raises TypeError on
# pandas >= 2.0.
avocado.groupby('geography')['average_price'].mean().sort_values(ascending=False)

#### Looking at this, some cities with low volumes have high prices!

In [None]:
# Plot each row's total volume against its average price to check whether the
# two are related. Axis labels and a title make the figure readable on its own.
fig, ax = plt.subplots()
ax.scatter(avocado['total_volume'], avocado['average_price'])
ax.set_xlabel('Total Volume')
ax.set_ylabel('Average Price')
ax.set_title('Total Volume vs. Average Price')
plt.show()

This doesn't tell us much, and a scatter plot may not be a very suitable chart in this context.

In [None]:
sns.set_style("darkgrid")

# Average-price series for each avocado type.
conventional_prices = avocado.loc[avocado['type'] == 'conventional', 'average_price']
organic_prices = avocado.loc[avocado['type'] == 'organic', 'average_price']

# Overlay both density histograms (with KDE curves) on a single axis.
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle('Avocado Average Prices Distribution', fontsize=20)
ax1 = sns.histplot(
    conventional_prices,
    ax=ax,
    color='y',
    label='Conventional',
    kde=True,
    stat="density",
    linewidth=0,
)
ax2 = sns.histplot(
    organic_prices,
    ax=ax,
    color='g',
    label='Organic',
    kde=True,
    stat="density",
    linewidth=0,
)
ax.set_xlabel('Average Price')
ax.legend(title_fontsize=12)
plt.show()

####  Above graph gives us some insights.

- The highest price of conventional avocados is around 2.
- The highest price of organic avocados is around 3.
- Combining both types of avocados puts the average price at about 1.4.
- More conventional types were sold.

In [None]:
# Yearly average prices by avocado type (ci=None turns off the error band).
# The axis is passed explicitly, and the chart gets a title and readable axis
# labels so it matches the monthly trend chart.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=avocado, x="year", y="average_price", hue="type", ci=None, ax=ax)
ax.set_title('Trend of Average Price by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Price')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

We can see a pattern, but let's view this data in a different way, hence, let's view by month.

This will give us a more detailed chart, and clearer insight..

In [None]:
# Average price over time, one line per avocado type (ci=None: no error band).
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(x="date", y="average_price", hue="type", data=avocado, ci=None, ax=ax)
ax.set(
    title='Trend of Average Price by Month',
    xlabel='Date',
    ylabel='Average Price',
)
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()

Some patterns emerge, but let's drill down some more and view the average price by month, with one line for each year.

In [None]:
# To compare months across years we need a 'month' column: the zero-padded
# month number ('01'..'12') of each row's date, stored as a string.
# The vectorised .dt accessor replaces a per-row Python lambda.
avocado['month'] = avocado['date'].dt.strftime('%m')

In [None]:
# Preview three rows to confirm the new 'month' column was added
avocado.head(3)

In [None]:
# For each year: keep the month and average price, group by month, then average.

def monthly_average_price(frame, year):
    """Return the mean 'average_price' per month for one year, rounded to 2 decimals.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'year', 'month' and 'average_price' columns.
    year : int
        Value matched against the 'year' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'month', with a single 'average_price' column.
    """
    rows_for_year = frame.loc[frame['year'] == year, ['month', 'average_price']]
    return rows_for_year.groupby('month').mean().round(2)


# One frame per year; these names are used by the plotting cell that follows.
pp_2015 = monthly_average_price(avocado, 2015)
pp_2016 = monthly_average_price(avocado, 2016)
pp_2017 = monthly_average_price(avocado, 2017)
pp_2018 = monthly_average_price(avocado, 2018)
pp_2019 = monthly_average_price(avocado, 2019)
pp_2020 = monthly_average_price(avocado, 2020)

In [None]:
# Visualization of the monthly average avocado price, one line per year.
fig, ax = plt.subplots(figsize=(10, 6))

# Per-year monthly averages computed in the previous cell, keyed by year.
monthly_prices_by_year = {
    2015: pp_2015,
    2016: pp_2016,
    2017: pp_2017,
    2018: pp_2018,
    2019: pp_2019,
    2020: pp_2020,
}

# Draw everything through the explicit axis instead of mixing plt.* and ax.* calls.
for year, monthly_prices in monthly_prices_by_year.items():
    ax.plot(monthly_prices, label=f'{year} Avocado Price')

ax.set_xlabel("Months")
ax.set_ylabel("Avocado AVG Price")  # label typo ("Avacodo") corrected
ax.set_title("Monthly Average Price per Unit of Avocado.")

ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

We can make the following observations:

Average prices usually dropped in February, for every year.

Average prices peaked in July every year, except in year 2020.

Average prices also tend to drop at the end of the year, except in 2018.

From April, prices dropped steadily in 2020;
This could be attributed to the impact of COVID19 lockdowns.

It will be interesting to see if this trend continues in future years!

### Conclusion

This basic exploratory data analysis revealed a few things;

Unique locations.

Volume by location.

Locations with highest & lowest volumes.

Average prices (overall and by type).

Spread of the average price.

Monthly trend of average prices according to each year, etc.


##### However, there's still a lot more insights we can get from this data, such as;

Insights from sub regions/sub totals which I filtered out,

Advanced time series analysis, and,

Predictive analysis to know what the prices could be like in future.

I will be addressing these in subsequent projects. Thanks for your time.