In [None]:
import numpy as np 
import pandas as pd 
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")

# Grocery Expenses Analysis

Here I'm going to analyze all the data that I've collected about my family's grocery expenses.

In [None]:
# Read the grocery CSV (semicolon-separated, cp1252-encoded) into the working frame.
filepath = '../input/grocery-exprenses/spesa_dataset.csv'
df = pd.read_csv(filepath, sep=";", encoding="cp1252")
# A negative count makes head() return every row except the last ten.
df.head(-10)

## Preliminary Pre-processing

First of all, let's check if there are any inconsistencies in the dataset. We'll do that by checking the types of the dataframe columns.

In [None]:
# Show the dtype pandas assigned to each column when reading the CSV.
df.dtypes

In [None]:
# FIXME: temporary patch for a malformed entry in the raw data; remove once the source file is corrected.
# Select the rows whose date has the supermarket name fused into it. The result is
# neither assigned nor the last expression of the cell, so it changes and displays nothing.
df.loc[df['giorno'] == 'Lidl24/08/2020']
# Overwrite the date of the row labelled 907 with a plain day/month/year string.
df.loc[907, 'giorno'] = '24/08/2020'

# Set the supermarket of the row labelled 906.
# NOTE(review): this targets label 906 while the date fix above targets 907 — confirm both labels are intended.
df.loc[906, 'supermercato'] = 'Lidl'

### Parsing dates

Since 'giorno' is Italian for 'day', let's cast the 'giorno' column to datetime. This will allow us to group our data using this column.

In [None]:
# The dates are written day-first (e.g. '24/08/2020'), so say so explicitly:
# without dayfirst=True an ambiguous value such as '03/04/2020' can be read
# month-first, silently putting purchases into the wrong week and month.
df['giorno'] = pd.to_datetime(df['giorno'], dayfirst=True)

print(df.giorno.dtypes)

### Clean inconsistent text entries

Now let's see whether the text values contain any errors.

We're going to look at the supermarket feature first.

In [None]:
# Distinct supermarket names in sorted order, to spot near-duplicates by eye.
supermarkets = np.sort(df['supermercato'].unique())
supermarkets

Looking at the previous result, we can see an inconsistent data entry: 'Lidl' and 'Lidl ', caused by an additional trailing space in the second entry.
So by just removing the white space, everything should be fine.

In [None]:
# Trim leading/trailing whitespace so that entries such as 'Lidl ' and 'Lidl' become the same value.
df['supermercato'] = df['supermercato'].str.strip()

Now we'll go through the same process for the 'tipo' column ('tipo' means 'type' in English). But before doing that, let's see if there are any missing values.

In [None]:
# Number of rows whose 'tipo' is missing.
missing_types = df['tipo'].isna().sum()
missing_types

By default, I give the type 'none' to the grocery items that I don't know how to categorize.

In [None]:
# Rows with a missing type get the placeholder category 'none'.
df['tipo'] = df['tipo'].fillna('none')

Now that we have no more missing values, let's list all the unique values in the type column in order to look for inconsistencies.

In [None]:
# Distinct item types in sorted order, so similar spellings end up next to each other.
types = np.sort(df['tipo'].unique())
types

Looking at the types list, we can see that there are a lot of inconsistencies, for example 'frutta secca' and 'fruttasecca', or 'dolce' and 'dolci', and many more.

Here I'm going to make a list of the types that have inconsistent variants, and I'll use the fuzzywuzzy package to help me identify which strings are closest to each other.

In [None]:
import chardet  # imported by the original cell; no cell visible here uses it
import fuzzywuzzy
# Import `fuzz` explicitly: the scorer below needs it, and relying on
# `fuzzywuzzy.fuzz` being set as a side effect of importing `process` is fragile.
from fuzzywuzzy import fuzz, process

# Reference spellings of the types that show up with near-duplicate variants.
incostencies = ["frutta secca", "passata pomodoro", "bevande","dolce","integratore","briosche","aceto","borsa spesa","gnocchi","crackers"]

# One list of scored candidates per reference spelling, in the same order as above.
matches_list = []

for el in incostencies:
    # Up to ten entries of `types` closest to `el`, ranked by token-sort ratio.
    matches = process.extract(el, types, limit=10, scorer=fuzz.token_sort_ratio)
    print(matches, end='\n\n')
    matches_list.append(matches)

What I'm interested in here is replacing every occurrence of the second match with the first match, so let's write a function to help us do that. We'll call it *replace_second_match()*.

In [None]:
def replace_second_match(df:pd.DataFrame, column:str, matches:list, min_ratio:int=0) -> None:
    """Rewrite the runner-up match to the best match in ``df[column]``, in place.

    Args:
        df: Frame to modify.
        column: Name of the column holding the values to normalise.
        matches: Scored candidates ordered best first; each entry is indexed as
            ``entry[0]`` (the candidate value) and ``entry[1]`` (its score).
        min_ratio: Minimum score the runner-up must reach to be replaced.
            The default of 0 replaces it unconditionally.

    Returns:
        None. ``df`` is left untouched when ``matches`` holds fewer than two
        entries or when the runner-up scores below ``min_ratio``.
    """
    # Nothing to merge without both a best match and a runner-up.
    if len(matches) < 2:
        return

    best_value = matches[0][0]
    runner_up_value, runner_up_score = matches[1][0], matches[1][1]

    # Skip weak look-alikes so unrelated categories are not merged by accident.
    if runner_up_score < min_ratio:
        return

    row_with_matches = df[column].isin([runner_up_value])
    df.loc[row_with_matches, column] = best_value
    
# Run the helper once per collected list of fuzzy matches, editing 'tipo' in place.
for type_matches in matches_list:
    replace_second_match(df, 'tipo', type_matches)

In [None]:
# Re-list the distinct types in sorted order to check the effect of the replacements.
types = np.sort(df['tipo'].unique())
types

As we can see, the list now looks better, even though we still have to make some adjustments manually, because some types are synonyms of, or are included in, other types but don't look similar.

In [None]:
# Synonyms that string similarity cannot catch, mapped to the type they belong to.
manual_type_fixes = {
    'arachidi': 'frutta secca',
    'bibite': 'bevande',
    'gnochetti': 'gnocchi',
}

# Apply the fixes one after another, in the order listed above.
for synonym, target_type in manual_type_fixes.items():
    row_with_matches = df['tipo'].isin([synonym])
    df.loc[row_with_matches, 'tipo'] = target_type

## Basic Statistics About Each Feature of the Dataset

After cleaning the inconsistencies, let's take a look at the basic stats for each feature of the dataset.

We'll start by looking at the numerical features and then we'll look at the categorical features.

### Numerical Features

In [None]:
# Summary statistics restricted to the numeric columns of the frame.
df.describe(include=np.number)

### Categorical Features

In [None]:
# Most frequent item name(s) in the dataset; mode() returns every value tied for the top count.
df['nome'].mode()

In [None]:
# Most frequent item type(s) in the dataset; mode() returns every value tied for the top count.
df['tipo'].mode()

In [None]:
# Most frequent supermarket(s) in the dataset; mode() returns every value tied for the top count.
df['supermercato'].mode()

## Most bought items

In [None]:
def fromSeriesToLists(pd_serie, threshold=0):
    """Split a Series into parallel lists of labels and values above a cutoff.

    Only the entries whose value is strictly greater than ``threshold`` are
    kept, in the order the Series yields them.

    Returns:
        A ``(keys, values)`` tuple of two lists of equal length.
    """
    kept = [(label, amount) for label, amount in pd_serie.items() if amount > threshold]
    keys = [label for label, _ in kept]
    values = [amount for _, amount in kept]
    return keys, values

In [None]:
# Count how often each item name appears and keep those bought more than 4 times.
most_freq_items = df['nome'].value_counts()
names, values = fromSeriesToLists(most_freq_items, 4)

freq_items_df = pd.DataFrame({'Names': names, 'Values': values})

# Bar chart of the purchase counts; tick labels are tilted so long names stay readable.
sns.barplot(data=freq_items_df, x='Names', y='Values')
plt.xticks(rotation=70)

## Most Frequent Grocery Item Types

In [None]:
# Count how often each item type appears and keep those seen more than 15 times.
most_freq_types = df['tipo'].value_counts()
names, values = fromSeriesToLists(most_freq_types, 15)

freq_types_df = pd.DataFrame({'Names': names, 'Values': values})

# Bar chart of the type counts with tilted tick labels.
sns.barplot(data=freq_types_df, x='Names', y='Values')
plt.xticks(rotation=70)

## Supermarket Frequencies

In [None]:
# Count the rows recorded for each supermarket; the default threshold of 0 keeps them all.
most_freq_super = df['supermercato'].value_counts()
names, values = fromSeriesToLists(most_freq_super)

freq_super_df = pd.DataFrame({'Names': names, 'Values': values})

# Bar chart of the per-supermarket counts with tilted tick labels.
sns.barplot(data=freq_super_df, x='Names', y='Values')
plt.xticks(rotation=70)

### Price Analysis

- price distribution: first of all, let's visualize how the prices are distributed in the dataset

In [None]:
# Histogram of the item prices; kde=False leaves out the density curve.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
sns.distplot(a=df['prezzo'], kde=False)

- how much we spend per type (the most frequent type)

In [None]:
# tot = df.groupby(['tipo']).sum()
# freq_type_name = freq_types_df.Names

# tot.reset_index(drop=True, inplace=True)

# # tot = tot[tot['tipo'].isin(freq_type_name)]

# tot
# # sns.barplot(x=tot[tot.isin(freq_type_name)], y=tot[tot['prezzo'].isin(freq_type_name)])
# # plt.xticks(rotation=70)

- total expenditure

In [None]:
# Running total: sum of the 'prezzo' column over every row currently in the frame.
df['prezzo'].sum()

## Weekly stats

The weekly statistics include how much we spend per week, the average amount we spend per day, and the mean price per item together with its standard deviation.

In [None]:
# Group purchases by calendar week. The year is part of the key so that the same
# week number in two different years is not merged into a single bucket.
# 'giorno' is already datetime here (converted in the date-parsing cell), so it
# is not parsed a second time.
weekly_gr = df.groupby(df.giorno.dt.strftime('%Y-%W'))

weekly = {
    'week':[],
    'weekly_shopping': [], 
    'amount_per_day': [], 
    'price_mean': [],
    'price_std': [],
    'most_freq_item': [],
    'most_freq_type':[]
}

# Iterating the groupby only yields weeks that contain at least one purchase,
# so no emptiness check is needed inside the loop.
for name, group in weekly_gr:
    tot = group.prezzo.sum()
    tot_per_day = tot / 7  # weekly total spread evenly over seven days
    mean = group.prezzo.mean()
    std = group.prezzo.std()
    weekly['week'].append(name)
    weekly['weekly_shopping'].append(tot)
    weekly['amount_per_day'].append(tot_per_day)
    weekly['price_mean'].append(mean)
    weekly['price_std'].append(std)
    weekly['most_freq_item'].append(group.nome.value_counts().idxmax())
    weekly['most_freq_type'].append(group.tipo.value_counts().idxmax())

weekly_df = pd.DataFrame(weekly)
weekly_df

### Stats about the weekly expenditure

In [None]:
# Summary statistics of the numeric weekly columns.
weekly_df.describe(include=np.number)

### Distribution of the weekly expenses and daily amount 
Now let's see the distribution of the weekly shopping amount and of the weekly expense amount per day.

In [None]:
# Two side-by-side axes: weekly totals on the left, weekly per-day amounts on the right.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
fig, axs = plt.subplots(ncols=2)
sns.distplot(a=weekly_df['weekly_shopping'], kde=False, ax=axs[0])
# The per-day histogram uses a fixed 5 bins.
sns.distplot(a=weekly_df['amount_per_day'], kde=False,bins=5, ax=axs[1])

# Monthly Stats

Same parameters as the weekly stats, but computed per month.

In [None]:
# Group purchases by calendar month. The year is part of the key so that the same
# month of two different years is not merged into a single bucket.
monthly_gr = df.groupby(df.giorno.dt.strftime('%Y-%m'))

monthly = {
    'month':[],
    'monthly_shopping': [], 
    'amount_per_week': [], 
    'price_mean': [],
    'price_std': [],
    'most_freq_item': [],
    'most_freq_type':[]
}

# Iterating the groupby only yields months that contain at least one purchase,
# so no emptiness check is needed inside the loop.
for name, group in monthly_gr:
    tot = group.prezzo.sum()
    tot_per_week = tot / 4  # rough weekly figure: a month is treated as 4 weeks
    mean = group.prezzo.mean()
    std = group.prezzo.std()
    monthly['month'].append(name)
    monthly['monthly_shopping'].append(tot)
    monthly['amount_per_week'].append(tot_per_week)
    monthly['price_mean'].append(mean)
    monthly['price_std'].append(std)
    # mode() may return several tied values; [0] takes the first one.
    monthly['most_freq_item'].append(group.nome.mode()[0])
    monthly['most_freq_type'].append(group.tipo.mode()[0])

monthly_df = pd.DataFrame(monthly)
monthly_df

### Stats about the monthly expenditure

In [None]:
# Summary statistics of the numeric monthly columns.
monthly_df.describe(include=np.number)

### Distribution of monthly expenses

In [None]:
# Histogram of the monthly totals; kde=False leaves out the density curve.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
sns.distplot(a=monthly_df['monthly_shopping'], kde=False)

# Yearly stats

In [None]:
# Group purchases by calendar year.
yearly_gr = df.groupby(df.giorno.dt.strftime('%Y'))

yearly = {
    'year':[],
    'yearly_shopping': [], 
    'amount_per_month': [], 
    'price_mean': [],
    'price_std': [],
    'most_freq_item': [],
    'most_freq_type':[]
}

# Iterating the groupby only yields years that contain at least one purchase,
# so no emptiness check is needed inside the loop.
for name, group in yearly_gr:
    tot = group.prezzo.sum()
    # A year has 12 months; the previous divisor of 4 was carried over from the
    # monthly cell and inflated this column threefold.
    # NOTE(review): a year that is only partly covered by the data is still divided by 12.
    tot_per_month = tot / 12
    mean = group.prezzo.mean()
    std = group.prezzo.std()
    yearly['year'].append(name)
    yearly['yearly_shopping'].append(tot)
    yearly['amount_per_month'].append(tot_per_month)
    yearly['price_mean'].append(mean)
    yearly['price_std'].append(std)
    # mode() may return several tied values; [0] takes the first one.
    yearly['most_freq_item'].append(group.nome.mode()[0])
    yearly['most_freq_type'].append(group.tipo.mode()[0])

yearly_df = pd.DataFrame(yearly)
yearly_df