In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import folium
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt # import matplotlib
%matplotlib inline
import seaborn as sns # seaborn data visualizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We begin by reading the CSV file into a pandas DataFrame.

In [None]:
# Load the Brooklyn food-waste CSV from the notebook's input directory into a DataFrame
food_waste = pd.read_csv('../input/brooklyn-food-waste/brooklyn.csv')

In [None]:
# Preview the first rows to check which columns and values were loaded
food_waste.head()

# Geographic Data Visualization #

I will use Python's folium library to show the total value of the food waste (in US dollars) found in each dumpster in Brooklyn.

In [None]:
# Total approximate dollar value for every distinct combination of dumpster
# coordinates and retailer information, with the group keys turned back
# into ordinary columns.
df = (
    food_waste
    .groupby(['collection_lat', 'collection_long', 'retailer_type', 'retailer_detail'])
    ['approximate_dollar_value']
    .sum()
    .reset_index()
)

In [None]:
# Keep only the columns needed for the map and give them shorter names
loc_data = df[
    ['collection_lat', 'collection_long', 'retailer_type', 'approximate_dollar_value']
].rename(
    columns={
        'collection_lat': 'latitude',
        'collection_long': 'longitude',
        'approximate_dollar_value': 'dollar_value_total',
    }
)

In [None]:
# Base map centred on approximate Brooklyn coordinates (latitude, longitude).
# The variable is called `brooklyn_map` rather than `map` so that Python's
# builtin `map` function is not shadowed for the rest of the notebook.
brooklyn_map = folium.Map(location=[40.6782, -73.99447], zoom_start=12)

# Feature group that collects the dumpster circles so they are added to the map as one layer
dumpsters = folium.map.FeatureGroup()

# Single pass over the locations: draw a circle for each dumpster and place a
# pin whose popup shows the retailer type and the dollar total (2 decimals).
for lat, lon, retailer, dollars in zip(
    loc_data.latitude,
    loc_data.longitude,
    loc_data.retailer_type,
    loc_data.dollar_value_total.round(2),
):
    dumpsters.add_child(
        folium.features.CircleMarker(
            [lat, lon],
            radius=8,
            color='blue',
            fill=True,
            fill_color='red',
            fill_opacity=0.7,
        )
    )
    # The popup is given as a formatted string; passing a list would be
    # rendered as its Python repr, e.g. "['grocery', 12.5]".
    folium.Marker(
        [lat, lon],
        popup=f"{retailer}: ${dollars:,.2f}",
    ).add_to(brooklyn_map)

# Add the circle layer; as the cell's last expression this also displays the map
brooklyn_map.add_child(dumpsters)

# Data Visualization #

Let's try to identify some unique values in the 'label_language' column:

In [None]:
# Count how many rows carry each distinct label_language value
food_waste['label_language'].value_counts()

In [None]:
# Count how many rows belong to each retailer_type
food_waste['retailer_type'].value_counts()

Now I will convert the 'date_collected' and 'label_date' columns to datetime values:

In [None]:
# Parse both date columns as YYYY-MM-DD; values that do not match the format
# are coerced to NaT instead of raising.
for date_column in ['date_collected', 'label_date']:
    food_waste[date_column] = pd.to_datetime(
        food_waste[date_column], format='%Y-%m-%d', errors='coerce'
    )

In [None]:
# Preview the first rows again to inspect the frame at this point
food_waste.head()

In [None]:
# Count how many rows carry each distinct label_type value
food_waste['label_type'].value_counts()

In [None]:
# Count how many rows belong to each food_type
food_waste['food_type'].value_counts()

I will create a new feature called 'time_elapsed' which measures the difference, in days, between the date on the label and the date that the food waste was collected. If the number of days is positive, then the food was thrown away before the date on the label.

In [None]:
# Signed gap between the label date and the collection date; a positive value
# means the item was collected before the date printed on its label.
# Rows where either date is missing yield NaT and are filled with a zero-day gap.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) through attribute access is chained assignment, which
# pandas warns about and which has no effect under Copy-on-Write, leaving NaT
# behind and making the integer cast below fail.
# NOTE(review): filling missing gaps with 0 pulls group means toward zero;
# consider dropping those rows instead if that bias matters.
food_waste['time_elapsed'] = (
    food_waste['label_date'] - food_waste['date_collected']
).fillna(pd.Timedelta(0))

# Express the gap as a whole number of days
food_waste['time_delta'] = (
    food_waste['time_elapsed'] / pd.Timedelta(days=1)
).astype(int)

I will examine what the mean time elapsed is for each food type.

In [None]:
# Mean time_delta (in days) within each food type
food_type_bar = food_waste.groupby('food_type')['time_delta'].mean()

Below is a bar chart that shows the mean time elapsed (in days) by food type. 

In [None]:
# Draw the per-food-type means as vertical bars, then render the figure
food_type_bar.plot(kind='bar')
plt.show()

Shelf-stable food appears to be thrown away far earlier than the other food types. We can look at this more closely with what's called a "strip plot", which is essentially a scatter plot that can handle categorical data.

In [None]:
# One point per record: days between label date and collection, grouped by food type
plt.figure(figsize=(8, 5), dpi=100)
sns.stripplot(data=food_waste, x='food_type', y='time_delta')

We can see that we only have four data points for the "shelf stable" food type, so any information about it is less reliable. Entries with shelf stable as a food type also have far more variance in the time elapsed between the label date and the date the food was collected.

In [None]:
# Mean time_delta (in days) within each retailer type
retailer_type_bar = food_waste.groupby('retailer_type')['time_delta'].mean()

# The figure has to exist before plotting so the bars are drawn onto it with
# the requested size; creating it afterwards only produces a second, empty figure.
plt.figure(dpi=100, figsize=(8, 5))
retailer_type_bar.plot.bar()
plt.ylabel('Mean days between label date and collection')
plt.show()

In [None]:
# One point per record: days between label date and collection, grouped by retailer type
plt.figure(figsize=(10, 5), dpi=100)
sns.stripplot(data=food_waste, x='retailer_type', y='time_delta')

As the plot reveals, larger retailers like drugstores and health food grocers have a much larger, positive time between the date on the label and the date the food waste was collected. This suggests that drugstores and health food grocers are more likely to throw food away earlier, but the data collected so far is not decisive enough.

Next, I will employ a statistical technique called mutual information to see if the food type or retailer type actually contains any meaningful information about the time elapsed between the date on the label and the date the food was collected.

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Define a function to calculate the MI scores of features
def make_mi_scores(X, y, discrete_features=None, random_state=0):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix. Categorical columns are expected to be
        integer-encoded already (e.g. via ``factorize``).
    y : array-like
        Continuous target, one value per row of ``X``.
    discrete_features : 'auto', bool or array-like, optional
        Forwarded to ``mutual_info_regression``. When ``None`` (the default),
        every integer-typed column of ``X`` is flagged as discrete so that
        integer category codes are not treated as continuous measurements.
        Pass ``'auto'`` to use scikit-learn's own default instead.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_regression`` so repeated runs give
        the same scores. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name, sorted from
        highest to lowest.
    """
    if discrete_features is None:
        # Boolean mask with one entry per column: True for integer dtypes
        discrete_features = np.array(
            [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes],
            dtype=bool,
        )
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Define a function to plot the MI scores of features
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a Series indexed by feature name. Bars are drawn in
    ascending score order, each labelled with its feature name, on the
    current matplotlib axes.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


In [None]:
# Work on a copy so the encoding below leaves food_waste untouched; popping
# the target removes it from the feature frame as it is extracted.
X = food_waste.copy()
y = X.pop('time_delta')

# Replace every text (object-dtype) column with integer category codes
for colname in X.select_dtypes("object").columns:
    X[colname] = X[colname].factorize()[0]

# Score only the candidate predictors of interest
features = ['retailer_type', 'food_type', 'collection_lat', 'collection_long']
mi_scores = make_mi_scores(X.loc[:, features], y)

In [None]:
# Open a figure of the desired size, then draw the MI scores onto it
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)