In [None]:
import numpy as np 
import pandas as pd 


import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the groceries transactions into a DataFrame and preview the first rows
groceries_csv = '/kaggle/input/groceries-dataset/Groceries_dataset.csv'
df_groc = pd.read_csv(groceries_csv)
df_groc.head()

# Exploratory data analysis (EDA)

In statistics, exploratory data analysis is an approach to analyzing data sets to summarize their main characteristics, often with visual methods. A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task.

**Source:** [Wikipedia – Exploratory data analysis](https://en.wikipedia.org/wiki/Exploratory_data_analysis)

Checking Null Values

In [None]:
# Count the missing values in every column
df_groc.isnull().sum()

In [None]:
# Number of distinct products sold in the grocery store
# (dropna=False so a missing value is counted as one entry, like unique() does)
n_products = df_groc['itemDescription'].nunique(dropna=False)
print(n_products)

# (rows, columns) of the dataset
df_groc.shape

In [None]:
# Count how often each product was bought, most frequent first.
# The counts are taken from the 'itemDescription' column directly instead of
# dropping the other columns and calling DataFrame.value_counts(): that
# returns a MultiIndex of 1-tuples (bar labels such as "(whole milk,)") and
# silently starts counting row combinations as soon as df_groc gains columns.
most_bought = df_groc['itemDescription'].value_counts()

# The 15 most frequently bought products
top15 = most_bought.head(15)

# The 15 least frequently bought products, smallest count first
bo15 = most_bought.tail(15).sort_values(ascending=True)

In [None]:
# Plotting style and libraries
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('dark_background')

# rc settings only affect artists created after they are set, so the tick
# label sizes are configured BEFORE the chart is drawn. They stay in effect
# for the figures created later in the notebook as well.
plt.rc('xtick', labelsize=25)
plt.rc('ytick', labelsize=20)

# Bar chart of the most bought products
ax = top15.plot(kind='bar',figsize=(15,6),rot=90,color='green')

ax.set_title('THE TOP 15 PRODUCTS BOUGHT IN GROCERY STORE',color='Blue',fontsize=25)
ax.set_xlabel('PRODUCT',fontsize=25)
ax.set_ylabel('QUANTITY',fontsize=25)

# Write the count on top of every bar (shifted left to roughly centre the text)
for index, value in enumerate(top15):
    ax.annotate(str(value), xy=(index-0.4,value),color='white',fontsize=20)


In [None]:
# Bar chart of the least bought products

# Tick label sizes must be set before the chart is created to apply to it;
# setting them here keeps this cell independent of what ran before.
plt.rc('xtick', labelsize=25)
plt.rc('ytick', labelsize=20)

ax = bo15.plot(kind='bar',figsize=(15,6),rot=90,color='orange')

ax.set_title('THE BOTTOM 15 PRODUCTS BOUGHT IN GROCERY STORE',color='red',fontsize=25)
ax.set_xlabel('PRODUCT',fontsize=25)
ax.set_ylabel('QUANTITY',fontsize=25)

# Write the count on top of every bar
for index, value in enumerate(bo15):
    ax.annotate(str(value), xy=(index-0.1,value),color='white',fontsize=20)

The charts above show the distribution of purchases and give an intuition for how frequently the various groceries are bought.


__________________________________________________________________________________________________________________________________

Let's make a word cloud of the products to get a better picture.

In [None]:
# install wordcloud
try:  
      !conda install -c conda-forge wordcloud==1.4.1 --yes
except:
      print('installed')  
  

# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS

print ('Wordcloud is installed and imported!')

In [None]:
# Load the shopping-cart image as an array; it is used as the word cloud mask
from PIL import Image

# The image ships with the attached Kaggle dataset, so it is read straight
# from the input folder. The earlier `wget` call was removed: it was pointed
# at this local path rather than a URL, and whatever it might have saved was
# never read by the code below.
mask_path = '../input/maskforgrocery/shopping-cart-png-5a364b6d3217e8.4884266315135076932052.jpg'

# The context manager closes the file once the pixels are copied into the array
with Image.open(mask_path) as mask_image:
    gro_mask = np.array(mask_image)

print('Mask image loaded!')

In [None]:
# Words listed here are left out of the word cloud.
# Start from an empty set and copy in wordcloud's built-in stop words.
stopwords = set()
stopwords.update(STOPWORDS)

In [None]:
# Preview the basket image that will give the word cloud its shape
fig = plt.figure(figsize=(14, 18))  # (width, height) in inches

plt.imshow(gro_mask, cmap='gray', interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Every purchased product description, in dataset order
word_list = df_groc['itemDescription']
# One long space-separated string, the input format WordCloud.generate expects
word_string = ' '.join(map(str, word_list))


In [None]:
# Configure the word cloud: white canvas, shaped by the basket mask,
# with the stop words filtered out
wc = WordCloud(
    background_color='white',
    max_words=2000,
    mask=gro_mask,
    stopwords=stopwords,
)

# Compute the layout from the joined product descriptions
wc.generate(word_string)

# Figure the cloud will be drawn on: 18 in wide, 22 in high
fig = plt.figure()
fig.set_size_inches(18, 22)

# Needed by the colour function defined below
import random

#Function reference https://amueller.github.io/word_cloud/auto_examples/a_new_hope.html

def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a near-black grey (1% or 2% lightness) as an HSL colour string.

    The signature follows the ``color_func`` callback used by
    ``WordCloud.recolor``; ``word``, ``font_size``, ``position``,
    ``orientation`` and any extra keyword arguments are accepted but unused.

    Parameters
    ----------
    random_state : object with a ``randint(a, b)`` method, optional
        Random generator used to pick the lightness. When it is given, the
        colours are reproducible for a fixed seed. Anything without a
        ``randint`` method (including ``None``) falls back to the global
        ``random`` module.

    Returns
    -------
    str
        ``"hsl(0, 0%, 1%)"`` or ``"hsl(0, 0%, 2%)"``.
    """
    # Imported locally so the function does not depend on cell execution order
    import random

    rng = random_state if hasattr(random_state, "randint") else random
    return "hsl(0, 0%%, %d%%)" % rng.randint(1, 2)

# Recolour the words in dark grey, then draw the cloud without axes
recolored_cloud = wc.recolor(color_func=grey_color_func, random_state=1)
plt.imshow(recolored_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

The word cloud above gives an overall idea of the most bought grocery products: the larger the word, the more often the product was bought.

In [None]:
# Purchases ordered chronologically, without the member id.
# The dates are written day-first (e.g. 31-10-2015), so sorting the raw text
# would order them by day-of-month rather than by time; the rows are ordered
# by the parsed dates instead. The 'Date' column itself is left unchanged and
# the original index labels are kept.
parsed_dates = pd.to_datetime(df_groc['Date'], dayfirst=True)
date = df_groc.loc[parsed_dates.sort_values().index].drop(columns=['Member_number'])
date.tail()
# tail() now shows the most recent purchases, i.e. the end of the covered period.
# NOTE(review): the earlier note "data is from 1-1-2014 to 31-10-2015" was read
# off a text-sorted column - re-check the range against this chronological order.

LET'S SPLIT THE DATE INTO YEAR, MONTH AND DAY OF THE WEEK

In [None]:
# Add Year / Month / Days of Week columns derived from the purchase date.
# NOTE: df_gsort is the same object as df_groc (no copy is made), so the
# converted 'Date' column and the new columns also appear on df_groc.
df_gsort = df_groc

# The dates are written day-first (e.g. 31-10-2015). Without dayfirst=True an
# ambiguous date such as 05-01-2015 can be read as May 1st instead of
# January 5th, which would distort the month and weekday statistics.
df_gsort['Date'] = pd.to_datetime(df_gsort['Date'], dayfirst=True)

# Vectorised date parts via the .dt accessor
df_gsort['Year'] = df_gsort['Date'].dt.year
df_gsort['Month'] = df_gsort['Date'].dt.month

# dayofweek runs from 0 (Monday) to 6 (Sunday); shift it to 1 (Monday) .. 7 (Sunday)
df_gsort['Days of Week'] = df_gsort['Date'].dt.dayofweek + 1
df_gsort.head()

In [None]:
import seaborn as sns

# Number of purchased items per year.
# The column is passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='Year', data=df_gsort)
plt.show()

In [None]:
# Number of purchased items per month, one bar per year.
plt.figure(figsize=(15,7.5))
# Variables are passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='Month',hue='Year',data=df_gsort,orient = 'v',color='purple')
# Place the legend outside the axes, to the right of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
# Number of purchased items per month of the year (both years combined).
# size() gives one count per month; count() would produce one identical
# series per remaining column and draw them all on top of each other.
items_per_month = df_gsort.groupby('Month').size()
items_per_month.plot(kind='line',figsize=(12,6),legend=False)
plt.xticks(list(range(1,13)))
plt.xlabel('Month of the year',fontsize=25)
plt.ylabel('Items bought',fontsize=25)
plt.show()


WE CAN SAY THAT THE GROCERY STORE DOES THE MOST BUSINESS IN THE MONTH OF AUGUST

In [None]:
# Number of purchased items per day of the week.
# size() gives one count per day; count() would produce one identical
# series per remaining column and draw them all on top of each other.
items_per_weekday = df_gsort.groupby('Days of Week').size()
items_per_weekday.plot(kind='line',figsize=(12,6),legend=False)
# 'Days of Week' is dayofweek + 1, i.e. 1 = Monday ... 7 = Sunday
plt.xticks(list(range(1,8)), ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.xlabel('Day of the week',fontsize=25)
plt.ylabel('Items bought',fontsize=25)
plt.show()

WE CAN SAY THAT THE GROCERY STORE DOES THE MOST BUSINESS ON THURSDAYS