In [None]:
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
extraction_settings = ComprehensiveFCParameters()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import tensorflow as tf 

from pathlib import Path


import requests

import pandas_profiling
from pandas_profiling.utils.cache import cache_file

#For HTML Rendering
#from IPython.core.display import display, HTML

#folium for Map
import folium
from folium import plugins
# word cloud
from wordcloud import WordCloud

%matplotlib inline
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 1})

In [None]:
conda install -c anaconda openpyxl


In [None]:

# Both yearly sheets live in the same workbook; passing a list of sheet names
# parses the file once and returns a {sheet_name: DataFrame} dict.
EXCEL_PATH = "../input/online-retail/online_retail_II.xlsx"
SHEET_NAMES = ["Year 2009-2010", "Year 2010-2011"]

sheets = pd.read_excel(EXCEL_PATH, sheet_name=SHEET_NAMES)
df1, df2 = (sheets[name] for name in SHEET_NAMES)

# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# supported equivalent. ignore_index avoids duplicated row labels in `data`.
data = pd.concat([df1, df2], ignore_index=True)

# `data` stays as the raw reference copy; `df` is the working frame that gets cleaned.
df = data.copy()

df.head()

In [None]:
report = df.profile_report(
    sort="None", explorative=True , html={"style": {"full_width": True}}, progress_bar=False
)
report

In [None]:
df.info()

In [None]:
df.head()

In [None]:
#Check missing values
df.isna().sum()

In [None]:
duplicateRowsDF = df[df.duplicated()]
duplicateRowsDF.head()

In [None]:
df.duplicated().sum()

In [None]:
df.loc[df.duplicated(keep=False)]

In [None]:
df[df.duplicated() == True]

In [None]:
df=df.loc[df['Customer ID'] > 0]
df=df.loc[df['Price'] > 0]
df=df.loc[df['Quantity'] > 0]
df.head()

In [None]:
# checking the different values for Bottom country in the dataset

plt.rcParams['figure.figsize'] = (12, 10)
a = data['Country'].value_counts().tail(20)
sns.barplot(x = a.values, y = a.index, palette = 'inferno')

plt.title('Bottom 20 Countries having Online Retail Market', fontsize = 20)
plt.xlabel('Count' , fontsize = 17)
plt.ylabel('Countries' , fontsize = 17)

plt.show()

In [None]:
# checking the different values for Top 20 country in the dataset

plt.rcParams['figure.figsize'] = (12, 10)
# Skip the single most frequent country (presumably the UK, per the title - verify)
# and keep the next 20, so the chart really shows 20 bars.
a = data['Country'].value_counts().iloc[1:21]
sns.barplot(x = a.values, y = a.index, palette = 'inferno' )

plt.title('Top 20 Countries having Online Retail Market Except UK', fontsize = 18)
# Horizontal bars: counts run along x, country names along y.
plt.xlabel('Count', fontsize=17)
plt.ylabel('Countries', fontsize=17)
plt.show()

In [None]:
# looking the stockcode for the datset
plt.rcParams['figure.figsize'] = (12, 10)
data['StockCode'].value_counts().head(20).plot.bar()
plt.title('Most Popular Stock codes', fontsize = 20)
plt.xlabel('Stockcodes', fontsize=17)
plt.ylabel('No. of units sold', fontsize=17)
plt.show()

In [None]:
# looking the stockcode for the datset

data['StockCode'].value_counts().tail(20).plot.bar()
plt.title('Most unpoular Stock codes', fontsize = 20)
plt.xlabel('Stockcodes', fontsize=17)
plt.ylabel('No. of units sold', fontsize=17)
plt.show()

In [None]:
# looking the Products sold for the datset

plt.rcParams['figure.figsize'] = (10, 8)
data['Description'].value_counts().head(20).plot.bar()
plt.title('Most Popular Products sold by description', fontsize = 20)
plt.xlabel('Products Description', fontsize=17)
plt.ylabel('No. of units sold', fontsize=17)

plt.show()

In [None]:
# let's look at Sales vs Invoicedate (Time series Analysis)

plt.rcParams['figure.figsize'] = (14, 8)
data.plot(x = 'InvoiceDate', y = 'Quantity', fontsize = 15)
plt.title("Time Series Analysis of Sales", fontsize = 20)
plt.xlabel('Date of Purchase', fontsize = 18 )
plt.ylabel('Sales', fontsize = 18)
plt.show()

In [None]:
# checking how many unique customer IDs are there

x = data['Customer ID'].nunique()

# printing the value
print("There are {} number of different customers".format(x))

In [None]:
# checking different number of unique countries present in this dataset

x = data['Country'].nunique()

# printing the result
print("There are {} number of different countries who do online retailing from UK".format(x))

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)

# str(Series) is only the truncated repr (a handful of rows plus "Name/Length/dtype"),
# so join every non-null description into one text for the cloud instead.
description_text = " ".join(data['Description'].dropna().astype(str))
wordcloud = WordCloud(background_color = 'white', stopwords = stopwords).generate(description_text)

plt.rcParams['figure.figsize'] = (12, 12)
plt.axis('off')
plt.imshow(wordcloud)
plt.title('Most Occuring word in the Description list', fontsize = 20)
plt.show()

In [None]:
# checking how many quantity of products have been sold online from each country

data['Quantity'].groupby(data['Country']).agg('sum')

In [None]:
# visualizing the unitprice
plt.rcParams['figure.figsize'] = (13, 7)
sns.distplot(df['Price'], color = 'blue')
plt.title('Distribution of Unit price', fontsize = 20)
plt.xlabel('Different Unit Price for different items')
plt.ylabel('count')
plt.show()

In [None]:
Quantity = df.groupby(['InvoiceDate', 'Customer ID']).agg({'Quantity': ['sum']})
Quantity

In [None]:
Price = df.groupby(['InvoiceDate', 'Customer ID']).agg({'Price': ['sum']})
Price

In [None]:
df=pd.merge(Quantity, Price, on=['Customer ID',"InvoiceDate"], how='outer')
# rename columns
df.columns = ['Quantity', 'Price']
df.reset_index(inplace=True)

In [None]:
df.head()

In [None]:
# `df` now holds per-invoice SUMS of unit price and of quantity, so Price * Quantity
# would multiply two sums rather than add up the line totals. Rebuild the invoice
# revenue from the line-level `data`, using the same validity filters that were
# applied when `df` was cleaned (positive Customer ID, Price and Quantity).
valid_lines = data[(data['Customer ID'] > 0) & (data['Price'] > 0) & (data['Quantity'] > 0)]
invoice_totals = (
    valid_lines
    .assign(Total_sales=valid_lines['Quantity'] * valid_lines['Price'])
    .groupby(['InvoiceDate', 'Customer ID'], as_index=False)['Total_sales']
    .sum()
)

# Dropping any existing Total_sales first keeps the cell safe to re-run.
df = (
    df.drop(columns='Total_sales', errors='ignore')
      .merge(invoice_totals, on=['InvoiceDate', 'Customer ID'], how='left')
)
df

In [None]:
df.isnull().sum()

In [None]:
df.head()

In [None]:
df.tail()


In [None]:
#feature engineering
# Derive calendar features from the invoice timestamp.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
invoice_dt = df['InvoiceDate'].dt

df['Day'] = invoice_dt.day
df['Month'] = invoice_dt.month
df['Quarter'] = invoice_dt.quarter
# Series.dt.week is deprecated (removed in pandas 2.0); isocalendar().week is the
# replacement. Cast back to int64 to keep the dtype the old accessor produced.
df['Week'] = invoice_dt.isocalendar().week.astype('int64')
df['Year'] = invoice_dt.year

# dayofweek is 0 (Monday) .. 6 (Sunday); map it to a short label.
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}
df['Day of week'] = invoice_dt.dayofweek.map(days)

In [None]:
# looking yearly sales 

plt.rcParams['figure.figsize'] = (7, 5)
df['Year'].value_counts().plot.bar()
plt.title('Yearly Slaes of Products', fontsize = 20)
plt.xlabel('Year', fontsize=17)
plt.ylabel('No. of units sold', fontsize=17)

plt.show()

In [None]:
df.head()


In [None]:
# looking at the number of invoices per calendar week


plt.rcParams['figure.figsize'] = (10, 8)
df['Week'].value_counts().head(20).plot.bar()
plt.title('Weekly Products sold', fontsize = 20)
plt.xlabel('Week', fontsize=17)
plt.ylabel('No. of units sold', fontsize=17)

plt.show()

In [None]:
# looking for the weekly products sold 


plt.rcParams['figure.figsize'] = (10, 8)
df['Month'].plot.box()
plt.title('Weekly Products sold', fontsize = 20)
plt.xlabel('Week', fontsize=17)
plt.ylabel('No. of units sold', fontsize=17)

plt.show()

In [None]:
# looking the Day wise sales  for the datset

plt.rcParams['figure.figsize'] = (10, 8)
df['Day of week'].value_counts().plot.pie(autopct='%1.0f%%')
plt.title('Products sold on different Days  ', fontsize = 20)
plt.xlabel('Day', fontsize=17)


plt.show()

In [None]:
# looking the Quarter wise sales  for the datset

plt.rcParams['figure.figsize'] = (10, 8)
df['Quarter'].value_counts().plot.pie(autopct='%1.0f%%')
plt.title('Comparison of products sold in different Quarters  ', fontsize = 20)
plt.xlabel('Quarter', fontsize=17)


plt.show()


In [None]:
minimum_date=df['InvoiceDate'].min()
maximum_date=df['InvoiceDate'].max()

In [None]:
maximum_date
minimum_date

In [None]:
from datetime import datetime
from dateutil import relativedelta
start_date, end_date = 'December 2009', 'December 2011'
start_date, end_date = datetime.strptime(start_date, '%B %Y'), datetime.strptime(end_date, '%B %Y')
delta = relativedelta.relativedelta(end_date, start_date)
result =  [datetime.strftime(start_date + relativedelta.relativedelta(months=i), '%B %Y')\
                            for i in range(0, delta.years * 12 + delta.months + 1, 3)]

In [None]:
df.info()

In [None]:
df['Qdate'] = df['InvoiceDate'].dt.to_period("Q").dt.end_time
df.head()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df['Qdate'].unique()

In [None]:
df['Qdate'].value_counts()

In [None]:
df['Qdate'].dtype

In [None]:
profile_report = df.profile_report(
    explorative=True, html={"style": {"full_width": True}}
)
profile_report

In [None]:
df.set_index('Customer ID',inplace=True)
df.head()

In [None]:
df.head()


In [None]:
df_new=df
df_new.head(10)

In [None]:
df_q1=df.loc[df['Qdate'] == '2009-12-31 23:59:59.999999999']
df_q1.info()
df['Qdate'].value_counts()

In [None]:
df_q1.head()

In [None]:
df_q1=df.loc[df['Qdate'] == '2009-12-31 23:59:59.999999999']
df_q1.info()
group_q1 = df_q1.groupby('Customer ID')
df_q1=group_q1.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q1.head()
df_q1.columns =df_q1.columns.droplevel(0)
df_q1.rename(columns={'count': 'Orders q1'}, inplace=True)
df_q1.rename(columns={'mean': 'Average Sales q1'}, inplace=True)
df_q1.rename(columns={'sum': 'Total Sales q1'}, inplace=True)
df_q1.rename(columns={'var': 'Varience Sales q1'}, inplace=True)
df_q1 = df_q1.rename(columns = {"" : 'Customer ID'})
df_q1.head()

In [None]:
df_q11=df.loc[df['Qdate'] == '2009-12-31 23:59:59.999999999']
group_df=df_q11.groupby(['Customer ID'])
df_q11=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q11 = df_q11.fillna(0)
df_q11.columns=df_q11.columns.droplevel(0)
df_q11 = df_q11.rename(columns = {"" :'Customer ID'})
df_q11["Since_Last_Visit_q1"]=df_q11["max"]-df_q11["min"]
df_q11["Since_Last_Visit_q1"] = df_q11["Since_Last_Visit_q1"].dt.days.astype('int16')
df_q11.drop('min', axis=1, inplace=True)
df_q11.drop("max", axis=1, inplace=True)
df_q1=pd.merge(df_q1, df_q11, on='Customer ID', how='outer')
df_q1.head()

In [None]:
df_q2=df.loc[df['Qdate'] == '2010-03-31 23:59:59.999999999']
df_q2.info()
df['Qdate'].value_counts()
group_q2 = df_q2.groupby('Customer ID')
df_q2=group_q2.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q2.head()
df_q2.columns =df_q2.columns.droplevel(0)
df_q2.rename(columns={'count': 'Orders q2'}, inplace=True)
df_q2.rename(columns={'mean': 'Average Sales q2'}, inplace=True)
df_q2.rename(columns={'sum': 'Total Sales q2'}, inplace=True)
df_q2.rename(columns={'var': 'Varience Sales q2'}, inplace=True)
df_q2 = df_q2.rename(columns = {"" : 'Customer ID'})
df_q2.head()

In [None]:
df_q21=df.loc[df['Qdate'] == '2010-03-31 23:59:59.999999999']
group_df=df_q21.groupby(['Customer ID'])
df_q21=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q21 = df_q21.fillna(0)
df_q21.columns=df_q21.columns.droplevel(0)
df_q21 = df_q21.rename(columns = {"" :'Customer ID'})
df_q21["Since_Last_Visit_q2"]=df_q21["max"]-df_q21["min"]
df_q21["Since_Last_Visit_q2"] = df_q21["Since_Last_Visit_q2"].dt.days.astype('int16')
df_q21.drop('min', axis=1, inplace=True)
df_q21.drop("max", axis=1, inplace=True)
df_q2=pd.merge(df_q2, df_q21, on='Customer ID', how='outer')
df_q2.head()

In [None]:
df_q3=df.loc[df['Qdate'] == '2010-06-30 23:59:59.999999999']
df_q3.info()
df['Qdate'].value_counts()
group_q3 = df_q3.groupby('Customer ID')
df_q3=group_q3.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q3.head()
df_q3.columns =df_q3.columns.droplevel(0)
df_q3.rename(columns={'count': 'Orders q3'}, inplace=True)
df_q3.rename(columns={'mean': 'Average Sales q3'}, inplace=True)
df_q3.rename(columns={'sum': 'Total Sales q3'}, inplace=True)
df_q3.rename(columns={'var': 'Varience Sales q3'}, inplace=True)
df_q3 = df_q3.rename(columns = {"" : 'Customer ID'})
df_q3.head()

In [None]:
df_q31=df.loc[df['Qdate'] == '2010-06-30 23:59:59.999999999']
group_df=df_q31.groupby(['Customer ID'])
df_q31=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q31 = df_q31.fillna(0)
df_q31.columns=df_q31.columns.droplevel(0)
df_q31 = df_q31.rename(columns = {"" :'Customer ID'})
df_q31["Since_Last_Visit_q3"]=df_q31["max"]-df_q31["min"]
df_q31["Since_Last_Visit_q3"] = df_q31["Since_Last_Visit_q3"].dt.days.astype('int16')
df_q31.drop('min', axis=1, inplace=True)
df_q31.drop("max", axis=1, inplace=True)
df_q3=pd.merge(df_q3, df_q31, on='Customer ID', how='outer')
df_q3.head()

In [None]:
df_q4=df.loc[df['Qdate'] == '2010-09-30T23:59:59.999999999']
df_q4.info()
df['Qdate'].unique()


group_q4 = df_q4.groupby('Customer ID')
df_q4=group_q4.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q4.head()
df_q4.columns =df_q4.columns.droplevel(0)
df_q4.rename(columns={'count': 'Orders q4'}, inplace=True)
df_q4.rename(columns={'mean': 'Average Sales q4'}, inplace=True)
df_q4.rename(columns={'sum': 'Total Sales q4'}, inplace=True)
df_q4.rename(columns={'var': 'Varience Sales q4'}, inplace=True)
df_q4 = df_q4.rename(columns = {"" : 'Customer ID'})
df_q4.head()

In [None]:
# Per-customer span (in whole days) between first and last invoice in the quarter
# ending 2010-09-30. NOTE: despite the column name, this is max - min invoice date.
q4_rows = df.loc[df['Qdate'] == '2010-09-30T23:59:59.999999999']
group_df = q4_rows.groupby(['Customer ID'])
invoice_span = group_df['InvoiceDate'].agg(['max', 'min'])

df_q41 = (
    (invoice_span['max'] - invoice_span['min'])
    .dt.days.astype('int16')
    .rename('Since_Last_Visit_q4')
    .reset_index()
)

# Drop any span column left by an earlier run of this cell; otherwise the merge
# produces Since_Last_Visit_q4_x / _y and later cells see the wrong column names.
df_q4 = pd.merge(
    df_q4.drop(columns='Since_Last_Visit_q4', errors='ignore'),
    df_q41,
    on='Customer ID',
    how='outer',
)
df_q4.head()

In [None]:
df_q5=df.loc[df['Qdate'] == '2010-12-31T23:59:59.999999999']
df_q5.info()
df['Qdate'].unique()


group_q5 = df_q5.groupby('Customer ID')
df_q5=group_q5.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q5.head()
df_q5.columns=df_q5.columns.droplevel(0)
df_q5.rename(columns={'count': 'Orders q5'}, inplace=True)
df_q5.rename(columns={'mean': 'Average Sales q5'}, inplace=True)
df_q5.rename(columns={'sum': 'Total Sales q5'}, inplace=True)
df_q5.rename(columns={'var': 'Varience Sales q5'}, inplace=True)
df_q5 = df_q5.rename(columns = {"" : "Customer ID"})
df_q5.head()

In [None]:
df_q51=df.loc[df['Qdate'] == '2010-12-31T23:59:59.999999999']
group_df=df_q51.groupby(['Customer ID'])
df_q51=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q51 = df_q51.fillna(0)
df_q51.columns=df_q51.columns.droplevel(0)
df_q51 = df_q51.rename(columns = {"" :'Customer ID'})
df_q51["Since_Last_Visit_q5"]=df_q51["max"]-df_q51["min"]
df_q51["Since_Last_Visit_q5"] = df_q51["Since_Last_Visit_q5"].dt.days.astype('int16')
df_q51.drop('min', axis=1, inplace=True)
df_q51.drop("max", axis=1, inplace=True)
df_q5=pd.merge(df_q5, df_q51, on='Customer ID', how='outer')
df_q5.head()

In [None]:
df_q6=df.loc[df['Qdate'] == '2011-03-31T23:59:59.999999999']
df_q6.info()
df['Qdate'].unique()

group_q6 = df_q6.groupby('Customer ID')
df_q6=group_q6.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q6.head()
df_q6.columns=df_q6.columns.droplevel(0)
df_q6.rename(columns={'count': 'Orders q6'}, inplace=True)
df_q6.rename(columns={'mean': 'Average Sales q6'}, inplace=True)
df_q6.rename(columns={'sum': 'Total Sales q6'}, inplace=True)
df_q6.rename(columns={'var': 'Varience Sales q6'}, inplace=True)
df_q6 = df_q6.rename(columns = {"" : 'Customer ID'})
df_q6.head()

In [None]:
df_q61=df.loc[df['Qdate'] == '2011-03-31T23:59:59.999999999']
group_df=df_q61.groupby(['Customer ID'])
df_q61=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q61 = df_q61.fillna(0)
df_q61.columns=df_q61.columns.droplevel(0)
df_q61 = df_q61.rename(columns = {"" :'Customer ID'})
df_q61["Since_Last_Visit_q6"]=df_q61["max"]-df_q61["min"]
df_q61["Since_Last_Visit_q6"] = df_q61["Since_Last_Visit_q6"].dt.days.astype('int16')
df_q61.drop('min', axis=1, inplace=True)
df_q61.drop("max", axis=1, inplace=True)
df_q6=pd.merge(df_q6, df_q61, on='Customer ID', how='outer')
df_q6.head()

In [None]:
df_q7=df.loc[df['Qdate'] == '2011-06-30T23:59:59.999999999']
df_q7.info()
df['Qdate'].unique()

group_q7 = df_q7.groupby('Customer ID')
df_q7=group_q7.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q7.head()
df_q7.columns=df_q7.columns.droplevel(0)
df_q7.rename(columns={'count': 'Orders q7'}, inplace=True)
df_q7.rename(columns={'mean': 'Average Sales q7'}, inplace=True)
df_q7.rename(columns={'sum': 'Total Sales q7'}, inplace=True)
df_q7.rename(columns={'var': 'Varience Sales q7'}, inplace=True)
df_q7 = df_q7.rename(columns = {"" :'Customer ID'})
df_q7.head()

In [None]:
df_q71=df.loc[df['Qdate'] == '2011-06-30T23:59:59.999999999']
group_df=df_q71.groupby(['Customer ID'])
df_q71=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q71 = df_q71.fillna(0)
df_q71.columns=df_q71.columns.droplevel(0)
df_q71 = df_q71.rename(columns = {"" :'Customer ID'})
df_q71["Since_Last_Visit_q7"]=df_q71["max"]-df_q71["min"]
df_q71["Since_Last_Visit_q7"] = df_q71["Since_Last_Visit_q7"].dt.days.astype('int16')
df_q71.drop('min', axis=1, inplace=True)
df_q71.drop("max", axis=1, inplace=True)
df_q7=pd.merge(df_q7, df_q71, on='Customer ID', how='outer')
df_q7.head()

In [None]:
df_q8=df.loc[df['Qdate'] == '2011-09-30T23:59:59.999999999']
df_q8.info()
df['Qdate'].unique()


group_q8 = df_q8.groupby('Customer ID')
df_q8=group_q8.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q8.head()
df_q8.columns=df_q8.columns.droplevel(0)
df_q8.rename(columns={'count': 'Orders q8'}, inplace=True)
df_q8.rename(columns={'mean': 'Average Sales q8'}, inplace=True)
df_q8.rename(columns={'sum': 'Total Sales q8'}, inplace=True)
df_q8.rename(columns={'var': 'Varience Sales q8'}, inplace=True)
df_q8 = df_q8.rename(columns = {"" : 'Customer ID'})
df_q8.head()

In [None]:
df_q81=df.loc[df['Qdate'] == '2011-09-30T23:59:59.999999999']
group_df=df_q81.groupby(['Customer ID'])
df_q81=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q81 = df_q81.fillna(0)
df_q81.columns=df_q81.columns.droplevel(0)
df_q81 = df_q81.rename(columns = {"" :'Customer ID'})
df_q81["Since_Last_Visit_q8"]=df_q81["max"]-df_q81["min"]
df_q81["Since_Last_Visit_q8"] = df_q81["Since_Last_Visit_q8"].dt.days.astype('int16')
df_q81.drop('min', axis=1, inplace=True)
df_q81.drop("max", axis=1, inplace=True)
df_q8=pd.merge(df_q8, df_q81, on='Customer ID', how='outer')
df_q8.head()

In [None]:
df_q9=df.loc[df['Qdate'] == '2011-12-31T23:59:59.999999999']
df_q9.info()
df['Qdate'].unique()


group_q9 = df_q9.groupby('Customer ID')
df_q9=group_q9.agg({'Total_sales': {"sum","count","mean","var"}}).reset_index()
df_q9.head()
df_q9.columns=df_q9.columns.droplevel(0)
df_q9.rename(columns={'count': 'Orders q9'}, inplace=True)
df_q9.rename(columns={'mean': 'Average Sales q9'}, inplace=True)
df_q9.rename(columns={'sum': 'Total Sales q9'}, inplace=True)
df_q9.rename(columns={'var': 'Varience Sales q9'}, inplace=True)
df_q9.head()
df_q9['Churn']=0
df_q9 = df_q9.rename(columns = {"" : 'Customer ID'})
df_q9.drop('Orders q9', axis=1, inplace=True)
df_q9.drop('Varience Sales q9', axis=1, inplace=True)
df_q9.drop('Total Sales q9', axis=1, inplace=True)
df_q9.drop('Average Sales q9', axis=1, inplace=True)
#df_q9.drop('Orders q9', axis=1, inplace=True)
df_q9.head()

In [None]:
# Lifetime totals per customer: summed sales and number of invoices.
group_q0 = df.groupby('Customer ID')
# Use a list, not a set: the columns are renamed by position below, and a set of
# aggregation names has no guaranteed order, so 'Total Sales' and 'Orders' could swap.
df_q0 = group_q0.agg({'Total_sales': ['sum', 'count']}).reset_index()
df_q0.columns = ['Customer ID', 'Total Sales', 'Orders']
df_q0.head()

In [None]:
group_df=df.groupby(['Customer ID'])
df_q01=group_df.agg({'InvoiceDate':{'max','min'}}).reset_index()
df_q01 = df_q01.fillna(0)
df_q01.columns=df_q01.columns.droplevel(0)
df_q01 = df_q01.rename(columns = {"" :'Customer ID'})
df_q01["Since_Last_Visit"]=df_q01["max"]-df_q01["min"]
df_q01["Since_Last_Visit"] = df_q01["Since_Last_Visit"].dt.days.astype('int16')
df_q01.drop('min', axis=1, inplace=True)
df_q01.drop("max", axis=1, inplace=True)
df_q0=pd.merge(df_q0, df_q01, on='Customer ID', how='outer')
df_q0.head()

In [None]:
final_ads=pd.merge(df_q0, df_q1, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q2, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q3, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q4, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q5, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q6, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q7, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q8, on='Customer ID', how='outer')
final_ads=pd.merge(final_ads, df_q9, on='Customer ID', how='outer')
#final_ads['Churn']=final_ads['Churn'].fillna(1)
final_ads = final_ads.fillna(0)
final_ads.head()

In [None]:
final_ads=final_ads.loc[final_ads['Customer ID'] > 0]

In [None]:
final_ads.set_index('Customer ID', inplace=True)

In [None]:
# The 'Churn' column was zero-filled for every customer when final_ads was built,
# so counting its rows just returns the total number of customers. A churner is a
# customer with no purchase in the final quarter, i.e. one absent from df_q9.
x = int((~final_ads.index.isin(df_q9['Customer ID'])).sum())

# printing the value
print("There are {} number of  customers who churned".format(x))

In [None]:
final_ads.describe()

In [None]:
final_ads = final_ads.loc[:, final_ads.var() != 0.0]
final_ads.info()

In [None]:
# NOTE(review): `cor` was never defined anywhere in the notebook; presumably it is
# the correlation matrix of the customer-level features - confirm.
cor = final_ads.corr()
cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Return Styler table styles: small, tightly packed cells that enlarge on hover."""
    return [dict(selector="th",
                 props=[("font-size", "7pt")]),
            dict(selector="td",
                 props=[('padding', "0em 0em")]),
            dict(selector="th:hover",
                 props=[("font-size", "12pt")]),
            dict(selector="tr:hover td:hover",
                 props=[('max-width', '200px'),
                        ('font-size', '12pt')])
]

# Styler.set_precision is deprecated; an explicit two-decimal format string gives
# the same display and works across pandas versions.
(
    cor.style.background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})
    .set_caption("Correlation")
    .format("{:.2f}")
    .set_table_styles(magnify())
)

In [None]:
from sklearn import preprocessing 

In [None]:
final_ads_temp = final_ads.drop(final_ads.columns[0], axis=1)
final_ads_temp1 = final_ads.drop(final_ads.columns[0], axis=1)

In [None]:
scaler = preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(final_ads_temp)

In [None]:
header = final_ads_temp.columns
final_ads_scaled = pd.DataFrame(scaled, columns=header)
final_ads_scaled.head()

In [None]:
# KMeans Clustering:

In [None]:
from sklearn.cluster import KMeans

In [None]:
#Optimal Value of K :

In [None]:
# Fit KMeans for k = 1..10 and record the inertia (within-cluster sum of squares)
# for the elbow plot. A fixed random_state keeps the curve reproducible across runs.
cluster_range = range(1,11)
cluster_errors = []

for num in cluster_range:
  clusters = KMeans(n_clusters=num, random_state=0)
  clusters.fit(final_ads_scaled)
  cluster_errors.append(clusters.inertia_)

In [None]:
final_ads_clusters = pd.DataFrame({'Num_Clusters':cluster_range, 'Cluster_Errors':cluster_errors})

In [None]:
import matplotlib.pyplot as plt          # plotting

In [None]:
plt.figure(figsize=(12,6))
plt.plot(final_ads_clusters['Num_Clusters'],final_ads_clusters['Cluster_Errors'], marker="o")
plt.axvline(x=3, linestyle='--', color= 'red')
plt.xlabel('No. of Clusters');
plt.ylabel('Error');
plt.title('Elbow Analysis for Optimal Value of Clusters');

In [None]:
# Fit on the scaled features only: if this cell has run before, final_ads_scaled
# already carries a 'cluster' column, which must not be fed back in as a feature.
features_scaled = final_ads_scaled.drop(columns='cluster', errors='ignore')
kmeans = KMeans(n_clusters=3, random_state=0).fit(features_scaled)

# Attach the labels to both the scaled frame and an unscaled copy used for profiling.
temp = final_ads.copy()
final_ads_scaled['cluster']=kmeans.labels_
temp['cluster']=kmeans.labels_
temp.head()

In [None]:
# final_ads_scaled already holds the 'cluster' labels written by the previous fit;
# exclude that column so the labels are not clustered on themselves.
features_scaled = final_ads_scaled.drop(columns='cluster', errors='ignore')
kmeans = KMeans(n_clusters=3, random_state=0).fit(features_scaled)

temp = final_ads.copy()
final_ads_scaled['cluster']=kmeans.labels_
temp['cluster']=kmeans.labels_
temp.head()

In [None]:
temp['cluster'].value_counts()

In [None]:
from sklearn import metrics
# Score on the scaled features only; the 'cluster' column holds the very labels
# being evaluated and would inflate the silhouette if left in.
metrics.silhouette_score(final_ads_scaled.drop(columns='cluster', errors='ignore'), kmeans.labels_)

In [None]:
## Profiling:

In [None]:
header = final_ads_scaled.columns
header

In [None]:
# Mean of every scaled feature per cluster. Averaging all columns avoids a
# hard-coded name list: the old list used tuple-style selection (removed in newer
# pandas) and named 'Since_Last_Visit_q4_x', which only exists after a merge cell
# has been run twice.
p_kmeans = final_ads_scaled.groupby('cluster').mean().reset_index()
p_kmeans

In [None]:
from math import pi

# Radar (spider) chart of the per-cluster feature means in p_kmeans.

# One spoke per profiled feature; the first column of p_kmeans is the cluster id.
categories=list(p_kmeans)[1:]
N = len(categories)

# Spread the spokes evenly around the circle and repeat the first angle to close the polygon.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

#fig size
plt.rcParams["figure.figsize"] = [12, 7]

#Initialise the spider plot
ax = plt.subplot(111, polar=True)

#Put the first axis on top and go clockwise
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

#Draw one axis per variable and label it
plt.xticks(angles[:-1], categories)

#Draw ylabels
ax.set_rlabel_position(0)
# The plotted values are means of min-max scaled features, so they lie in [0, 1];
# ticks must fall inside the radial limits set below or none are drawn.
radial_ticks = [0.2, 0.4, 0.6, 0.8, 1.0]
plt.yticks(radial_ticks, [str(tick) for tick in radial_ticks], color='black', size=10)
plt.ylim(-0.5,1)

#Add plots: one closed polygon per cluster row.
# NOTE(review): KMeans cluster numbering is arbitrary, so the mapping of these
# descriptive labels to rows 0/1/2 should be confirmed against p_kmeans.
cluster_labels = [
    "Cluster 1: Regular Customer",
    "Cluster 2: Potential Churner",
    "Cluster 3: Highly Potential Churner",
]
for row, cluster_label in enumerate(cluster_labels):
    values=p_kmeans.loc[row].drop('cluster').values.flatten().tolist()
    values += values[:1]
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=cluster_label, marker='o', alpha=0.5)
    ax.fill(angles, values, 'b', alpha=0.1)

# Add legend
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1));

In [None]:
# Logistic Regression

In [None]:
#Importing all necessary packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, precision_score, accuracy_score
from sklearn.metrics import recall_score, classification_report, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np
from scipy import stats

In [None]:
temp.head()

In [None]:
temp=pd.merge(temp, df_q9, on='Customer ID', how='outer')
temp['Churn']=temp['Churn'].fillna(1)
temp = temp.fillna(0)

In [None]:
temp.head()

In [None]:
temp.set_index('Customer ID', inplace=True)

In [None]:
temp.columns

In [None]:
final_ads_scaled.head()

In [None]:
scaler = preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(temp)

header = temp.columns
temp_scaled = pd.DataFrame(scaled, columns=header)
temp_scaled.head()

In [None]:
X=temp_scaled.drop('Churn', axis=1)
Y= temp_scaled['Churn']
#logit = sm.Logit(Y, X)
#result = logit.fit()
#result.summary2()
X.head()

In [None]:
#Train Test Split
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.3,random_state=0)
print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(xtrain, ytrain)

In [None]:
#Evaluation
## Predicting binary classifier
lr_pred = clf_lr.predict(xtest)

In [None]:
#Predicting probability of 0 and 1

In [None]:
lr_pred_prb = clf_lr.predict_proba(xtest)

In [None]:
lr_pred_prb[0:5,0:5]
# First probability is for the negative class (0) and the second one is for the positive class (1)

In [None]:
#First value in Numpy array is probability of 0 and second is probability of 1

#Only predicting and extracting probability values of 1

lr_pred_prb = clf_lr.predict_proba(xtest)[:,1]

In [None]:
# Comparison of Predicted and Actual
xtest.head()

In [None]:
xt = xtest.copy()
xt['pred'] = lr_pred
xt['pred_probability'] = lr_pred_prb
xt['actual'] = ytest
xt.head()

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(xt['actual'], xt['pred']).ravel()

In [None]:
tn, fp, fn, tp = confusion_matrix(xt['actual'], xt['pred']).ravel()
conf_matrix=pd.DataFrame({"pred_Churn":[tp,fp],"pred_Not Churn":[fn,tn]},index=["Churn","Not Churn"])
conf_matrix

In [None]:
## Accuracy
from sklearn.metrics import accuracy_score
accuracy = (tp + tn) / (tp + fp + tn + fn)
print("Accuracy: {}".format(accuracy))

In [None]:
accuracy_lr = accuracy_score(ytest,lr_pred)
print("Accuracy by built-in function: {}".format(accuracy_lr))

In [None]:
# Precision
precision_1 = tp / (tp + fp)
print("Precision for 1: {}".format(precision_1))
precision_0 = tn / (tn + fn)
print("Precision for 0: {}".format(precision_0))

In [None]:
from sklearn.metrics import precision_score
precision_lr = precision_score(ytest,lr_pred)
print("Precision by built-in function: {}".format(precision_lr))

In [None]:
#Recall
recall_1 = tp / (tp + fn)
print("Recall for 1: {}".format(recall_1))
recall_0 = tn / (tn + fp)
print("Recall for 0: {}".format(recall_0))

In [None]:
from sklearn.metrics import recall_score
recall_lr = recall_score(ytest,lr_pred)
print("Recall by built-in function: {}".format(recall_lr))

In [None]:
# F1 Score
f1_1 = (2 * precision_1 * recall_1) / (precision_1 + recall_1)
print("F1 Score for 1: {}".format(f1_1))
f1_0 = (2 * precision_0 * recall_0) / (precision_0 + recall_0)
print("F1 Score for 0: {}".format(f1_0))

In [None]:
from sklearn.metrics import f1_score
f1_lr=f1_score(ytest,lr_pred)
print("F1 Score by built-in function: {}".format(f1_lr))

In [None]:
#Class Distribution in Training Data
ytrain.value_counts()

In [None]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(ytest,lr_pred))

In [None]:
# ROC AUC Curve
#Receiver Operating Characteristic & Area Under Curve
tpr = recall_lr
fpr = fp / (fp + tn)

In [None]:
tpr, fpr

In [None]:
fpr = 1 - recall_0
tpr, fpr

In [None]:
# Defining function to plot ROC AUC Curve

def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the chance diagonal.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates to plot against each other.
    label : str, optional
        Legend text for the curve. When omitted, no legend is drawn.
    """
    plt.figure(figsize=(8,6))
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.005, 1, 0, 1.005])
    plt.xticks(np.arange(0,1, 0.05), rotation=90)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # legend() with no labelled artist only produces a warning, so skip it then.
    if label is not None:
        plt.legend(loc='best')

In [None]:
# Getting TPR, FPR values for each threshold on ROC AUC Curve
from sklearn.metrics import roc_curve, roc_auc_score, roc_curve
fpr,tpr,threshold=roc_curve(ytest,lr_pred_prb)

In [None]:
# Calculating AUC score from ytest and predicted probabilities
auc_lr=roc_auc_score(ytest,lr_pred_prb)
auc_lr

In [None]:
#Plotting AUC ROC Curve
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sns.set_context('poster')
plot_roc_curve(fpr,tpr,label='AUC = %0.3f'% auc_lr)

In [None]:
# Deciding Binary Classifier threshold based on Confusion Matrix (reducing Type-1 or Type-2 Error)
# Defining function for adjusted binary classification based on threshold
def adjusted_classes(pred_prob, t):
    """
    Turn predicted probabilities into hard 0/1 labels using cut-off t.

    A probability greater than or equal to t maps to 1, anything else to 0.
    Binary classification only.
    """
    labels = []
    for probability in pred_prob:
        labels.append(int(probability >= t))
    return labels

In [None]:
# Defining function for precision and recall curve
def precision_recall_threshold(p, r, thresholds, t=0.5, y_true=None, pred_prob=None):
    """
    Print the confusion matrix and accuracy at decision threshold t, then plot
    the precision-recall curve with a marker at the point closest to t.

    Parameters
    ----------
    p, r : array-like
        Precision and recall values of the curve.
    thresholds : array-like
        Decision thresholds matching p and r.
    t : float, default 0.5
        Threshold at which probabilities are converted to class labels.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level `ytest`.
    pred_prob : array-like, optional
        Predicted positive-class probabilities. Defaults to the notebook-level
        `lr_pred_prb`.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if y_true is None:
        y_true = ytest
    if pred_prob is None:
        pred_prob = lr_pred_prb

    # Re-label the predictions at threshold t and show the resulting confusion matrix.
    pred_adj = adjusted_classes(pred_prob, t)
    tn, fp, fn, tp = confusion_matrix(y_true, pred_adj).ravel()
    print(pd.DataFrame({"pred_Churn":[tp,fp],"pred_Not Churn":[fn,tn]},index=["Churn","Not Churn"]))

    print("\n Accuracy: ",(tp+tn)/(tn+fp+fn+tp)*100)

    # plot the curve
    plt.figure(figsize=(8,6))
    plt.title("Precision and Recall curve at current threshold")
    plt.step(r, p, color='b', alpha=0.2,
             where='post')
    plt.fill_between(r, p, step='post', alpha=0.2,
                     color='b')
    plt.ylim([-0.01, 1.01])
    plt.xlim([-0.01, 1.01])
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    # Mark the curve point whose threshold is nearest to t.
    close_default_clf = np.argmin(np.abs(thresholds - t))
    plt.plot(r[close_default_clf], p[close_default_clf], '^', c='k',
            markersize=15)

In [None]:
# Defining function to plot precision and recall relation with decision threshold
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds,line=0.5):
    """
    Draw precision and recall as two curves over the decision threshold, with a
    vertical marker at `line`.

    Modified from:
    Hands-On Machine learning with Scikit-Learn
    and TensorFlow; p.89
    """
    plt.figure(figsize=(8, 6))
    plt.title("Precision and Recall Scores as a function of the decision threshold")

    # The last element of each score array is dropped before plotting against thresholds.
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for scores, line_format, curve_name in curves:
        plt.plot(thresholds, scores[:-1], line_format, label=curve_name)

    plt.axvline(x=line)
    plt.xlabel("Decision Threshold")
    plt.ylabel("Score")
    plt.legend(loc='best')
    plt.show()

In [None]:
# Calculating precision, recall and thresholds for Precision-Recall Curve
from sklearn.metrics import precision_recall_curve
p , r , thresholds = precision_recall_curve(ytest,lr_pred_prb)

In [None]:
# Plotting Precision and Recall vs Threshold
plot_precision_recall_vs_threshold(p,r,thresholds)

In [None]:
# Decision Threshold based on Domain knowledge (reducing False Negative)
precision_recall_threshold(p,r,thresholds,0.5)