**READ ME:**

    1. CELLS IN THIS NOTEBOOK ARE INTENDED TO BE USED DEPENDING ON THE DATASET (Cosmetics/Electronics)
    2. PLEASE READ THE COMMENTS ABOVE EVERY CELL BEFORE EXECUTING/COMPILING THE CODE
    3. THIS NOTEBOOK DOES THE FOLLOWING -
    
       SECTION 0: SET UP THE NOTEBOOK
       SECTION 1: CLEAN AND PROCESS THE DATA
       SECTION 2: CREATE FEATURES BASED ON USER SESSIONS i.e. A USER'S MULTIPLE INTERACTIONS IN A GIVEN SESSION
       SECTION 3: SPLITTING DATA INTO TRAIN AND TEST
       SECTION 4: FEATURE RANKING
       SECTION 5: BALANCE THE DATA USING SMOTE
       SECTION 6: BASELINING
       SECTION 7: CREATING SAMPLES - NON STRATIFIED
       SECTION 8: CREATING SAMPLES - STRATIFIED
       SECTION 9: T-SNE
       SECTION 10: CLUSTERING

# 0. ***`SETUP`***

## i. SETTING UP NOTEBOOK

In [None]:
## This Cell is Only for Usage in a Google CoLab Environment
from google.colab import drive

In [None]:
## This Cell is Only for Usage in a Google CoLab Environment
ROOT = "/content/drive"
print(ROOT)
drive.mount(ROOT)

In [None]:
## This Cell is Only for Usage in a Google CoLab Environment
%pwd
%cd drive/My Drive/Colab Notebooks/Feature Engineering

## ii. Pre-Processing : Library and Data file import

### Library Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)
pd.set_option('display.max_columns', 999)

In [3]:
# Import from the public package path; `sklearn.ensemble.forest` is a private
# module that was deprecated and later removed from scikit-learn.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
import gc
import time

In [4]:
import imblearn
import collections
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where
print(imblearn.__version__)

0.7.0


In [5]:
from sklearn.utils import shuffle
from sklearn.manifold import TSNE

### Data Imports

Original CSV download

In [None]:
## Run this cell for COSMETICS data
#shoppers = pd.read_csv("cosmetics.csv")

In [None]:
## Run this cell for ELECTRONICS data
#shoppers = pd.read_csv("electronics.csv")

### Quick File Imports Below 

Run the cells below if you want to download already processed/compiled data files and skip the Feature Engineering Sections

Cleaned/Formatted DataFrame download

In [None]:
#Run this cell for Cosmetics Data ONLY
df = pd.read_pickle("dfCosmetics.pkl")

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
#Run this cell for Electronics Data ONLY
df = pd.read_pickle("dfELEC.pkl")

Unbalanced Features DataFrame Import

In [None]:
#Run this cell for COSMETICS Data ONLY
FeaturesDF = pd.read_pickle("FeaturesDFCosmetics.pkl")

In [None]:
#Run this cell for ELECTRONICS Data ONLY
FeaturesDF = pd.read_pickle("FeaturesDF.pkl")

In [None]:
# Every column except the last is a feature; the last column is the label.
XData, YData = FeaturesDF.iloc[:, :-1], FeaturesDF.iloc[:, -1]

## iii. CHECK RAM

In [None]:
#FINDING RAM
from psutil import virtual_memory
# `.total` is the total physical memory in bytes (not the currently free amount),
# so the message reports it as total RAM.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of total RAM\n'.format(ram_gb))

In [None]:
gc.collect()

#    
************************************************
************************************************

# 1. ***`PROCESSING DATA`***
Jump to Section 3 if you have imported FeaturesDF files

In [None]:
#Run this cell for COSMETICS Data ONLY
df = shoppers.copy()

In [None]:
#Run this cell for ELECTRONICS Data ONLY
# The raw CSV (either dataset) is loaded into `shoppers` by the Data Imports cells;
# no variable named `electronics` is ever created in this notebook.
df = shoppers.copy()

In [None]:
# Run this cell for COSMETICS data ONLY

# Remove both unused columns with a single in-place call.
df.drop(columns=['category_code', 'brand'], inplace=True)

In [None]:
# Run this cell for ELECTRONICS data ONLY

# Electronics keeps `category_code` and `brand`; only `category_id` is removed.
df.drop(columns='category_id', inplace=True)

In [None]:
# Remove rows that are repeated header lines, i.e. rows whose `event_time`
# cell contains the literal column name.
headerIndices = df.index[df['event_time'] == 'event_time']
df.drop(index=headerIndices, inplace=True)

**Formatting Date Time Column**

In [None]:
df['event_time'] = pd.to_datetime(df['event_time'],format = '%Y-%m-%d %H:%M:%S %Z')

In [None]:
#Year
df['year'] = (df['event_time']).dt.year

In [None]:
#Month
df['month'] = (df['event_time']).dt.month
MM = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
df['month'] = df['month'].map(MM)

In [None]:
# Phase of day
df['hour'] = df['event_time'].dt.hour

# Hour -> phase lookup, expanded from (phase, hours) pairs.
timeOfDay = {
    hour: phase
    for phase, hours in [
        ('Dawn', range(0, 5)),
        ('EarlyMorning', range(5, 9)),
        ('Morning', range(9, 12)),
        ('Noon', [12]),
        ('Afternoon', range(13, 17)),
        ('Evening', range(17, 21)),
        ('Night', range(21, 24)),
        ('Midnight', [24]),
    ]
    for hour in hours
}
df['timeOfDay'] = df['hour'].map(timeOfDay)

In [None]:
# Weekday
dayOfWeek={0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fr', 5:'Sat', 6:'Sun'}
df['weekday'] = (df['event_time']).dt.weekday.map(dayOfWeek)

In [None]:
# Is Weekend?
# `weekday` holds the abbreviated labels produced by the `dayOfWeek` mapping
# ('Sat' / 'Sun'), not full day names, so compare against the abbreviations.
# Comparing against 'Saturday' / 'Sunday' made this flag 0 for every row.
df['Weekend'] = df['weekday'].isin(['Sat', 'Sun']).astype(int)

**Formatting Other Column Types**

In [None]:
# Columns that must be numeric for the aggregations in Section 2.
for numeric_col in ['price', 'year', 'hour', 'Weekend']:
    df[numeric_col] = pd.to_numeric(df[numeric_col])

In [None]:
# Identifier / label columns are handled as strings.
for string_col in ['event_type', 'product_id', 'user_id', 'user_session',
                   'month', 'weekday', 'timeOfDay']:
    df[string_col] = df[string_col].astype(str)

# `category_id` only exists for the Cosmetics data; the Electronics cell above
# drops it, so casting it unconditionally raised a KeyError for Electronics.
if 'category_id' in df.columns:
    df['category_id'] = df['category_id'].astype(str)

In [None]:
df.head(3)

**SAVE TO PICKLE**

In [None]:
## Run this cell only for COSMETICS Data ONLY to Save and Load Pickle Files

#df.to_pickle("dfCosmetics.pkl")
#df = pd.read_pickle("dfCosmetics.pkl")

In [None]:
## Run this cell only for ELECTRONICS Data ONLY to Save and Load Pickle Files
## The file name matches the "dfELEC.pkl" read by the quick-import cell in Section 0
## (file names are case-sensitive on Linux / Colab).

#df.to_pickle("dfELEC.pkl")
#df = pd.read_pickle("dfELEC.pkl")

#    
************************************************
************************************************

# 2. **`CREATING SESSION BASED FEATURES`**

## Features to build:

    Weekday(Sun/Mon/...) [One hot encoded]
    Month [One hot encoded]
    TimeOfDay(Morning/Evening/....) [One hot encoded]
    Weekend
    InteractionTime(Duration of Session in seconds)
    
    NumTotalEventsInSession()
    NumViewEventsInSession
    NumCartEventsInSession
    NumRemoveEventsInSession --- Only for Cosmetics Data
    
    NumCategoriesViewedInSession
    NumCategoriesCartedInSession
    NumCategoriesRemovedInSession --- Only for Cosmetics Data
    
    NumBrandsViewedInSession --- Only for Electronics Data
    NumBrandsCartedInSession --- Only for Electronics Data
    
    NumProductsViewedInSession
    NumProductsCartedInSession
    NumProductsRemovedInSession --- Only for Cosmetics Data
    
    AverageAmountViewedInSession
    AverageAmountCartedInSession
    AverageAmountRemovedInSession --- Only for Cosmetics Data
    
    OverallAmtUserCarted  (The user's average carted price across all of their events)
    OverallAmtUserViewed  (The user's average viewed price across all of their events)
    OverallAmtUserRemoved (The user's average removed-from-cart price across all of their events) --- Only for Cosmetics Data

**NOTE : THE ELECTRONICS DATASET HAS NO REMOVE FEATURE, SO USE BELOW SECTIONS ACCORDINGLY**

1. **MODIFY COMMANDS TO NOT INCLUDE THE 'remove_from_cart' FEATURE when dealing with Electronics Data**

## ADDING COLUMNS TO DF -  AMOUNT BY USER FOR DIFFERENT EVENTS

In [None]:
# Average price per user for each event type; 'mean' is pivot_table's default
# aggregation and is spelled out here. Missing combinations become 0.
AmountbyUser = df.pivot_table(values='price', index='user_id', columns='event_type',
                              aggfunc='mean', fill_value=0)

In [None]:
#FOR COSMETICS DATA ONLY

AmountbyUser = AmountbyUser.rename(columns={'cart':'OverallAmtUserCarted','purchase':'OverallAmtUserPurchased',
                                            'view':'OverallAmtUserViewed', 'remove_from_cart' : 'OverallAmtUserRemoved'})

In [None]:
#FOR ELECTRONICS DATA ONLY

AmountbyUser = AmountbyUser.rename(columns={'cart':'OverallAmtUserCarted','purchase':'OverallAmtUserPurchased',
                                            'view':'OverallAmtUserViewed'})

In [None]:
df = pd.merge(df,AmountbyUser,on='user_id',how='outer')

## GROUPING THE DATA

by: 'user_session'


In [None]:
GroupUS = df.sort_values('event_time').groupby(['user_session'])

## Total Number of Events/Interactions in Session

In [None]:
# One row per session: how many events the session contains.
Features = GroupUS['event_type'].count().rename('TotalEventsInSession').reset_index()

In [None]:
Features = Features.set_index('user_session')
Features.head(3)

## Duration

In [None]:
# Defining function to return range of any given series
def ammd(series):
    """Return the range (maximum minus minimum) of `series`.

    The pandas reductions are used instead of the built-in `max` / `min`
    because the built-ins give order-dependent results when the data contains
    NaT/NaN (every comparison with a missing value is False), whereas the
    pandas reductions skip missing values.

    Parameters
    ----------
    series : pandas Series or any iterable accepted by `pd.Series`.

    Returns
    -------
    The difference between the largest and smallest non-missing value
    (a Timedelta for datetime input).
    """
    values = pd.Series(series)
    return values.max() - values.min()

In [None]:
# Session duration in seconds = last event timestamp - first event timestamp.
# The grouped max/min reductions are vectorised (and skip NaT), which avoids
# calling a Python function once per session. The result keeps the name
# 'event_time', which the next cell renames to 'interactionTime'.
interactionTime = GroupUS['event_time'].max() - GroupUS['event_time'].min()
interactionTime = interactionTime.dt.total_seconds()

In [None]:
Features = Features.join(interactionTime, how='outer')
Features = Features.rename(columns={'event_time':'interactionTime'})

In [None]:
Features.head(2)

##  Number of Interactions by Event Type in Session

In [None]:
#RUN THIS CELL FOR COSMETICS DATA ONLY
NumEventTime = GroupUS['event_type'].value_counts().unstack(fill_value = 0).rename(columns={'view':'NumTimesViewedInSession','cart':'NumTimesCartedInSession',
                                                                                            'purchase':'NumTimesPurchasedInSession','remove_from_cart' : 'NumTimesRemovedInSession'})

In [None]:
#RUN THIS CELL FOR ELECTRONICS DATA ONLY
NumEventTime = GroupUS['event_type'].value_counts().unstack(fill_value = 0).rename(columns={'view':'NumTimesViewedInSession','cart':'NumTimesCartedInSession',
                                                                                            'purchase':'NumTimesPurchasedInSession'})

In [None]:
Features = Features.join(NumEventTime, how ='outer')

In [None]:
Features.head(2)

## Max and Min Price

In [None]:
# Assign the grouped Series directly so each value is matched to its session
# through the `user_session` index, rather than relying on `.values` lining up
# row-by-row with the current order of `Features`.
Features['maxPrice'] = GroupUS['price'].max()
Features['minPrice'] = GroupUS['price'].min()

## AVERAGE AMOUNT BY EVENT IN SESSION

In [None]:
#RUN THIS CELL FOR COSMETICS DATA ONLY

AmountbyEvent = pd.pivot_table(df, index='user_session',columns='event_type',values='price',fill_value=0)
AmountbyEvent = AmountbyEvent.rename(columns={'cart':'AvgAmtCartedInSession','purchase':'AvgAmtPurchasedInSession',
                                              'view':'AvgAmtViewedInSession','remove_from_cart' : 'AvgAmountRemovedInSession'})

In [None]:
#RUN THIS CELL FOR ELECTRONICS DATA ONLY

AmountbyEvent = pd.pivot_table(df, index='user_session',columns='event_type',values='price',fill_value=0)
AmountbyEvent = AmountbyEvent.rename(columns={'cart':'AvgAmtCartedInSession','purchase':'AvgAmtPurchasedInSession',
                                              'view':'AvgAmtViewedInSession'})

In [None]:
Features = Features.join(AmountbyEvent,how='outer')

In [None]:
Features.head(2)

## BRAND COUNT BY EVENT IN SESSION



In [7]:
##--- RUN THIS CELL FOR THE ELECTRONICS DATA ONLY
##--- IN COSMETICS THE BRAND COLUMN HAS BEEN DROPPED DUE TO EXCESSIVE MISSING DATA

# BrandsBySession= pd.pivot_table(df, index='user_session',columns='event_type',values='brand',aggfunc='count',fill_value=0)
# BrandsBySession = BrandsBySession.rename(columns={'view':'NumBrandsViewedInSession','cart':'NumBrandsCartedInSession','purchase':'NumBrandsPurchasedInSession'})
# Features = pd.merge(Features,BrandsBySession,on='user_session',how='outer')

## CATEGORY COUNT BY EVENT IN SESSION

In [None]:
## RUN THIS CELL FOR COSMETICS DATA ONLY
## -- FOR THE COSMETICS DATA USE :values='category_id'

# 'nunique' counts the DISTINCT categories touched per session and event type.
# The previous aggfunc='count' counted events, which merely duplicated the
# NumTimes...InSession features.
CategoriesBySession = pd.pivot_table(df, index='user_session', columns='event_type',
                                     values='category_id', aggfunc='nunique', fill_value=0)
CategoriesBySession = CategoriesBySession.rename(columns={'view':'NumCategoriesViewedInSession','cart':'NumCategoriesCartedInSession',
                                                          'purchase':'NumCategoriesPurchasedInSession','remove_from_cart' : 'NumCategoriesRemovedInSession'})

In [None]:
## RUN THIS CELL FOR ELECTRONICS DATA ONLY
## -- FOR THE ELECTRONICS DATA USE :values='category_code'

# 'nunique' counts the DISTINCT (non-null) category codes touched per session
# and event type. The previous aggfunc='count' counted events instead.
CategoriesBySession = pd.pivot_table(df, index='user_session', columns='event_type',
                                     values='category_code', aggfunc='nunique', fill_value=0)
CategoriesBySession = CategoriesBySession.rename(columns={'view':'NumCategoriesViewedInSession','cart':'NumCategoriesCartedInSession',
                                                          'purchase':'NumCategoriesPurchasedInSession','remove_from_cart' : 'NumCategoriesRemovedInSession'})

In [None]:
Features = Features.join(CategoriesBySession,how='outer')

In [None]:
Features.head(2)

## ADDING FEATURES FOR AVERAGE OVERALL AMOUNT BY USER FOR DIFFERENT EVENTS

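NOTE: EACH `OverallAmtUser...` COLUMN ALREADY HOLDS ONE VALUE PER USER (THE USER'S AVERAGE PRICE FOR THAT EVENT TYPE), SO TAKING THE MAX WITHIN A SESSION SIMPLY CARRIES THAT PER-USER VALUE OVER TO THE SESSION ROW. THESE FEATURES ARE MEANT TO DIFFERENTIATE BETWEEN USERS.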

In [None]:
OverallAmtUserCarted = GroupUS['OverallAmtUserCarted'].max()

In [None]:
OverallAmtUserViewed = GroupUS['OverallAmtUserViewed'].max()

In [None]:
OverallAmtUserPurchased = GroupUS['OverallAmtUserPurchased'].max()

In [None]:
#Merging these individual dataframes

Features = Features.join(OverallAmtUserCarted, how='left')
Features = Features.join(OverallAmtUserViewed, how='left')
Features = Features.join(OverallAmtUserPurchased, how='left')

In [6]:
#RUN THIS CELL FOR COSMETICS DATA ONLY
OverallAmtUserRemoved = GroupUS['OverallAmtUserRemoved'].max()
Features = Features.join(OverallAmtUserRemoved, how='left')

In [None]:
Features.head(2)

## PRODUCT COUNT BY EVENT IN SESSION

In [None]:
#RUN THIS CELL FOR COSMETICS DATA ONLY
# 'nunique' counts the DISTINCT products touched per session and event type.
# The previous aggfunc='count' counted events, which made these columns exact
# copies of the NumTimes...InSession features.
ProductsBySession = pd.pivot_table(df, index='user_session', columns='event_type',
                                   values='product_id', aggfunc='nunique', fill_value=0)
ProductsBySession = ProductsBySession.rename(columns={'view':'NumProdsViewedInSession','cart':'NumProdsCartedInSession',
                                                      'purchase':'NumProdsPurchasedInSession','remove_from_cart' : 'NumProdsRemovedInSession'})

In [None]:
#RUN THIS CELL FOR ELECTRONICS DATA ONLY
# 'nunique' counts the DISTINCT products touched per session and event type.
# The previous aggfunc='count' counted events, which made these columns exact
# copies of the NumTimes...InSession features.
ProductsBySession = pd.pivot_table(df, index='user_session', columns='event_type',
                                   values='product_id', aggfunc='nunique', fill_value=0)
ProductsBySession = ProductsBySession.rename(columns={'view':'NumProdsViewedInSession','cart':'NumProdsCartedInSession',
                                                      'purchase':'NumProdsPurchasedInSession'})

In [None]:
Features = Features.join(ProductsBySession, how='left')

In [None]:
Features.head(1)

****************************************

**ONE HOT ENCODED FEATURES START HERE**

## Adding Date and Time Features 

By using get_dummies and not pivot_table we went from ~ 365 seconds to ~25 seconds i.e. 14 times faster

In [None]:
weekday = pd.get_dummies(df['weekday']).set_index(df['user_session'])
weekday = weekday.groupby('user_session').max()

In [None]:
year = pd.get_dummies(df['year']).set_index(df['user_session'])
year = year.groupby('user_session').max()

In [None]:
month = pd.get_dummies(df['month']).set_index(df['user_session'])
month = month.groupby('user_session').max()

In [None]:
timeOfDay = pd.get_dummies(df['timeOfDay']).set_index(df['user_session'])
timeOfDay = timeOfDay.groupby('user_session').max()

In [None]:
Features = Features.join(weekday,how='outer')
Features = Features.join(year,how='outer')
Features = Features.join(month,how='outer')
Features = Features.join(timeOfDay,how='outer')

In [None]:
# Per-session SUM of the row-level 0/1 `Weekend` flag, i.e. the number of
# events in the session that are flagged as weekend events.
# NOTE(review): the one-hot date features above are reduced with .max()
# (a 0/1 indicator per session) - confirm that a count is intended here.
Weekend = GroupUS['Weekend'].sum()
Features = Features.join(Weekend,how='outer')

In [None]:
Features.head(1)

## Dropping all Purchase related Columns and adding Y label 'Purchase'

In [None]:
# Binary target: 1 when the session contains at least one purchase event, else 0.
Features['Purchase'] = (Features['NumTimesPurchasedInSession'] != 0).astype(int)
# Move `user_session` from the index back into an ordinary column.
Features = Features.reset_index()

In [None]:
Features.columns

**Include 'NumBrandsPurchasedInSession' below for the Electronics DataSet**

In [None]:
#RUN THIS CELL FOR COSMETICS DATA ONLY

# The session id plus every purchase-derived column; only the `Purchase`
# label itself is kept from the purchase information.
columns_to_drop = [
    'user_session',
    'OverallAmtUserPurchased',
    'AvgAmtPurchasedInSession',
    'NumTimesPurchasedInSession',
    'NumCategoriesPurchasedInSession',
    'NumProdsPurchasedInSession',
]
FeaturesDF = Features.drop(columns=columns_to_drop)
                           

In [None]:
#RUN THIS CELL FOR ELECTRONICS DATA ONLY

# Drop the session id and EVERY purchase-derived column. The columns are
# discovered by name, so this works whether or not the optional (commented-out)
# brand cell was run; a hard-coded 'NumBrandsPurchasedInSession' raised a
# KeyError when it was not. The `Purchase` label does not contain 'Purchased'
# and is therefore kept.
purchase_columns = [col for col in Features.columns if 'Purchased' in str(col)]
FeaturesDF = Features.drop(columns=['user_session'] + purchase_columns)
                           

In [None]:
FeaturesDF.sort_values(by='Purchase',ascending=False)

In [None]:
display("The features we have are as follows:", FeaturesDF.columns.tolist())

print('************')
print("PLEASE NOTE THAT \n One-Hot-Encoded Date and Time features are dynamically added as per data")
print('************')

#    
************************************************
************************************************

# 3. ***`SPLITTING DATA`***

## 3a. Test-Train Split

In [None]:
#FeaturesDF.to_pickle('FeaturesDF.pkl')
#FeaturesDF = pd.read_pickle('FeaturesDF.pkl')

In [None]:
# Every column except the last is a feature; the last column is the label.
XData, YData = FeaturesDF.iloc[:, :-1], FeaturesDF.iloc[:, -1]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(XData,YData,test_size=0.2,random_state=42)

## 3b. Scaling with MinMax

In [None]:
MMscaler = MinMaxScaler()
X_train = MMscaler.fit_transform(X_train)
X_test = MMscaler.transform(X_test)

#    
************************************************
************************************************

# 4. ***`FEATURE RANKING`***

## 4a. Using Random Forest

In [None]:
# Seed the forest so the feature ranking is reproducible between runs
# (42 is the seed already used for the splits in this notebook).
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

In [None]:
sel.fit(X_train, y_train)

In [None]:
sel.get_support()

In [None]:
selected_feat= XData.columns[(sel.get_support())]

In [None]:
importances = sel.estimator_.feature_importances_
indices = np.argsort(importances)[::-1] 
colname = XData.columns[indices]
plt.figure(figsize=(15,9))
plt.title("Feature importances",size=20)
sns.barplot(x=colname, y=importances[indices],palette="deep")
plt.xticks(rotation=90,size=20)
plt.show()

## 4b. Using F Scores

In [None]:
import sklearn.feature_selection as fs

# k='all' keeps every feature, so the selector is only used for its scores.
kb = fs.SelectKBest(k='all')
kb.fit(X_train, y_train)
support_mask = kb.get_support()
names = XData.columns.values[support_mask]
scores = kb.scores_[support_mask]
names_scores = list(zip(names, scores))

In [None]:
fScoreDF = pd.DataFrame(data = names_scores, columns=['Feat_names','F_Scores'])
fScoreDF_sorted = fScoreDF.sort_values(['F_Scores','Feat_names'], ascending =[False, True])

In [None]:
plt.figure(figsize=(15,9))
sns.barplot(x= "Feat_names", y="F_Scores",data=fScoreDF_sorted)
plt.xticks(rotation=90,size=20)
plt.show()

#    
************************************************
************************************************

# 5. ***`SMOTE`***

#### SMOTE and PCA transformation

In [None]:
Counter(YData)

In [None]:
# Seed SMOTE so the synthetic minority samples are reproducible between runs
# (42 is the seed already used for the splits in this notebook).
oversample = SMOTE(random_state=42)

In [None]:
X, Y = oversample.fit_resample(XData, YData)

In [None]:
Counter(Y)

In [None]:
# Split the ORIGINAL data first and oversample only the training fold.
# Splitting the already-oversampled X, Y lets synthetic points interpolated
# from test rows leak into training and inflates the balanced-data scores.
# The test fold stays untouched, real (imbalanced) data; `stratify` keeps the
# class ratio the same in both folds.
trainX, testX, trainY, testY = train_test_split(XData, YData, test_size=0.2,
                                                random_state=42, stratify=YData)
trainX, trainY = oversample.fit_resample(trainX, trainY)

In [None]:
MMscaler = MinMaxScaler()
trainX = MMscaler.fit_transform(trainX)
testX = MMscaler.transform(testX)

In [None]:
pca = PCA(n_components=2)

In [None]:
pca.fit(trainX)

In [None]:
trainX = pca.transform(trainX)
testX = pca.transform(testX)

**Save/load pickle files for quick future usage**

In [None]:
# X.to_pickle('XDataSMOTE.pkl')
# X = pd.read_pickle('XDataSMOTE.pkl')
# Y.to_pickle('YDataSMOTE.pkl')
# Y = pd.read_pickle('YDataSMOTE.pkl')

#    
************************************************
************************************************

# 6. ***`BASELINING`***

## 6a. BASELINING --- non Balanced data

In [None]:
pca = PCA(n_components=2)

In [None]:
pca.fit(X_train)

In [None]:
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [None]:
logreg = LogisticRegression(solver='lbfgs', max_iter=1000,n_jobs=-1)

In [None]:
y_pred = cross_val_predict(logreg, X_train, y_train, cv=5)

In [None]:
print(metrics.accuracy_score(y_train, y_pred))

In [None]:
logreg.fit(X_train, y_train)

In [None]:
y_predFinal = logreg.predict(X_test)

In [None]:
print(metrics.accuracy_score(y_test, y_predFinal))

In [None]:
cnf_matrix = metrics.confusion_matrix(y_test, y_predFinal)
cnf_matrix

In [None]:
# sklearn orders confusion-matrix rows/columns by sorted label, i.e. [0, 1].
class_names = [0, 1]
fig, ax = plt.subplots()
# Label the matrix itself: ticks set before calling heatmap are overwritten by
# it, so the labels are attached through the DataFrame index/columns instead.
cnf_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cnf_frame, annot=True, cmap="Greens", fmt='g', ax=ax)
ax.xaxis.set_label_position("bottom")
fig.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

## 6b. BASELINING - Balanced data

In [None]:
logreg = LogisticRegression(solver='lbfgs', max_iter=1000,n_jobs=-1)

In [None]:
predY = cross_val_predict(logreg, trainX, trainY, cv=5)

In [None]:
print(metrics.accuracy_score(trainY, predY))

In [None]:
logreg.fit(trainX, trainY)

In [None]:
predFinalY = logreg.predict(testX)

In [None]:
print(metrics.accuracy_score(testY, predFinalY))

In [None]:
cnf_matrix_SMOTE = metrics.confusion_matrix(testY, predFinalY)
cnf_matrix_SMOTE

In [None]:
# name  of classes
class_names=[0,1] 
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names,size=20)
plt.yticks(tick_marks, class_names,size=20)

# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix_SMOTE), annot=True, cmap="Blues" ,fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix SMOTE',size = 20)
plt.ylabel('Actual label SMOTE ',size = 20)
plt.xlabel('Predicted label SMOTE',size = 20)

In [None]:
print("Accuracy:",metrics.accuracy_score(testY, predFinalY))
print("Precision:",metrics.precision_score(testY, predFinalY))
print("Recall:",metrics.recall_score(testY, predFinalY))

#    
************************************************
************************************************

# 7. ***`SUBSETTING DATA/CREATING SAMPLES`***

**NON STRATIFIED**

    FeaturesDF - All Features
    XData, YData - FeaturesDF split into x and y
    X,Y - SMOTE of XData and YData
    XSelect,YSelect- XData with select features
    XSample,YSample - XSelect Sliced upto 'n' rows non stratified

In [None]:
XData.shape

In [None]:
YData.shape

In [None]:
# FOR COSMETICS DATA
# Names must match the columns created in Section 2 exactly
# ('...InSession' with a capital I); the lower-case 'inSession' spellings
# raise a KeyError when used to index XData.

SelectFeatures = ['TotalEventsInSession', 'interactionTime',
                  'NumTimesCartedInSession', 'NumTimesRemovedInSession', 'NumTimesViewedInSession',
                  'maxPrice', 'minPrice',
                  'AvgAmtCartedInSession', 'AvgAmountRemovedInSession', 'AvgAmtViewedInSession',
                  'NumCategoriesViewedInSession', 'NumCategoriesCartedInSession', 'NumCategoriesRemovedInSession',
                  'OverallAmtUserCarted', 'OverallAmtUserViewed', 'OverallAmtUserRemoved',
                  'NumProdsViewedInSession', 'NumProdsCartedInSession', 'NumProdsRemovedInSession']

In [None]:
# FOR ELECTRONICS DATA
# Names must match the columns created in Section 2 exactly
# ('...InSession' with a capital I). The NumBrands* columns only exist when
# the optional brand cell in Section 2 has been uncommented and run.

SelectFeatures = ['TotalEventsInSession', 'interactionTime',
                  'NumTimesCartedInSession', 'NumTimesViewedInSession',
                  'maxPrice', 'minPrice',
                  'AvgAmtCartedInSession', 'AvgAmtViewedInSession',
                  'NumCategoriesCartedInSession', 'NumCategoriesViewedInSession',
                  'OverallAmtUserCarted', 'OverallAmtUserViewed',
                  'NumProdsCartedInSession', 'NumProdsViewedInSession',
                  'NumBrandsCartedInSession', 'NumBrandsViewedInSession']

Subsets for clustering

In [None]:
#Enter the desired subset size

Subset_size = 50000  

In [None]:
# Keep the selected feature columns and take the first `Subset_size` rows
# (a plain head slice, not stratified).
XSelect = XData[SelectFeatures]
XSample = XSelect.iloc[:Subset_size]
SampleCol = XSample.columns

# Rescale every column to [0, 1] and wrap the array back into a DataFrame.
MMscaler = MinMaxScaler()
XSample = pd.DataFrame(MMscaler.fit_transform(XSample), columns=SampleCol)

In [None]:
YSample = YData[:Subset_size]

In [None]:
YSample.value_counts()

In [None]:
display(XSample.head())
display(YSample.head())

#    
************************************************
************************************************

# 8. ***`STRATIFIED SAMPLING`***

XSampleSS - Stratified X Sample

YSampleSS - Stratified Y Sample


## Generating Sample

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size = 0.009, random_state=42)


In [None]:
# `sss.split` yields POSITIONAL row numbers, so rows are selected with .iloc;
# .loc only worked while the index happened to be a clean 0..n-1 RangeIndex.
# As before, the last of the generated splits is the one that is kept, but the
# frames are now sliced once instead of on every iteration.
for train_index, test_index in sss.split(XData, YData):
    pass

XSSS_Train = XData.iloc[train_index]
XSSS_Test = XData.iloc[test_index]
YSSS_Train = YData.iloc[train_index]
YSSS_Test = YData.iloc[test_index]


In [None]:
XSampleSS = XSSS_Test
print(XSampleSS.shape)
YSampleSS = YSSS_Test
print(YSampleSS.shape)

In [None]:
# FOR COSMETICS DATA
# Names must match the columns created in Section 2 exactly
# ('...InSession' with a capital I); the lower-case 'inSession' spellings
# raise a KeyError when used to index the sample.

SelectFeatures = ['TotalEventsInSession', 'interactionTime',
                  'NumTimesCartedInSession', 'NumTimesRemovedInSession', 'NumTimesViewedInSession',
                  'maxPrice', 'minPrice',
                  'AvgAmtCartedInSession', 'AvgAmountRemovedInSession', 'AvgAmtViewedInSession',
                  'NumCategoriesViewedInSession', 'NumCategoriesCartedInSession', 'NumCategoriesRemovedInSession',
                  'OverallAmtUserCarted', 'OverallAmtUserViewed', 'OverallAmtUserRemoved',
                  'NumProdsViewedInSession', 'NumProdsCartedInSession', 'NumProdsRemovedInSession']

In [None]:
# FOR ELECTRONICS DATA
# Names must match the columns created in Section 2 exactly
# ('...InSession' with a capital I). The NumBrands* columns only exist when
# the optional brand cell in Section 2 has been uncommented and run.

SelectFeatures = ['TotalEventsInSession', 'interactionTime',
                  'NumTimesCartedInSession', 'NumTimesViewedInSession',
                  'maxPrice', 'minPrice',
                  'AvgAmtCartedInSession', 'AvgAmtViewedInSession',
                  'NumCategoriesCartedInSession', 'NumCategoriesViewedInSession',
                  'OverallAmtUserCarted', 'OverallAmtUserViewed',
                  'NumProdsCartedInSession', 'NumProdsViewedInSession',
                  'NumBrandsCartedInSession', 'NumBrandsViewedInSession']

In [None]:
XSampleSS = XSampleSS[SelectFeatures]

In [None]:
XSampleSS = XSampleSS.iloc[:,:]
SampleColSS = XSampleSS.columns

In [None]:
# Use a scaler dedicated to the stratified sample. Re-fitting the shared
# `MMscaler` depended on an earlier section having created it and silently
# overwrote the statistics that scaler had learned there.
MMscalerSS = MinMaxScaler()
XSampleSS = MMscalerSS.fit_transform(XSampleSS)
XSampleSS = pd.DataFrame(XSampleSS,columns=SampleColSS)

In [None]:
XSampleSS.head()

In [None]:
YSampleSS.values

In [None]:
YSampleSS.value_counts()

#    
************************************************
************************************************

# 9. ***`T-SNE`***

## T-SNE FUNCTIONS SETUP

In [None]:
def generate_tsne(sample_X, sample_Y, perp, n_iter=300, learning_rate=100, random_state=None):
    """Embed `sample_X` into two t-SNE components and attach the labels.

    Parameters
    ----------
    sample_X : feature matrix passed to `TSNE.fit_transform`.
    sample_Y : labels, one per row of `sample_X`; must expose `.values`
        (e.g. a pandas Series).
    perp : t-SNE perplexity (5 to 50, increase with density).
    n_iter : number of optimisation iterations (default 300, as before).
    learning_rate : t-SNE learning rate (default 100, as before).
    random_state : optional seed; pass an int to make the embedding
        reproducible. The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame with columns 'comp1', 'comp2' and 'label'.
    """
    tsne = TSNE(n_components=2, perplexity=perp, n_iter=n_iter,
                learning_rate=learning_rate, random_state=random_state)
    tsne_results = tsne.fit_transform(sample_X)
    df_tsne = pd.DataFrame(tsne_results, columns=['comp1', 'comp2'])
    # `.values` drops the original index so labels are attached by position.
    df_tsne['label'] = sample_Y.values
    return df_tsne

In [None]:
def viz_TSNE(df_tsne):
    """Scatter-plot a t-SNE embedding, coloured by label.

    Parameters
    ----------
    df_tsne : DataFrame with columns 'comp1', 'comp2' and 'label'
        (the output of `generate_tsne`).
    """
    # `lmplot` is a figure-level function that creates its own figure, so a
    # preceding plt.figure(figsize=(9, 6)) only left an empty figure behind.
    # The intended 9 x 6 size is requested through height / aspect instead
    # (width = height * aspect = 6 * 1.5).
    sns.lmplot(x='comp1', y='comp2', data=df_tsne, hue='label', fit_reg=False,
               height=6, aspect=1.5)
    plt.show()

## **TSNE ITERATIONS**

## Trial 1

Using a Stratified Sample

(n_components=2, perplexity = perp, n_iter = 300, learning_rate=100)

Sample Size ~ 56k


### 1.1
Perplexity = 10

In [None]:
dfSS10 = generate_tsne(XSampleSS, YSampleSS, 10)

In [None]:
viz_TSNE(dfSS10)

### 1.2
Perplexity = 30

In [None]:
dfSS30= generate_tsne(XSampleSS, YSampleSS, 30)

In [None]:
viz_TSNE(dfSS30)

### 1.3
Perplexity = 50

In [None]:
dfSS50= generate_tsne(XSampleSS, YSampleSS, 50)

In [None]:
viz_TSNE(dfSS50)

#    
************************************************
************************************************

# 10. ***`CLUSTERING`***

## Visualizing PCA of entire Data | PRE - CLUSTERING

In [None]:
X_principal = pd.read_pickle("X_principal.pkl")

In [None]:
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')

xs = X_principal['P1']
ys = X_principal['P2']
zs = Y
ax.scatter(xs, ys, zs, s=50, alpha=0.6, edgecolors='w')

ax.set_xlabel('P1')
ax.set_ylabel('P2')
ax.set_zlabel('Label')

## DBSCAN FUNCTION

In [None]:
def dbscan(X, eps, min_samples):
    """Cluster `X` with DBSCAN, plot the result and print the cluster count.

    Parameters
    ----------
    X : pandas DataFrame; its first two columns are used as the plot axes.
    eps : DBSCAN neighbourhood radius.
    min_samples : DBSCAN minimum number of points for a core sample.

    Side effects: draws a scatter plot coloured by cluster label and prints
    the estimated number of clusters (noise, labelled -1, is not counted).
    """
    points = X.iloc[:, :].values
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # fit_predict fits the model and returns the labels in a single pass; the
    # earlier separate fit() followed by fit_predict() clustered the data twice.
    labels = db.fit_predict(points)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap='Paired')
    plt.title("DBSCAN")
    print('Estimated number of clusters: %d' % n_clusters_)

####  *Finding Optimal Eps value for DBSCAN*

In [None]:
NN = NearestNeighbors(n_neighbors=100)
nbrs = NN.fit(XSampleSS)

In [None]:
# Distances from every sample to its nearest neighbours (one column per neighbour rank).
distances, indices = nbrs.kneighbors(XSampleSS)
# Sort each neighbour-rank column independently, in ascending order.
distances = np.sort(distances, axis=0)
# Keep only column 1 - presumably the distance to the closest OTHER point,
# since column 0 should be the query point itself when querying the fitted data.
# NOTE(review): `nbrs` is built with n_neighbors=100 but only this one column is
# plotted; for DBSCAN the k-distance curve is usually drawn for k = min_samples -
# confirm which neighbour rank is intended.
distances = distances[:,1]
plt.plot(distances)
# Zoom the y-axis to the [0, 1] range.
plt.ylim([0,1])
plt.show()

## CLUSTERING after PCA

### Number of records 50k

In [None]:
#dbscan(Xf,0.03,1000)

In [None]:
#dbscan(Xf,0.05,1000)

In [None]:
#dbscan(Xf,0.05,100)

In [None]:
#dbscan(Xf,0.01,100)

## CLUSTERING after TSNE

In [None]:
dfX = dfSS30.loc[:,:'comp2']
dfX.head(1)

In [None]:
dbscan(dfX, 0.05, 100)

#    
************************************************
************************************************