# Model Analysis
### This is an analysis on Walmart dataset obtained from kaggle repository.

This analysis is as follows. 
1. Feature Engineering.
2. Data pre-processing
3. Test and Train data (Split)

Machine Leaning Models used
1. K Nearest Neighbours Classifier (KNN)
2. Logistic Regression Classifier(LR)
3. Naive Bayes Classifier(NB)
4. Decision Tree Classifier(DT)
5. SVM Classifier(SVM)
6. Random Forest Classifier(RF)


### All the above machine learning classifiers will have reported based on 2 key metrices
1. classification accuracy(For both train and test set)
2. classification report(For test set only)

In [1]:
#importing all essential libraries
import pandas as pd
import numpy as np
import re 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
%matplotlib inline

In [2]:
#importing the dataset 
data = pd.read_csv("train.csv")

In [3]:
#checking rows and columns in the data
data.shape

(647054, 7)

## 1. Feature Engineering

In [4]:
features = ["Weekday", "NumItems", "Return", 'ACCESSORIES',
       'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
       'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
       'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
       'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
       'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
       'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
       'GROCERY DRY GOODS', 'HARDWARE', 'HOME DECOR',
       'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
       'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
       'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
       'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
       'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
       'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING',  'MENSWEAR',
       'OFFICE SUPPLIES', 'OPTICAL - FRAMES', 'OPTICAL - LENSES',
       'OTHER DEPARTMENTS', 'PAINT AND ACCESSORIES', 'PERSONAL CARE',
       'PETS AND SUPPLIES', 'PHARMACY OTC', 'PHARMACY RX',
       'PLAYERS AND ELECTRONICS', 'PLUS AND MATERNITY', 'PRE PACKED DELI',
       'PRODUCE', 'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY',
       'SHOES', 'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS',
       'SWIMWEAR/OUTERWEAR', 'TOYS', 'WIRELESS']
#Necessary feature building
def transform_my_data(data):
    
    dummies = pd.get_dummies(data.DepartmentDescription)
    data[dummies.columns] = dummies 
    data_dummies = data.iloc[:,7:]
    data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])
    data_dummies = data_dummies.replace(-0,0)

    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    data = data[["TripType", "VisitNumber", "Weekday", "ScanCount", "Return"]]
    data = data.rename(columns={"ScanCount":"NumItems"})
    data = pd.concat([data, data_dummies], axis=1)
    grouped = data.groupby("VisitNumber")
    grouped = grouped.agg({'Weekday': np.max, "TripType": np.max, 'NumItems': np.sum, 'Return': np.max, 
              '1-HR PHOTO': np.sum, 'ACCESSORIES': np.sum,
       'AUTOMOTIVE': np.sum, 'BAKERY': np.sum, 'BATH AND SHOWER': np.sum, 'BEAUTY': np.sum, 'BEDDING': np.sum,
       'BOOKS AND MAGAZINES': np.sum, 'BOYS WEAR': np.sum, 'BRAS & SHAPEWEAR': np.sum,
       'CAMERAS AND SUPPLIES': np.sum, 'CANDY, TOBACCO, COOKIES': np.sum, 'CELEBRATION': np.sum,
       'COMM BREAD': np.sum, 'CONCEPT STORES': np.sum, 'COOK AND DINE': np.sum, 'DAIRY': np.sum, 'DSD GROCERY': np.sum,
       'ELECTRONICS': np.sum, 'FABRICS AND CRAFTS': np.sum, 'FINANCIAL SERVICES': np.sum,
       'FROZEN FOODS': np.sum, 'FURNITURE': np.sum, 'GIRLS WEAR, 4-6X  AND 7-14': np.sum,
       'GROCERY DRY GOODS': np.sum, 'HARDWARE': np.sum, 'HEALTH AND BEAUTY AIDS': np.sum, 'HOME DECOR': np.sum,
       'HOME MANAGEMENT': np.sum, 'HORTICULTURE AND ACCESS': np.sum,
       'HOUSEHOLD CHEMICALS/SUPP': np.sum, 'HOUSEHOLD PAPER GOODS': np.sum,
       'IMPULSE MERCHANDISE': np.sum, 'INFANT APPAREL': np.sum, 'INFANT CONSUMABLE HARDLINES': np.sum,
       'JEWELRY AND SUNGLASSES': np.sum, 'LADIES SOCKS': np.sum, 'LADIESWEAR': np.sum,
       'LARGE HOUSEHOLD GOODS': np.sum, 'LAWN AND GARDEN': np.sum, 'LIQUOR,WINE,BEER': np.sum,
       'MEAT - FRESH & FROZEN': np.sum, 'MEDIA AND GAMING': np.sum,'MENSWEAR': np.sum,
       'OFFICE SUPPLIES': np.sum, 'OPTICAL - FRAMES': np.sum, 'OPTICAL - LENSES': np.sum,
       'OTHER DEPARTMENTS': np.sum, 'PAINT AND ACCESSORIES': np.sum, 'PERSONAL CARE': np.sum,
       'PETS AND SUPPLIES': np.sum, 'PHARMACY OTC': np.sum, 'PHARMACY RX': np.sum,
       'PLAYERS AND ELECTRONICS': np.sum, 'PLUS AND MATERNITY': np.sum, 'PRE PACKED DELI': np.sum,
       'PRODUCE': np.sum, 'SEAFOOD': np.sum, 'SEASONAL': np.sum, 'SERVICE DELI': np.sum, 'SHEER HOSIERY': np.sum,
       'SHOES': np.sum, 'SLEEPWEAR/FOUNDATIONS': np.sum, 'SPORTING GOODS': np.sum,
       'SWIMWEAR/OUTERWEAR': np.sum, 'TOYS': np.sum, 'WIRELESS': np.sum})
    data = grouped[["TripType", "Weekday", "NumItems", "Return",'1-HR PHOTO', 'ACCESSORIES',
           'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
           'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
           'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
           'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
           'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
           'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
           'GROCERY DRY GOODS', 'HARDWARE', 'HEALTH AND BEAUTY AIDS', 'HOME DECOR',
           'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
           'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
           'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
           'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
           'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
           'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING', 'MENSWEAR',
           'OFFICE SUPPLIES', 'OPTICAL - FRAMES', 'OPTICAL - LENSES',
           'OTHER DEPARTMENTS', 'PAINT AND ACCESSORIES', 'PERSONAL CARE',
           'PETS AND SUPPLIES', 'PHARMACY OTC', 'PHARMACY RX',
           'PLAYERS AND ELECTRONICS', 'PLUS AND MATERNITY', 'PRE PACKED DELI',
           'PRODUCE', 'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY',
           'SHOES', 'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS',
           'SWIMWEAR/OUTERWEAR', 'TOYS', 'WIRELESS']]
    return data
data=transform_my_data(data)

In [5]:
#checking rows and columns in the data after feature building
data.shape

(95674, 71)

## 2. Test and Train Split

In [7]:
#Transforming qualitative variables into quantitative (Data Pre-processing)
def transform(day,no):
    global data
    data=data.replace(day,no)
    return;
transform("Monday", 1)
transform("Tuesday", 2)
transform("Wednesday", 3)
transform("Thursday", 4)
transform("Friday", 5)
transform("Saturday", 6)
transform("Sunday", 7)
#Removing redundant data from department description column
data=data.replace("MENS WEAR","MENSWEAR")

In [8]:
#splitting data into test and train
train, test = train_test_split(data, test_size = 0.2)
#Dropping na values(Data pre-processsing)
test=test.dropna()
train=train.dropna()
train = train

In [9]:
train.T.apply(lambda x: x.nunique(),axis=1)

TripType                       38
Weekday                         7
NumItems                      137
Return                          2
1-HR PHOTO                     11
ACCESSORIES                    14
AUTOMOTIVE                     22
BAKERY                         17
BATH AND SHOWER                26
BEAUTY                         31
BEDDING                        20
BOOKS AND MAGAZINES            11
BOYS WEAR                      24
BRAS & SHAPEWEAR               15
CAMERAS AND SUPPLIES            7
CANDY, TOBACCO, COOKIES        29
CELEBRATION                    39
COMM BREAD                     20
CONCEPT STORES                  3
COOK AND DINE                  31
DAIRY                          32
DSD GROCERY                    41
ELECTRONICS                    12
FABRICS AND CRAFTS             33
FINANCIAL SERVICES             15
FROZEN FOODS                   29
FURNITURE                      13
GIRLS WEAR, 4-6X  AND 7-14     24
GROCERY DRY GOODS              62
HARDWARE      

In [10]:
train

Unnamed: 0_level_0,TripType,Weekday,NumItems,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27402,999,2,0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56513,15,7,3,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148165,9,7,3,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103874,7,7,4,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91361,6,5,2,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55909,8,6,2,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23707,30,1,2,0.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
64194,36,1,6,0.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
77486,43,3,6,0.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
18294,8,7,1,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#Grouping TripType based on Weekday values
def get_count(values):
    return len(values)
grouped_count = train.groupby(['TripType',"Weekday"]).TripType.agg(get_count)
grouped_count

TripType  Weekday
3         1           349
          2           379
          3           366
          4           312
          5           629
          6           495
          7           375
4         1            31
          2            34
          3            34
          4            34
          5            56
          6            54
          7            36
5         1           460
          2           486
          3           532
          4           524
          5           603
          6           582
          7           452
6         1           107
          2           109
          3           127
          4           109
          5           180
          6           230
          7           133
7         1           627
          2           619
                     ... 
41        6           105
          7           111
42        1           152
          2           166
          3           166
          4           173
          5         

### Q3. How can retailers identify the right customers for cross-selling goods in the near future?

###  1. KNN

In [12]:
#Using KNeighborsClassifier with default parameters 
knn = KNeighborsClassifier(n_neighbors=5)
knn = knn.fit(np.asarray(train[features]),np.asarray(train.TripType))

In [13]:
train_prediction = knn.predict(np.asarray(train[features]))
test_prediction = knn.predict(np.asarray(test[features]))

In [15]:
#Accuracy Train and test
print("Accuracy for K Nearesr Neighbors - ")
print("Train set :", accuracy_score(train.TripType,train_prediction))
print("Test set  :", accuracy_score(test.TripType,test_prediction))

Accuracy for K Nearesr Neighbors - 
Train set : 0.7031839977005186
Test set  : 0.6255030049647243


In [17]:
print(classification_report(test["TripType"],test_prediction))

              precision    recall  f1-score   support

           3       0.77      0.90      0.83       738
           4       0.09      0.04      0.06        67
           5       0.70      0.81      0.75       954
           6       0.67      0.63      0.65       282
           7       0.58      0.64      0.61      1181
           8       0.66      0.85      0.74      2391
           9       0.63      0.75      0.68      1894
          12       0.12      0.06      0.08        53
          14       0.00      0.00      0.00         1
          15       0.42      0.39      0.40       174
          18       0.33      0.32      0.32       101
          19       0.32      0.20      0.24        90
          20       0.62      0.55      0.58       126
          21       0.52      0.59      0.55       117
          22       0.47      0.22      0.30       177
          23       0.44      0.17      0.24        24
          24       0.57      0.46      0.51       526
          25       0.44    

  'precision', 'predicted', average, warn_for)


### 2. LR

In [19]:
#Logistic Regression
train.TripType = train.TripType.astype(float)
train.Weekday = train.Weekday.astype(float)  
test.TripType = test.TripType.astype(float)
test.Weekday = test.Weekday.astype(float)  

In [21]:
lr = LogisticRegression()
lr.fit(X = np.asarray(train[features]), y = np.asarray(train.TripType))



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
train_prediction = lr.predict(np.asarray(train[features]))

In [23]:
test_prediction = lr.predict(np.asarray(test[features]))

In [25]:
print("Accuracy for Logistic Regression -")
print("Train set :", accuracy_score(train.TripType,train_prediction))
print("Test set  :", accuracy_score(test.TripType,test_prediction))

Accuracy for Logistic Regression -
Train set : 0.6375311932478867
Test set  : 0.6347008100339692


In [26]:
pd.crosstab(test.TripType,test_prediction, rownames=['True'], colnames=['Predicted'], margins=True)
print(classification_report(test["TripType"], test_prediction))

              precision    recall  f1-score   support

         3.0       0.75      0.94      0.84       738
         4.0       0.00      0.00      0.00        67
         5.0       0.75      0.66      0.70       954
         6.0       0.67      0.47      0.55       282
         7.0       0.64      0.65      0.65      1181
         8.0       0.58      0.90      0.71      2391
         9.0       0.63      0.78      0.70      1894
        12.0       0.50      0.06      0.10        53
        14.0       0.00      0.00      0.00         1
        15.0       0.52      0.32      0.39       174
        18.0       0.42      0.31      0.35       101
        19.0       0.45      0.06      0.10        90
        20.0       0.62      0.64      0.63       126
        21.0       0.59      0.48      0.53       117
        22.0       0.45      0.29      0.35       177
        23.0       0.00      0.00      0.00        24
        24.0       0.60      0.57      0.58       526
        25.0       0.67    

  'precision', 'predicted', average, warn_for)


### 3. NB


In [28]:
naive_bayes = GaussianNB()
naive_bayes  = naive_bayes.fit(np.asarray(train[features]), np.asarray(train.TripType))

In [29]:
train_prediction = naive_bayes.predict(np.asarray(train[features]))
test_prediction = naive_bayes.predict(np.asarray(test[features]))

In [31]:
#Accuracy of naive Bayes
print("Accuracy for Naive Bayes -")
print("Train set :", accuracy_score(train.TripType,train_prediction))
print("Test set  :", accuracy_score(test.TripType,test_prediction))

Accuracy for Naive Bayes -
Train set : 0.10780125164948588
Test set  : 0.10697674418604651


In [32]:
print(classification_report(test["TripType"], test_prediction))

              precision    recall  f1-score   support

         3.0       0.95      0.03      0.05       738
         4.0       0.01      0.24      0.01        67
         5.0       0.90      0.12      0.22       954
         6.0       0.13      0.53      0.20       282
         7.0       0.56      0.10      0.16      1181
         8.0       0.41      0.01      0.01      2391
         9.0       0.55      0.00      0.01      1894
        12.0       0.02      0.40      0.03        53
        14.0       0.00      0.00      0.00         1
        15.0       0.15      0.09      0.11       174
        18.0       0.06      0.33      0.10       101
        19.0       0.04      0.11      0.06        90
        20.0       0.18      0.75      0.29       126
        21.0       0.04      0.38      0.08       117
        22.0       0.25      0.16      0.20       177
        23.0       0.00      0.58      0.01        24
        24.0       0.56      0.21      0.30       526
        25.0       0.61    

### 4. DT

In [34]:
# Decision Tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(np.asarray(train[features]), np.asarray(train.TripType))

In [36]:
train_prediction = clf.predict(np.asarray(train[features]))
test_prediction = clf.predict(np.asarray(test[features]))

In [38]:
print("Accuracy for Decision Trees- ")
print("Train set :", accuracy_score(train.TripType,train_prediction))
print("Test set  :", accuracy_score(test.TripType,test_prediction))

Accuracy for Decision Trees- 
Train set : 0.9271482512183331
Test set  : 0.6031356153645153


In [39]:
print(classification_report(test["TripType"], test_prediction))

              precision    recall  f1-score   support

         3.0       0.81      0.94      0.87       738
         4.0       0.00      0.00      0.00        67
         5.0       0.73      0.80      0.76       954
         6.0       0.69      0.65      0.67       282
         7.0       0.61      0.61      0.61      1181
         8.0       0.74      0.84      0.79      2391
         9.0       0.68      0.75      0.71      1894
        12.0       0.05      0.04      0.04        53
        14.0       0.00      0.00      0.00         1
        15.0       0.36      0.34      0.35       174
        18.0       0.34      0.31      0.32       101
        19.0       0.23      0.12      0.16        90
        20.0       0.59      0.48      0.53       126
        21.0       0.50      0.46      0.48       117
        22.0       0.32      0.22      0.26       177
        23.0       0.27      0.25      0.26        24
        24.0       0.48      0.44      0.46       526
        25.0       0.42    

### 5. SVM

In [43]:
#SVM
svm_model = svm.SVC(decision_function_shape='ovr')
svm_model = svm_model.fit(np.asarray(train[features]), np.asarray(train.TripType))



In [44]:
predictions = svm_model.predict(np.asarray(test[features]))

In [46]:
predictions1 = svm_model.predict(np.asarray(train[features]))

In [47]:
print("Accuracy for SVM - ")
print("Train set :", accuracy_score(train.TripType,predictions1))
print("Test set  :", accuracy_score(test.TripType,predictions))

Accuracy for SVM - 
Train set : 0.7113105736944564
Test set  : 0.6839822315129345


In [48]:
print(classification_report(test["TripType"], predictions))

              precision    recall  f1-score   support

         3.0       0.78      0.93      0.85       738
         4.0       0.00      0.00      0.00        67
         5.0       0.69      0.87      0.77       954
         6.0       0.67      0.51      0.58       282
         7.0       0.70      0.67      0.68      1181
         8.0       0.78      0.85      0.81      2391
         9.0       0.65      0.86      0.74      1894
        12.0       0.50      0.02      0.04        53
        14.0       0.00      0.00      0.00         1
        15.0       0.55      0.39      0.45       174
        18.0       0.37      0.42      0.39       101
        19.0       0.38      0.03      0.06        90
        20.0       0.61      0.68      0.64       126
        21.0       0.54      0.64      0.58       117
        22.0       0.48      0.18      0.26       177
        23.0       0.00      0.00      0.00        24
        24.0       0.61      0.53      0.57       526
        25.0       0.71    

  'precision', 'predicted', average, warn_for)


### 6. RF

In [50]:
#Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest = random_forest.fit(np.asarray(train[features]), np.asarray(train.TripType))

In [52]:
test_predictions = random_forest.predict(np.asarray(test[features]))
train_predictions = clf.predict(np.asarray(train[features]))

In [53]:
print("Accuracy for RF - ")
print("Train set :", accuracy_score(train.TripType,train_predictions))
print("Test set  :", accuracy_score(test.TripType,test_predictions))

Accuracy for RF - 
Train set : 0.9271482512183331
Test set  : 0.6618761431931016


In [54]:
print(classification_report(test["TripType"], predictions))

              precision    recall  f1-score   support

         3.0       0.81      0.95      0.87       738
         4.0       0.00      0.00      0.00        67
         5.0       0.74      0.82      0.78       954
         6.0       0.73      0.69      0.71       282
         7.0       0.66      0.65      0.66      1181
         8.0       0.76      0.85      0.80      2391
         9.0       0.69      0.77      0.73      1894
        12.0       0.14      0.02      0.03        53
        14.0       0.00      0.00      0.00         1
        15.0       0.47      0.40      0.43       174
        18.0       0.43      0.31      0.36       101
        19.0       0.35      0.13      0.19        90
        20.0       0.66      0.55      0.60       126
        21.0       0.54      0.51      0.53       117
        22.0       0.44      0.24      0.31       177
        23.0       0.43      0.25      0.32        24
        24.0       0.55      0.55      0.55       526
        25.0       0.56    

  'precision', 'predicted', average, warn_for)
