In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [32]:
data = pd.read_csv('pizza_hut.txt', sep = ';', delimiter=None)

In [33]:
data.count()

DW_CUSTOMER                 100000
FL_HUT_LOVER                100000
DAYPART_COUNT_AFTERNOON      43855
DAYPART_COUNT_DINNER         74447
DAYPART_COUNT_EVENING        24200
                             ...  
LATEST_CNT_OPTION_Sauce      22834
LATEST_CNT_OPTION_Cheese     14045
LATEST_CNT_OPTION_Nfresh     75440
LATEST_CNT_OPTION_Vfresh     49774
LATEST_CNT_OPTION_Others      4238
Length: 319, dtype: int64

In [34]:
data['DW_CUSTOMER'].value_counts()

92409854     1
108559066    1
81452265     1
115136570    1
38211556     1
            ..
102341913    1
104693181    1
50439454     1
41460494     1
106561536    1
Name: DW_CUSTOMER, Length: 100000, dtype: int64

In [35]:
data.drop(columns = 'FL_HUT_LOVER', axis=1, inplace=True)

In [36]:
data.head()

Unnamed: 0,DW_CUSTOMER,DAYPART_COUNT_AFTERNOON,DAYPART_COUNT_DINNER,DAYPART_COUNT_EVENING,DAYPART_COUNT_LUNCH,DAYPART_COUNT_MID_MORNING,DAYPART_COUNT_LATE_NIGHT,DAYPART_COUNT_EARLY_MORNING,AVG_BILL_VALUE_DAYPART_AFTERNOON,AVG_BILL_VALUE_DAYPART_DINNER,...,L6M_CNT_OPTION_Cheese,L6M_CNT_OPTION_Nfresh,L6M_CNT_OPTION_Others,L6M_CNT_OPTION_Sauce,L6M_CNT_OPTION_Vfresh,LATEST_CNT_OPTION_Sauce,LATEST_CNT_OPTION_Cheese,LATEST_CNT_OPTION_Nfresh,LATEST_CNT_OPTION_Vfresh,LATEST_CNT_OPTION_Others
0,4108,,1.0,,,,,,,10.0,...,,,,,1.0,,,,1.0,
1,9668,1.0,,,,,,,24.0,,...,,1.0,,,,,,1.0,,
2,12566,,7.0,,,,,,,12.107143,...,8.0,6.0,,,16.0,,2.0,1.0,4.0,
3,12646,,2.0,,2.0,,,,,11.525,...,,6.0,,,1.0,,,5.0,1.0,
4,13678,,1.0,,,,,,,27.0,...,,1.0,,,2.0,,,1.0,2.0,


In [37]:
data.columns  

Index(['DW_CUSTOMER', 'DAYPART_COUNT_AFTERNOON', 'DAYPART_COUNT_DINNER',
       'DAYPART_COUNT_EVENING', 'DAYPART_COUNT_LUNCH',
       'DAYPART_COUNT_MID_MORNING', 'DAYPART_COUNT_LATE_NIGHT',
       'DAYPART_COUNT_EARLY_MORNING', 'AVG_BILL_VALUE_DAYPART_AFTERNOON',
       'AVG_BILL_VALUE_DAYPART_DINNER',
       ...
       'L6M_CNT_OPTION_Cheese', 'L6M_CNT_OPTION_Nfresh',
       'L6M_CNT_OPTION_Others', 'L6M_CNT_OPTION_Sauce',
       'L6M_CNT_OPTION_Vfresh', 'LATEST_CNT_OPTION_Sauce',
       'LATEST_CNT_OPTION_Cheese', 'LATEST_CNT_OPTION_Nfresh',
       'LATEST_CNT_OPTION_Vfresh', 'LATEST_CNT_OPTION_Others'],
      dtype='object', length=318)

### feature-wise EDA

In [39]:
# data['FL_HUT_LOVER'].value_counts()

In [40]:
data[['DAYPART_COUNT_AFTERNOON',
'DAYPART_COUNT_DINNER',
'DAYPART_COUNT_EVENING',
'DAYPART_COUNT_LUNCH',
'DAYPART_COUNT_MID_MORNING',
'DAYPART_COUNT_LATE_NIGHT',
'DAYPART_COUNT_EARLY_MORNING'
]].head()

Unnamed: 0,DAYPART_COUNT_AFTERNOON,DAYPART_COUNT_DINNER,DAYPART_COUNT_EVENING,DAYPART_COUNT_LUNCH,DAYPART_COUNT_MID_MORNING,DAYPART_COUNT_LATE_NIGHT,DAYPART_COUNT_EARLY_MORNING
0,,1.0,,,,,
1,1.0,,,,,,
2,,7.0,,,,,
3,,2.0,,2.0,,,
4,,1.0,,,,,


In [41]:
data = data[['DAYPART_COUNT_AFTERNOON',
'DAYPART_COUNT_DINNER',
'DAYPART_COUNT_EVENING',
'DAYPART_COUNT_LUNCH',
'DAYPART_COUNT_MID_MORNING',
'DAYPART_COUNT_LATE_NIGHT',
'DAYPART_COUNT_EARLY_MORNING'
]].replace(np.nan,0)

In [20]:
# data['total_order_count'] = data['DAYPART_COUNT_AFTERNOON']+ data['DAYPART_COUNT_DINNER']+data['DAYPART_COUNT_EVENING']+
#                             data['DAYPART_COUNT_LUNCH']+data['DAYPART_COUNT_MID_MORNING']+data['DAYPART_COUNT_LATE_NIGHT']+data[
#                             'DAYPART_COUNT_EARLY_MORNING']

In [42]:
data.isnull().sum()

DAYPART_COUNT_AFTERNOON        0
DAYPART_COUNT_DINNER           0
DAYPART_COUNT_EVENING          0
DAYPART_COUNT_LUNCH            0
DAYPART_COUNT_MID_MORNING      0
DAYPART_COUNT_LATE_NIGHT       0
DAYPART_COUNT_EARLY_MORNING    0
dtype: int64

In [31]:
data.shape

(100000, 259)

## removing features with more than 80% nan values

In [22]:
data=data.drop(columns=['L6M_CNT_OPTION_Cheese',
'L6M_CNT_OPTION_Nfresh',
'L6M_CNT_OPTION_Others',
'L6M_CNT_OPTION_Sauce',
'L6M_CNT_OPTION_Vfresh',
'LATEST_CNT_OPTION_Sauce',
'LATEST_CNT_OPTION_Cheese',
'LATEST_CNT_OPTION_Nfresh',
'LATEST_CNT_OPTION_Vfresh',
'LATEST_CNT_OPTION_Others',
'L6M_CNT_TOPPING__99',
'L6M_CNT_TOPPING_0',
'L6M_CNT_TOPPING_1',
'L6M_CNT_TOPPING_2',
'L6M_CNT_TOPPING_3',
'L6M_CNT_TOPPING_6',
'L6M_CNT_TOPPING_9',
'L6M_CNT_TOPPING_4',
'L6M_CNT_TOPPING_5',
'L6M_CNT_TOPPING_7',
'L6M_CNT_TOPPING_8',
'L6M_CNT_TOPPING_11',
'L6M_CNT_TOPPING_10',
'L6M_CNT_TOPPING_14',
'L6M_CNT_TOPPING_12',
'L6M_CNT_TOPPING_15',
'L6M_CNT_TOPPING_13',
'L6M_CNT_TOPPING_16',
'L6M_CNT_TOPPING_18',
'L6M_CNT_TOPPING_17',
'L6M_CNT_TOPPING_42',
'L6M_CNT_TOPPING_19',
'L6M_CNT_TOPPING_24',
'L6M_CNT_TOPPING_20',
'L6M_CNT_TOPPING_30',
'L6M_CNT_TOPPING_21',
'LATEST_CNT_TOPPING__99',
'LATEST_CNT_TOPPING_0',
'LATEST_CNT_TOPPING_1',
'LATEST_CNT_TOPPING_2',
'LATEST_CNT_TOPPING_3',
'LATEST_CNT_TOPPING_4',
'LATEST_CNT_TOPPING_6',
'LATEST_CNT_TOPPING_5',
'LATEST_CNT_TOPPING_7',
'LATEST_CNT_TOPPING_8',
'LATEST_CNT_TOPPING_9',
'LATEST_CNT_TOPPING_11',
'LATEST_CNT_TOPPING_10',
'LATEST_CNT_TOPPING_15',
'LATEST_CNT_TOPPING_12',
'LATEST_CNT_TOPPING_16',
'LATEST_CNT_TOPPING_14',
'LATEST_CNT_TOPPING_13',
'LATEST_CNT_TOPPING_17',
'LATEST_CNT_TOPPING_20',
'LATEST_CNT_TOPPING_24',
'LATEST_CNT_TOPPING_18',
'LATEST_CNT_TOPPING_20',
'FL_WINGS_APRIL'])

In [23]:
data.shape

(100000, 259)

In [24]:
thresh = len(data) * .2
df = data.dropna(thresh = thresh, axis = 1)

In [25]:
df.shape

(100000, 129)

In [26]:
features = df.columns

In [27]:
features

Index(['DW_CUSTOMER', 'DAYPART_COUNT_AFTERNOON', 'DAYPART_COUNT_DINNER',
       'DAYPART_COUNT_EVENING', 'DAYPART_COUNT_LUNCH',
       'AVG_BILL_VALUE_DAYPART_AFTERNOON', 'AVG_BILL_VALUE_DAYPART_DINNER',
       'AVG_BILL_VALUE_DAYPART_EVENING', 'AVG_BILL_VALUE_DAYPART_LUNCH',
       'L6M_CNT_VISIT_OCCSN_Carry_Out',
       ...
       'LATEST_CNT_COUPON_Pizza', 'LATEST_CNT_COUPON_Wings',
       'LATEST_CNT_COUPON_AppDess', 'CNT_ORDER_MAY_Pizza',
       'SPN_ORDER_MAY_Pizza', 'FL_PIZZA_MAY', 'TOT_SPN_WS_STORES_L6M',
       'FL_VISIT_WS_STORE_LATEST', 'SPN_WINGS_WS_STORES_L6M', 'WingStreet'],
      dtype='object', length=129)

In [28]:
df.isnull().sum()

DW_CUSTOMER                     0
DAYPART_COUNT_AFTERNOON     56145
DAYPART_COUNT_DINNER        25553
DAYPART_COUNT_EVENING       75800
DAYPART_COUNT_LUNCH         73094
                            ...  
FL_PIZZA_MAY                62220
TOT_SPN_WS_STORES_L6M           0
FL_VISIT_WS_STORE_LATEST        0
SPN_WINGS_WS_STORES_L6M     65118
WingStreet                  65118
Length: 129, dtype: int64

In [30]:
data['TOTCOUPONSALES_L6M'].value_counts()

0.00     24205
4.00      2798
5.00      2782
6.00      2224
3.00      1864
         ...  
20.37        1
83.77        1
68.59        1
18.33        1
63.58        1
Name: TOTCOUPONSALES_L6M, Length: 6438, dtype: int64

In [None]:
#bill value

billcolumns = ['AVG_BILL_VALUE_DAYPART_AFTERNOON','AVG_BILL_VALUE_DAYPART_DINNER','AVG_BILL_VALUE_DAYPART_EVENING',
'AVG_BILL_VALUE_DAYPART_LUNCH','AVG_BILL_VALUE_DAYPART_MID_MORNI','AVG_BILL_VALUE_DAYPART_LATE_NIGH',
'AVG_BILL_VALUE_DAYPART_EARLY_MOR']

In [127]:
# let's take a sample from above dataset

sample =  df.sample(frac=.25,random_state = 7)

In [128]:
sample.shape

(25000, 130)

In [129]:
sample.isna().sum()

DW_CUSTOMER                     0
FL_HUT_LOVER                    0
DAYPART_COUNT_AFTERNOON     13947
DAYPART_COUNT_DINNER         6483
DAYPART_COUNT_EVENING       18883
                            ...  
FL_PIZZA_MAY                15567
TOT_SPN_WS_STORES_L6M           0
FL_VISIT_WS_STORE_LATEST        0
SPN_WINGS_WS_STORES_L6M     16244
WingStreet                  16244
Length: 130, dtype: int64

### ***imputing the values using knn imputer
    

In [82]:
# sample_floatdata = sample.loc[:, sample.dtypes == np.float64]
# sample_floatdata.shape

In [80]:
# import sys
# from impyute.imputation.cs import fast_knn
# sys.setrecursionlimit(100000) #Increase the recursion limit of the OS
# # start the KNN training
# imputed_training=fast_knn(sample_floatdata.values, k=30)

### imputing null values with 0

In [130]:
for i in sample.columns:
    sample=sample.replace(np.nan,0)

In [131]:
sample.isnull().sum()

DW_CUSTOMER                 0
FL_HUT_LOVER                0
DAYPART_COUNT_AFTERNOON     0
DAYPART_COUNT_DINNER        0
DAYPART_COUNT_EVENING       0
                           ..
FL_PIZZA_MAY                0
TOT_SPN_WS_STORES_L6M       0
FL_VISIT_WS_STORE_LATEST    0
SPN_WINGS_WS_STORES_L6M     0
WingStreet                  0
Length: 130, dtype: int64

In [132]:
sample.sample(10)

Unnamed: 0,DW_CUSTOMER,FL_HUT_LOVER,DAYPART_COUNT_AFTERNOON,DAYPART_COUNT_DINNER,DAYPART_COUNT_EVENING,DAYPART_COUNT_LUNCH,AVG_BILL_VALUE_DAYPART_AFTERNOON,AVG_BILL_VALUE_DAYPART_DINNER,AVG_BILL_VALUE_DAYPART_EVENING,AVG_BILL_VALUE_DAYPART_LUNCH,...,LATEST_CNT_COUPON_Pizza,LATEST_CNT_COUPON_Wings,LATEST_CNT_COUPON_AppDess,CNT_ORDER_MAY_Pizza,SPN_ORDER_MAY_Pizza,FL_PIZZA_MAY,TOT_SPN_WS_STORES_L6M,FL_VISIT_WS_STORE_LATEST,SPN_WINGS_WS_STORES_L6M,WingStreet
16526,33755899,1,1.0,1.0,0.0,2.0,16.0,38.99,0.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,251.98,0,12.0,0.0
91187,118837858,1,1.0,0.0,1.0,0.0,22.99,0.0,16.99,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,42.73,1,6.99,1.0
34985,62610832,1,2.0,1.0,0.0,0.0,7.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0,0.0,0.0
61472,97600418,1,0.0,7.0,4.0,0.0,0.0,12.428571,11.0625,0.0,...,0.0,0.0,1.0,2.0,22.0,1.0,131.25,0,8.0,0.0
32678,59888559,1,2.0,1.0,0.0,0.0,19.5,20.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,59.0,0,11.0,0.0
6504,11511300,1,0.0,2.0,0.0,0.0,0.0,17.5,0.0,0.0,...,0.0,0.0,0.0,1.0,11.0,1.0,35.0,0,0.0,0.0
70581,104836082,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11.99,...,1.0,0.0,0.0,2.0,19.0,1.0,11.99,1,0.0,0.0
92920,119965893,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0,0.0,0.0
26480,50262522,1,0.0,2.0,0.0,1.0,0.0,20.25,0.0,25.3,...,0.0,0.0,1.0,0.0,0.0,0.0,65.8,1,8.8,1.0
47937,84507247,1,0.0,2.0,1.0,2.0,0.0,15.5,19.98,22.635,...,2.0,0.0,0.0,0.0,0.0,0.0,89.96,0,6.99,0.0


In [133]:
sample['FL_WINGS_APRIL'].value_counts()

KeyError: 'FL_WINGS_APRIL'

In [None]:
sample.shape

## applying rf model

In [None]:
from sklearn.utils import shuffle
sampled_data=shuffle(sampled_data)

In [134]:
X=sample.drop(columns=["FL_WINGS_MAY"])
Y=sample["FL_WINGS_MAY"]

KeyError: "['FL_WINGS_MAY'] not found in axis"

In [None]:
from sklearn.model_selection import train_test_split
train_x,valid_x,train_y,valid_y=train_test_split(X,Y,test_size=0.25,random_state=50,stratify=Y)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF
clf=RF(max_depth=3,n_estimators=1000)
clf.fit(train_x,train_y)
rf_res=clf.predict(valid_x)

In [None]:
print(classification_report(valid_y,rf_res))

In [None]:
clf.feature_importances_

In [None]:
feature_importance=pd.DataFrame(clf.feature_importances_,index=train_x.columns,
                                columns=['importance']).sort_values('importance',ascending=False)

In [None]:
feature_importance["importance"].nlargest(60)

In [None]:
data1 = data

In [None]:
### Get all the features columns except the class
features = list(data.columns)

### Get the features data
fdata = data[features]

In [None]:
### Run PCA on the data and reduce the dimensions in pca_num_components dimensions
reduced_data = PCA(n_components=pca_num_components).fit_transform(fdata)
results = pd.DataFrame(reduced_data,columns=['pca1','pca2'])

sns.scatterplot(x="pca1", y="pca2", hue=data['clusters'], data=results)
plt.title('K-means Clustering with 2 dimensions')
plt.show()