# Essentials

In [1]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

/kaggle/input/marketing-strategy-personalised-offer/sample.csv
/kaggle/input/marketing-strategy-personalised-offer/train_data.csv
/kaggle/input/marketing-strategy-personalised-offer/test_data.csv


In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

In [3]:
# Load the competition's train and test CSV files into untouched "raw" frames.
train_csv = "/kaggle/input/marketing-strategy-personalised-offer/train_data.csv"
test_csv = "/kaggle/input/marketing-strategy-personalised-offer/test_data.csv"

raw_train, raw_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

## Data statistics

In [4]:
# Shapes of both raw frames, plus how many training columns there are per dtype.
train_dtype_counts = raw_train.dtypes.sort_values().value_counts()
(raw_train.shape, raw_test.shape, train_dtype_counts)

((12379, 31),
 (5305, 30),
 object    18
 int64     13
 dtype: int64)

In [5]:
# Number of missing values in each raw training column, largest first.
raw_missing_counts = raw_train.isna().sum()
raw_missing_counts.sort_values(ascending=False)

car                                     12268
no_visited_Cold drinks                    198
Restaur_spend_greater_than20              160
no_Take-aways                             144
Restaur_spend_less_than20                 121
no_visited_bars                            93
drop location                               0
Climate                                     0
Travel Time                                 0
is foodie                                   0
temperature                                 0
visit restaurant with rating (avg)          0
has Children                                0
Prefer home food                            0
restuarant_opposite_direction_house         0
Job/Job Industry                            0
offer expiration                            0
Cooks regularly                             0
Qualification                               0
Customer type                               0
income_range                                0
restuarant_same_direction_house   

In [6]:
# Split the training data into labels and features, working on copies so the
# raw frames stay untouched. The 'car' column is removed from both feature
# frames as well.
y_train = raw_train["Offer Accepted"].copy()
x_train = raw_train.drop(columns=['Offer Accepted', 'car'])
x_test = raw_test.drop(columns=['car'])

# dtypes of the remaining training features, ordered by dtype
x_train.dtypes.sort_values()

Cooks regularly                          int64
Travel Time                              int64
temperature                              int64
visit restaurant with rating (avg)       int64
has Children                             int64
restuarant_opposite_direction_house      int64
is foodie                                int64
restuarant_same_direction_house          int64
travelled_more_than_5mins_for_offer      int64
Prefer home food                         int64
Prefer western over chinese              int64
travelled_more_than_15mins_for_offer     int64
travelled_more_than_25mins_for_offer     int64
Climate                                 object
income_range                            object
Restaur_spend_greater_than20            object
no_visited_Cold drinks                  object
Restaur_spend_less_than20               object
Marital Status                          object
Job/Job Industry                        object
restaurant type                         object
Qualification

In [7]:
# Distinct values that occur in the target column.
label_values = y_train.unique()
label_values

array(['No', 'Yes'], dtype=object)

**Problem is of binary classification type - (Yes / No)**

In [8]:
# Number of missing values in each remaining training feature, largest first.
feature_missing_counts = x_train.isna().sum()
feature_missing_counts.sort_values(ascending=False)

no_visited_Cold drinks                  198
Restaur_spend_greater_than20            160
no_Take-aways                           144
Restaur_spend_less_than20               121
no_visited_bars                          93
offer expiration                          0
Qualification                             0
drop location                             0
Climate                                   0
Travel Time                               0
temperature                               0
visit restaurant with rating (avg)        0
has Children                              0
restuarant_opposite_direction_house       0
Job/Job Industry                          0
is foodie                                 0
Cooks regularly                           0
Customer type                             0
income_range                              0
restuarant_same_direction_house           0
gender                                    0
travelled_more_than_5mins_for_offer       0
travelled_more_than_25mins_for_o

* Features and labels have been split
* Features have categorical and numerical values
* Missing values are only in the categorical features
    * The feature 'car' is missing in almost every row (12,268 of 12,379), so it can be dropped

In [9]:
# How many training feature columns there are of each dtype.
feature_dtypes = x_train.dtypes
feature_dtypes.value_counts()

object    16
int64     13
dtype: int64

# Phase 1
# Data preprocessing

**Data wrangling**
* Feature imputation
* Categorical and numerical transformers
* Feature scaling

In [10]:
# Drop columns that contain only one distinct value.
#
# The list of columns to drop is decided from the TRAINING data only and then
# applied unchanged to the test data. Deciding it separately per frame could
# remove different columns from each, leaving train and test with different
# schemas (later cells index x_test with column lists derived from x_train).
constant_cols = [col for col in x_train.columns if len(x_train[col].unique()) == 1]

x_train = x_train.drop(columns=constant_cols)
x_test = x_test.drop(columns=constant_cols)

In [11]:
# Partition the training columns by dtype: int64 columns are treated as
# numerical features and object columns as categorical features. Column order
# within each list follows x_train.columns.
numerical = [col for col in x_train.columns if x_train[col].dtype == 'int64']
categorical = [col for col in x_train.columns if x_train[col].dtype == 'O']

## Feature imputation

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
# Fill missing categorical values with each column's most frequent value.
si = SimpleImputer(strategy="most_frequent")

# separate dataframes for categorical and numerical features
numdf_tr = x_train[numerical]
catdf_tr = x_train[categorical]

# Learn the fill values from the training data, impute, and concatenate with
# the numerical columns. The original index is passed through so the
# column-wise concat aligns row for row.
catdf_tr = pd.DataFrame(
    si.fit_transform(catdf_tr), columns=catdf_tr.columns, index=catdf_tr.index
)
x_train = pd.concat([catdf_tr, numdf_tr], axis=1)

# Test data: reuse the fill values learned from the training data (transform,
# not fit_transform) so both frames are imputed identically.
numdf_te = x_test[numerical]
catdf_te = x_test[categorical]

catdf_te = pd.DataFrame(
    si.transform(catdf_te),
    columns=catdf_tr.columns,  # putting catdf_tr columns as a check
    index=catdf_te.index,
)
x_test = pd.concat([catdf_te, numdf_te], axis=1)

x_train.head()

Unnamed: 0,offer expiration,income_range,no_visited_Cold drinks,Restaur_spend_less_than20,Marital Status,restaurant type,age,no_visited_bars,gender,Customer type,...,travelled_more_than_25mins_for_offer,restuarant_same_direction_house,Cooks regularly,is foodie,restuarant_opposite_direction_house,has Children,visit restaurant with rating (avg),temperature,Travel Time,Prefer home food
0,2days,₹100000 or More,4~8,less1,Married partner,4 star restaurant,36,less1,Female,Individual,...,0,0,1,0,0,0,4,67,22,0
1,2days,₹87500 - ₹99999,4~8,4~8,Married partner,Take-away restaurant,50plus,never,Female,Individual,...,0,1,1,0,0,1,3,89,18,0
2,2days,₹87500 - ₹99999,less1,1~3,Single,Cold drinks,26,never,Female,Individual,...,0,1,0,1,1,1,4,67,7,1
3,10hours,₹37500 - ₹49999,less1,1~3,Single,Take-away restaurant,46,never,Male,Individual,...,0,1,0,1,0,1,3,89,7,0
4,2days,₹100000 or More,never,1~3,Single,4 star restaurant,21,less1,Female,Individual,...,1,0,0,1,1,0,3,40,7,0


In [14]:
# Column count per dtype after imputation.
imputed_dtypes = x_train.dtypes
imputed_dtypes.value_counts()

object    16
int64     12
dtype: int64

**Feature imputation is done, so we can now apply encoders.**

**Categorical features should be divided into ordinal and nominal, and the matching encoder applied to each group.**

In [15]:
# Table with one row per categorical feature (in alphabetical order) and the
# distinct values that feature takes in the training data.
unique = pd.DataFrame(
    {col: x_train[col].unique() for col in sorted(categorical)}.items(),
    columns=['features', 'unique values'],
)
unique

Unnamed: 0,features,unique values
0,Climate,"[Spring, Summer, Winter]"
1,Customer type,"[Individual, With Family, With Kids, With Coll..."
2,Job/Job Industry,"[Unemployed, Arts Design Entertainment Sports ..."
3,Marital Status,"[Married partner, Single, Divorced, Unmarried ..."
4,Qualification,"[Bachelors degree, Some college - no degree, G..."
5,Restaur_spend_greater_than20,"[less1, 1~3, never, gt8, 4~8]"
6,Restaur_spend_less_than20,"[less1, 4~8, 1~3, gt8, never]"
7,age,"[36, 50plus, 26, 46, 21, below21, 41, 31]"
8,drop location,"[Location B, Location A, Location C]"
9,gender,"[Female, Male]"


In [16]:
# Frequency of each raw age label in the training data.
age_counts = x_train['age'].value_counts()
age_counts

21         2602
26         2543
31         1929
50plus     1756
36         1259
41         1060
46          716
below21     514
Name: age, dtype: int64

**We can group values in ['age'] into bins of below21, 21-50 and 50plus**

In [17]:
# Collapse the age labels into three bins: 'below21' and '50plus' are kept as
# they are, and every other label becomes '21to50'.
#
# This is done in one vectorised step. It replaces a per-row loop that called
# Series.replace on the whole column for every row and relied on the index
# labels being 0..n-1.
age_tr = x_train['age'].where(x_train['age'].isin(['50plus', 'below21']), '21to50')

# drop the original age column and add the binned series back
# (as before, 'age' ends up as the last column)
x_train = x_train.drop(columns='age')
x_train['age'] = age_tr
x_train['age'].value_counts()

21to50     10109
50plus      1756
below21      514
Name: age, dtype: int64

In [18]:
# Repeat the age binning for the test data: 'below21' and '50plus' are kept,
# and every other label becomes '21to50'.
#
# Vectorised with where/isin in place of a per-row loop that rescanned the
# whole column on every iteration and assumed index labels 0..n-1.
age_te = x_test['age'].where(x_test['age'].isin(['50plus', 'below21']), '21to50')

# drop the original age column and add the binned series back as the last column
x_test = x_test.drop(columns='age')
x_test['age'] = age_te
x_test['age'].value_counts()

21to50     4288
50plus      772
below21     245
Name: age, dtype: int64

In [19]:
# Re-display the table of unique values per categorical feature, to help decide
# which features are ordinal and which are nominal.
# NOTE(review): `unique` was built before 'age' was binned, so its 'age' row
# still lists the raw labels rather than below21 / 21to50 / 50plus.

unique

Unnamed: 0,features,unique values
0,Climate,"[Spring, Summer, Winter]"
1,Customer type,"[Individual, With Family, With Kids, With Coll..."
2,Job/Job Industry,"[Unemployed, Arts Design Entertainment Sports ..."
3,Marital Status,"[Married partner, Single, Divorced, Unmarried ..."
4,Qualification,"[Bachelors degree, Some college - no degree, G..."
5,Restaur_spend_greater_than20,"[less1, 1~3, never, gt8, 4~8]"
6,Restaur_spend_less_than20,"[less1, 4~8, 1~3, gt8, never]"
7,age,"[36, 50plus, 26, 46, 21, below21, 41, 31]"
8,drop location,"[Location B, Location A, Location C]"
9,gender,"[Female, Male]"


**From above, we separate ordinal features for OrdinalEncoder and nominal features for OneHotEncoder**

    Indices for 
        Ordinal = 4, 5, 6, 7, 10, 11, 12, 13, 14

In [20]:
# Ordinal features are selected by NAME rather than by position in the sorted
# column list, so the selection stays correct if columns are added or removed.
ordinal_names = {
    'Qualification',
    'Restaur_spend_greater_than20',
    'Restaur_spend_less_than20',
    'age',
    'income_range',
    'no_Take-aways',
    'no_visited_Cold drinks',
    'no_visited_bars',
    'offer expiration',
}

# Fail loudly if an expected ordinal feature is no longer a categorical column.
missing_ordinal = ordinal_names - set(categorical)
assert not missing_ordinal, f"ordinal features missing from data: {sorted(missing_ordinal)}"

# Same ordering as before: ordinal in sorted-name order, nominal in the order
# the columns appear in `categorical`.
ordinal = [col for col in sorted(categorical) if col in ordinal_names]
nominal = [i for i in categorical if i not in ordinal]

print(len(ordinal), len(nominal), ordinal)

9 7 ['Qualification', 'Restaur_spend_greater_than20', 'Restaur_spend_less_than20', 'age', 'income_range', 'no_Take-aways', 'no_visited_Cold drinks', 'no_visited_bars', 'offer expiration']


In [21]:
# Total number of distinct values across all nominal features, i.e. how many
# columns one-hot encoding them is expected to produce.
count = sum(len(x_train[col].unique()) for col in nominal)

print("unique values in nominal features =", count)

unique values in nominal features = 47


**47 unique columns from nominal features and 9 from ordinal features should finally give us 56 columns for categorical features**

In [22]:
# Distinct values of every ordinal feature (age now shows its binned labels).
# `unique` is left bound to the plain dict; the DataFrame is only displayed.
unique = {col: x_train[col].unique() for col in ordinal}
pd.DataFrame(unique.items(), columns=['features', 'unique values'])

Unnamed: 0,features,unique values
0,Qualification,"[Bachelors degree, Some college - no degree, G..."
1,Restaur_spend_greater_than20,"[less1, 1~3, never, gt8, 4~8]"
2,Restaur_spend_less_than20,"[less1, 4~8, 1~3, gt8, never]"
3,age,"[21to50, 50plus, below21]"
4,income_range,"[₹100000 or More, ₹87500 - ₹99999, ₹37500 - ₹4..."
5,no_Take-aways,"[1~3, gt8, 4~8, less1, never]"
6,no_visited_Cold drinks,"[4~8, less1, never, 1~3, gt8]"
7,no_visited_bars,"[less1, never, 1~3, 4~8, gt8]"
8,offer expiration,"[2days, 10hours]"


**Specifying categories for ordinal features for OrdinalEncoding**

In [23]:
# Ordered level lists for every ordinal feature. The position of a value in its
# list is the integer code OrdinalEncoder will assign to it. Key order matters:
# the ColumnTransformer cell pairs keys and values positionally.

# Frequency scale shared by all "how often" features.
frequency_levels = ['never', 'less1', '1~3', '4~8', 'gt8']

income_levels = [
    'Less than ₹12500',
    '₹12500 - ₹24999',
    '₹25000 - ₹37499',
    '₹37500 - ₹49999',
    '₹50000 - ₹62499',
    '₹62500 - ₹74999',
    '₹75000 - ₹87499',
    '₹87500 - ₹99999',
    '₹100000 or More',
]

qualification_levels = [
    'Some High School',
    'High School Graduate',
    'Associates degree',
    'Some college - no degree',
    'Bachelors degree',
    'Graduate degree (Masters or Doctorate)',
]

category = dict([
    ('income_range', income_levels),
    ('Restaur_spend_greater_than20', list(frequency_levels)),
    ('no_visited_Cold drinks', list(frequency_levels)),
    ('Restaur_spend_less_than20', list(frequency_levels)),
    ('age', ['below21', '21to50', '50plus']),
    ('no_visited_bars', list(frequency_levels)),
    ('no_Take-aways', list(frequency_levels)),
    ('Qualification', qualification_levels),
    # NOTE(review): unlike the other lists this one is not ascending by
    # duration ('2days' is coded 0, '10hours' is coded 1) - confirm intended.
    ('offer expiration', ['2days', '10hours']),
])

## ColumnTransformer

**Combining categorical encoders and numerical scalers in ColumnTransformer**

In [24]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

In [25]:
# Sanity check before encoding: the ordinal, nominal and numerical lists
# together should account for every column of x_train.
expected_column_count = len(ordinal) + len(nominal) + len(numerical)
len(x_train.columns) == expected_column_count

True

In [26]:
# Ordinal feature names and their ordered level lists, paired positionally.
feature = list(category.keys())
cols = list(category.values())

# One transformer for all preprocessing:
#   * ordinal features -> integer codes following the explicit level order
#   * nominal features -> one-hot columns
#   * numerical features -> scaled with MinMaxScaler
# handle_unknown="ignore" makes a nominal value that appears only in the test
# data encode as all zeros instead of raising during transform.
ct = ColumnTransformer([
    ("ordenc", OrdinalEncoder(categories=cols), feature),
    ("onehotenc", OneHotEncoder(handle_unknown="ignore"), nominal),
    ("minmax", MinMaxScaler(), numerical),
], sparse_threshold=0)

# sparse_threshold=0 forces a dense result; alternatively the OneHotEncoder
# itself can be asked for dense output (`sparse=False`, named `sparse_output`
# in newer scikit-learn releases).

# Fit on the training data only, then apply the SAME fitted transformer to the
# test data. Re-fitting on the test data would learn different scaling ranges
# and possibly a different set or order of one-hot columns, so train and test
# features would no longer line up.
x_train = pd.DataFrame(ct.fit_transform(x_train))
x_test = pd.DataFrame(ct.transform(x_test))
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,8.0,1.0,3.0,1.0,1.0,1.0,2.0,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.55102,1.0,0.0
1,7.0,2.0,3.0,3.0,2.0,0.0,2.0,4.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.75,1.0,0.733333,0.0
2,7.0,1.0,1.0,2.0,1.0,0.0,2.0,4.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.55102,0.0,1.0
3,3.0,2.0,1.0,2.0,1.0,0.0,2.0,3.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.75,1.0,0.0,0.0
4,8.0,1.0,0.0,2.0,1.0,1.0,4.0,3.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.75,0.0,0.0,0.0


In [27]:
# Encode the string labels as integers and show the resulting class balance.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_train)
y_train = pd.Series(encoded_labels)
y_train.value_counts()

1    6994
0    5385
dtype: int64

## Summary

    Loaded data
    Separated features and labels
    Dropped uninformative columns (the almost-empty 'car' column and single-valued columns)
    Imputed missing values (only in categorical in this data)
    Listed ordinal, nominal and numerical features
    Applied OrdinalEncoder and OneHotEncoder for ordinal and nominal features respectively
    Scaled Numerical features using MinMaxScaler (alt. StandardScaler)
    Encoded labels using LabelEncoder
    Simultaneously pre-processed train and test data to avoid errors

# Phase 2
# Model selection