# Essentials

In [130]:
import time, datetime
start = time.time()

In [131]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/marketing-strategy-personalised-offer/sample.csv
/kaggle/input/marketing-strategy-personalised-offer/train_data.csv
/kaggle/input/marketing-strategy-personalised-offer/test_data.csv


In [132]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [133]:
raw_train = pd.read_csv("/kaggle/input/marketing-strategy-personalised-offer/train_data.csv")
raw_test = pd.read_csv("/kaggle/input/marketing-strategy-personalised-offer/test_data.csv")

## Data statistics

In [134]:
raw_train.shape, raw_test.shape, raw_train.dtypes.sort_values().value_counts()

((12379, 31),
 (5305, 30),
 object    18
 int64     13
 dtype: int64)

In [135]:
raw_train.isna().sum().sort_values(ascending=False)

car                                     12268
no_visited_Cold drinks                    198
Restaur_spend_greater_than20              160
no_Take-aways                             144
Restaur_spend_less_than20                 121
no_visited_bars                            93
drop location                               0
Climate                                     0
Travel Time                                 0
is foodie                                   0
temperature                                 0
visit restaurant with rating (avg)          0
has Children                                0
Prefer home food                            0
restuarant_opposite_direction_house         0
Job/Job Industry                            0
offer expiration                            0
Cooks regularly                             0
Qualification                               0
Customer type                               0
income_range                                0
restuarant_same_direction_house   

In [136]:
features = raw_train.copy()
test = raw_test.copy()

labels = features["Offer Accepted"].copy()
features.drop('Offer Accepted', axis=1, inplace=True)

features.drop('car', axis=1, inplace=True)
test.drop('car', axis=1, inplace=True)

#features.dtypes.sort_values()

In [137]:
labels.unique()

array(['No', 'Yes'], dtype=object)

**Problem is of binary classification type - (Yes / No)**

In [138]:
#features.isna().sum().sort_values(ascending=False)

* Features and labels have been split
* Features have categorical and numerical values
* Missing values are only in the categorical features
    * Since feature 'car' has the missing values comparable to data, thus can be dropped

### Phase 1
# Data preprocessing

**Data wrangling**
* Feature imputation
* Categorical and numerical transformers
* Feature scaling

In [139]:
#before = x.shape

# shorthand code - drop columns that only contain one value
features.drop([i for i in features.columns if len(features[i].unique())==1], axis=1, inplace=True)

# repeat for test data
test.drop([i for i in test.columns if len(test[i].unique())==1], axis=1, inplace=True)

#after = x.shape
#before, after

In [140]:
# list numerical and categorical features

numerical, categorical = [], []

for i in features.columns:
    n = features[i].dtype
    if (n=='int64'):
        numerical.append(i)
    elif (n=='O'):
        categorical.append(i)
        
len(categorical)+len(numerical)==len(features.columns)

True

## Feature imputation

In [141]:
from sklearn.impute import SimpleImputer

In [142]:
si = SimpleImputer(strategy="most_frequent")

# separate dataframes for categorical and numerical features
numdf_tr = features[numerical]
catdf_tr = features[categorical]

# impute missing values in categorical features and concatnate with numerical
catdf_tr = pd.DataFrame(si.fit_transform(catdf_tr), columns=catdf_tr.columns)
features = pd.concat([catdf_tr, numdf_tr], axis=1)

# repeat for test data
# seaprate
numdf_te = test[numerical]
catdf_te = test[categorical]
# impute
catdf_te = pd.DataFrame(si.fit_transform(catdf_te), columns=catdf_tr.columns) # putting catdf_tr columns as a check
test = pd.concat([catdf_te, numdf_te], axis=1)

features.head(2)

Unnamed: 0,offer expiration,income_range,no_visited_Cold drinks,Restaur_spend_less_than20,Marital Status,restaurant type,age,no_visited_bars,gender,Customer type,...,travelled_more_than_25mins_for_offer,restuarant_same_direction_house,Cooks regularly,is foodie,restuarant_opposite_direction_house,has Children,visit restaurant with rating (avg),temperature,Travel Time,Prefer home food
0,2days,₹100000 or More,4~8,less1,Married partner,4 star restaurant,36,less1,Female,Individual,...,0,0,1,0,0,0,4,67,22,0
1,2days,₹87500 - ₹99999,4~8,4~8,Married partner,Take-away restaurant,50plus,never,Female,Individual,...,0,1,1,0,0,1,3,89,18,0


**Features imputation done, now we can apply encoders**

**Categorical features should be divided into ordinal and nominal and encoders should be applied accordingly**

In [143]:
unique = {}
for i in sorted(categorical):
    unique[i] = features[i].unique()
    
unique = pd.DataFrame(unique.items(), columns=['features', 'unique values'])
unique

Unnamed: 0,features,unique values
0,Climate,"[Spring, Summer, Winter]"
1,Customer type,"[Individual, With Family, With Kids, With Coll..."
2,Job/Job Industry,"[Unemployed, Arts Design Entertainment Sports ..."
3,Marital Status,"[Married partner, Single, Divorced, Unmarried ..."
4,Qualification,"[Bachelors degree, Some college - no degree, G..."
5,Restaur_spend_greater_than20,"[less1, 1~3, never, gt8, 4~8]"
6,Restaur_spend_less_than20,"[less1, 4~8, 1~3, gt8, never]"
7,age,"[36, 50plus, 26, 46, 21, below21, 41, 31]"
8,drop location,"[Location B, Location A, Location C]"
9,gender,"[Female, Male]"


In [144]:
features['age'].value_counts()

21         2602
26         2543
31         1929
50plus     1756
36         1259
41         1060
46          716
below21     514
Name: age, dtype: int64

**We can group values in ['age'] into bins of below21, 21-50 and 50plus**

In [145]:
# create a series copy of age column
age_tr = features['age'].copy()

# iterate through to find values between 21 and 50
# since age is in string format, we search by eliminating

for i in range(len(age_tr)):
    n = age_tr.loc[i]
    if (n=='50plus' or n=='below21'):
        pass
    else:
        age_tr = age_tr.replace(n, '21to50')

# drop original age column and add transformed age searies column
features.drop('age', axis=1, inplace=True)
features['age'] = age_tr
features['age'].value_counts()

21to50     10109
50plus      1756
below21      514
Name: age, dtype: int64

In [146]:
# repeat for test data

age_te = test['age'].copy()

for i in range(len(age_te)):
    n = age_te.loc[i]
    if (n=='50plus' or n=='below21'):
        pass
    else:
        age_te = age_te.replace(n, '21to50')

test.drop('age', axis=1, inplace=True)
test['age'] = age_te
test['age'].value_counts()

21to50     4288
50plus      772
below21     245
Name: age, dtype: int64

In [147]:
# checking unique values to separate ordinal and nominal features

#unique

**From above, we separate ordinal features for OrdinalEncoder and nominal features for OneHotEncoder**

    Indices for 
        Ordinal = 4, 5, 6, 7, 10, 11, 12, 13, 14

In [148]:
ordinal = [(sorted(categorical))[i] for i in [4, 5, 6, 7, 10, 11, 12, 13, 14]]
nominal = [i for i in categorical if i not in ordinal]

print(len(ordinal), len(nominal), "\n", ordinal)

9 7 
 ['Qualification', 'Restaur_spend_greater_than20', 'Restaur_spend_less_than20', 'age', 'income_range', 'no_Take-aways', 'no_visited_Cold drinks', 'no_visited_bars', 'offer expiration']


In [149]:
count = 0
for i in nominal:
    count += len(features[i].unique())

print("unique values in nominal features =", count)

unique values in nominal features = 47


**47 unique columns from nominal features and 9 from ordinal features should finally give us 56 columns for categorical features**

In [150]:
unique = {}
for i in ordinal:
    unique[i] = features[i].unique()
pd.DataFrame(unique.items(), columns=['features', 'unique values'])

Unnamed: 0,features,unique values
0,Qualification,"[Bachelors degree, Some college - no degree, G..."
1,Restaur_spend_greater_than20,"[less1, 1~3, never, gt8, 4~8]"
2,Restaur_spend_less_than20,"[less1, 4~8, 1~3, gt8, never]"
3,age,"[21to50, 50plus, below21]"
4,income_range,"[₹100000 or More, ₹87500 - ₹99999, ₹37500 - ₹4..."
5,no_Take-aways,"[1~3, gt8, 4~8, less1, never]"
6,no_visited_Cold drinks,"[4~8, less1, never, 1~3, gt8]"
7,no_visited_bars,"[less1, never, 1~3, 4~8, gt8]"
8,offer expiration,"[2days, 10hours]"


**Specifying categories for ordinal features for OrdinalEncoding**

In [151]:
category = {
    'income_range': ['Less than ₹12500', '₹12500 - ₹24999', '₹25000 - ₹37499', '₹37500 - ₹49999', '₹50000 - ₹62499', '₹62500 - ₹74999', '₹75000 - ₹87499', '₹87500 - ₹99999', '₹100000 or More'],
    'Restaur_spend_greater_than20': ['never', 'less1', '1~3', '4~8', 'gt8'],
    'no_visited_Cold drinks': ['never', 'less1', '1~3', '4~8', 'gt8'],
    'Restaur_spend_less_than20': ['never', 'less1', '1~3', '4~8', 'gt8'], 
    'age': ['below21', '21to50', '50plus'],
    'no_visited_bars': ['never', 'less1', '1~3', '4~8', 'gt8'],
    'no_Take-aways' : ['never', 'less1', '1~3', '4~8', 'gt8'],
    'Qualification' : ['Some High School', 'High School Graduate', 'Associates degree', 'Some college - no degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
    'offer expiration': ['2days', '10hours']
}

## ColumnTransformer

**Combining categorical encoders and numerical scalers in ColumnTransformer**

In [152]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

In [153]:
# checking if total columns match before we add new ones
len(features.columns) == len(ordinal) + len(nominal) + len(numerical)

True

In [154]:
feature = list(category.keys())
cols = list(category.values())

ct = ColumnTransformer([
    ("ordenc", OrdinalEncoder(categories=[i for i in cols]), [j for j in feature]),
    ("onehotenc", OneHotEncoder(), [j for j in nominal]),
    ("minmax", MinMaxScaler(), [j for j in numerical])
], sparse_threshold=0)

# instead of sparse_threshold, alt we can use sparse=False in OneHotEncoder

features = pd.DataFrame(ct.fit_transform(features))
features.columns = [str(i) for i in range(1, features.shape[1]+1)]

test = pd.DataFrame(ct.fit_transform(test))
test.columns = [str(i) for i in range(1, test.shape[1]+1)]
features.head(1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,59,60,61,62,63,64,65,66,67,68
0,8.0,1.0,3.0,1.0,1.0,1.0,2.0,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.55102,1.0,0.0


In [155]:
labelsf = pd.Series(LabelEncoder().fit_transform(labels))
labelsf.value_counts()

1    6994
0    5385
dtype: int64

## Summary

    Loaded data
    Separated features and labels
    Dropped columns like empty, single valued, etc.
    Imputed missing values (only in categorical in this data)
    Listed ordinal, nominal and numerical features
    Applied OrdinalEncoder and OneHotEncoder for ordinal and nominal features respectively
    Scaled Numerical features using MinMaxScaler (alt. StandardScaler)
    Encoded labels using LabelEncoder
    Simultaneously pre-processed train and test data to avoid errors

### Phase 2
# Dimensionality reduction

In [156]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, GenericUnivariateSelect, mutual_info_regression, chi2
from sklearn.decomposition import PCA

In [157]:
features.var().sort_values(ascending=False).head(10)

1     6.484221
3     1.511727
6     1.203212
8     1.196147
7     0.849660
4     0.841224
2     0.772246
61    0.250020
62    0.250013
58    0.249976
dtype: float64

In [158]:
pca = PCA(n_components=7)

pca_features = pca.fit_transform(features)
pca_test = pca.fit_transform(test)

### Phase 3

# Model selection

In [159]:
# selecting data for training

trainf = features.copy()
testf = test.copy()

In [160]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate, cross_val_score, ShuffleSplit
from sklearn.metrics import f1_score

In [161]:
x, X_test, y, y_test = train_test_split(trainf ,labelsf , test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(x ,y , test_size=0.2, random_state=0)

In [162]:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

In [163]:
def get_score(model):
    print("val:",round(f1_score(y_val, model.predict(X_val)),4), "\ntest:", round(f1_score(y_test, model.predict(X_test)),4),
          "\ncross_val", round((cross_val_score(model, trainf, labelsf, cv=cv).mean()),4))

def make_submission(clf, path):
    pred = pd.DataFrame(clf.predict(test)).replace({1:"Yes", 0:"No"})

    submission = pd.concat([pd.DataFrame(list(range(len(pred)))), pred], axis=1)
    submission.columns = ['id', 'Offer Accepted']

    submission.to_csv(path+"submissions/submission.csv", index=None)

## Baseline Model

In [164]:
from sklearn.dummy import DummyClassifier

In [165]:
# clf = DummyClassifier()
# clf.fit(X_train, y_train)

# get_score(clf), pd.Series(clf.predict(X_test)).unique() # custom get_score function created above

## SGD Classifier

In [166]:
from sklearn.linear_model import SGDClassifier

In [167]:
# clf = SGDClassifier()
# clf.fit(X_train, y_train)

# get_score(clf) # custom get_score function created above

## Logistic regression

In [168]:
from sklearn.linear_model import LogisticRegression

In [169]:
# penalties = ['elasticnet', 'l1', 'l2', 'none']

# for i in penalties:
#     if (i=='elasticnet'):
#         log_reg = LogisticRegression(solver='saga', penalty=i, l1_ratio=0.5)
#     else:
#         log_reg = LogisticRegression(solver='saga', penalty=i)
    
#     log_reg.fit(X_train, y_train)
#     print(i, get_score(log_reg), "\n")

## KNN

In [170]:
from sklearn.neighbors import KNeighborsClassifier

In [171]:
# params = {
#     "n_neighbors":list(range(1,20)), 'weights':["uniform", "distance"], 'metric' : ['minkowski','euclidean','manhattan']
# }

# grid = GridSearchCV(KNeighborsClassifier(), param_grid=params, scoring="f1", n_jobs=-1, cv=cv, error_score=0)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# get_score(clf) # custom get_score function created above

## SVM

In [172]:
from sklearn.svm import SVC

In [173]:
# param_grid = {'C': [0.1, 1, 10, 100, 1000], 
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf']} 
  
# grid = GridSearchCV(SVC(), param_grid, refit = True)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# get_score(clf)

In [174]:
# clf = SVC(C=10, gamma=0.01, kernel='rbf')
# clf.fit(X_train, y_train)
# get_score(clf)

In [175]:
# clf = SVC(C=100, gamma=0.001, kernel='rbf')
# clf.fit(X_train, y_train)
# get_score(clf)

In [176]:
# clf = SVC()
# clf.fit(X_train, y_train)
# get_score(clf) # custom get_score function created above

In [177]:
# param_grid = {'C':[0.1,1,10],'gamma':[0.1, 1, 10], 'degree':[2,3]}
# grid = GridSearchCV(SVC(kernel='poly'),param_grid,refit = True)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# get_score(clf)

In [178]:
# param_grid = {'C':[0.1,1,10],'gamma':[0.1, 1, 10]}
# grid = GridSearchCV(SVC(kernel='rbf'),param_grid,refit = True)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# get_score(clf)

## CART, Bagging, MLP

### CART

In [179]:
from sklearn.tree import DecisionTreeClassifier

In [180]:
# clf = DecisionTreeClassifier()
# clf.fit(X_train, y_train)
# print(get_score(clf)) # custom get_score function created above

In [181]:
# params = {
#     "max_depth": list(range(1,8)),
#     "min_samples_split": [2,5,7,10],
#     "min_samples_leaf": [1,2,4,6]
# }

# grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=params, scoring="f1", n_jobs=-1, cv=cv)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# get_score(clf) # custom get_score function created above

### Bagging

In [182]:
from sklearn.ensemble import BaggingClassifier

In [183]:
# clf = BaggingClassifier(n_estimators=10, random_state=0) # default estimator - Decision Tree Classifier
# clf.fit(X_train, y_train)
# get_score(clf)

### Random Forest 

In [184]:
from sklearn.ensemble import RandomForestClassifier

In [185]:
# clf = RandomForestClassifier()
# clf.fit(X_train, y_train)
# get_score(clf)

In [186]:
# params = {'n_estimators': [1,2,5,10,20,40,80,160], 'max_leaf_nodes': [2,5,10,20]}

# #search_cv = RandomizedSearchCV(RandomForestClassifier(), param_distributions=params, scoring='f1', n_iter=10, random_state=0)

# grid = GridSearchCV(RandomForestClassifier(), param_grid=params, scoring='f1', cv=cv)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# get_score(clf)

## Boosting

### AdaBoost

In [187]:
from sklearn.ensemble import AdaBoostClassifier

In [188]:
# ada = AdaBoostClassifier()
# ada.fit(X_train, y_train)

# print(get_score(ada)) # custom get_score function created above

### GradientBoosting

In [189]:
from sklearn.ensemble import GradientBoostingClassifier

In [190]:
# gbc = GradientBoostingClassifier()
# gbc.fit(X_train, y_train)

# print(get_score(gbc)) # custom get_score function created above

### XGBoost

In [191]:
from xgboost import XGBClassifier

In [192]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

print(get_score(xgb)) # custom get_score function created above

val: 0.6715 
test: 0.6871 
cross_val 0.6188
None


## MLP

In [193]:
from sklearn.neural_network import MLPClassifier

In [194]:
# mlpc = MLPClassifier(max_iter=100)
# params = {
#     'hidden_layer_sizes': [(60,60),(80,40)],
#     'activation': ['tanh', 'relu'],
#     'solver': ['sgd', 'adam'],
#     'alpha': [0.0001, 0.05],
#     'learning_rate': ['constant','adaptive'],
# }


# grid = GridSearchCV(mlpc, params, n_jobs=-1, cv=cv, error_score=0)
# grid.fit(X_train, y_train)

# print( grid.best_score_, grid.best_params_)

# clf = grid.best_estimator_
# clf.fit(X_train, y_train)
# print(get_score(clf)) # custom get_score function created above

# End

**The best score was achieved using XGBoostingClassifier**

In [195]:
print(round((time.time()-start),2), "sec")

25.39 sec
