# Essentials

In [None]:
# Record the wall-clock start time; the last cell subtracts it to report total runtime.
import time, datetime
start = time.time()

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [None]:
# Load the raw training and test tables; these frames are never modified afterwards.
raw_train, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/marketing-strategy-personalised-offer/train_data.csv",
        "/kaggle/input/marketing-strategy-personalised-offer/test_data.csv",
    )
)

## Data statistics

In [None]:
# Show the train/test shapes and how many training columns there are of each dtype.
raw_train.shape, raw_test.shape, raw_train.dtypes.sort_values().value_counts()

In [None]:
# Number of missing values per training column, largest first.
raw_train.isna().sum().sort_values(ascending=False)

In [None]:
# Derive the working frames from the raw data without touching the originals:
#   labels   - the 'Offer Accepted' target column
#   features - training columns minus the target and the 'car' column
#   test     - test columns minus the 'car' column
labels = raw_train["Offer Accepted"].copy()
features = raw_train.drop(columns=["Offer Accepted", "car"])
test = raw_test.drop(columns="car")

In [None]:
# Distinct values taken by the target column.
labels.unique()

**Problem is of binary classification type - (Yes / No)**

In [None]:
#features.isna().sum().sort_values(ascending=False)

* Features and labels have been split
* Features have categorical and numerical values
* Missing values are only in the categorical features
    * The number of missing values in feature 'car' is comparable to the size of the dataset, so this feature can be dropped

### Phase 1
# Data preprocessing

**Data wrangling**
* Feature imputation
* Categorical and numerical transformers
* Feature scaling

In [None]:
# Drop columns that hold a single distinct value (NaN counts as a value): they carry
# no information for a model.
# The list is derived from the training data only and the very same columns are then
# removed from the test data, so both frames keep an identical set of columns.
constant_cols = [col for col in features.columns if features[col].nunique(dropna=False) == 1]

features = features.drop(columns=constant_cols)
# errors='ignore': a column that is constant in train may legitimately be absent from test
test = test.drop(columns=constant_cols, errors='ignore')

In [None]:
# Partition the feature columns by dtype: int64 -> numerical, object -> categorical.
numerical = [col for col in features.columns if features[col].dtype == 'int64']
categorical = [col for col in features.columns if features[col].dtype == 'O']

# True when the two lists together account for every feature column
len(numerical) + len(categorical) == len(features.columns)

## Feature imputation

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing categorical values with each column's most frequent value.
# The imputer learns those values from the training data only and is then *reused*
# (transform, not fit_transform) on the test data, so both frames are filled with
# the same statistics.
si = SimpleImputer(strategy="most_frequent")

# training data: separate numerical / categorical, impute the categorical part,
# then concatenate (categorical columns first, numerical columns last)
numdf_tr = features[numerical]
catdf_tr = pd.DataFrame(
    si.fit_transform(features[categorical]),
    columns=categorical,
    index=features.index,  # keep the original index so concat aligns row for row
)
features = pd.concat([catdf_tr, numdf_tr], axis=1)

# test data: same steps, using the imputer already fitted on the training data
numdf_te = test[numerical]
catdf_te = pd.DataFrame(
    si.transform(test[categorical]),
    columns=categorical,
    index=test.index,
)
test = pd.concat([catdf_te, numdf_te], axis=1)

features.head(2)

**Features imputation done, now we can apply encoders**

**Categorical features should be divided into ordinal and nominal and encoders should be applied accordingly**

In [None]:
# One row per categorical feature (alphabetical order) with the distinct values it takes.
unique = pd.DataFrame(
    [(col, features[col].unique()) for col in sorted(categorical)],
    columns=['features', 'unique values'],
)
unique

In [None]:
# Frequency of each age category in the training features.
features['age'].value_counts()

**We can group values in ['age'] into bins of below21, 21-50 and 50plus**

In [None]:
# Collapse every age category other than 'below21' and '50plus' into one '21to50' bin.
# Age is stored as strings, so the middle bin is identified by exclusion rather than
# by a numeric comparison. The whole column is handled in a single vectorised step.
age_tr = features['age'].where(features['age'].isin(['below21', '50plus']), '21to50')

# swap the original age column for the binned series (it ends up as the last column)
features = features.drop(columns='age').assign(age=age_tr)
features['age'].value_counts()

In [None]:
# Apply the same age binning to the test data: every category other than 'below21'
# and '50plus' becomes '21to50'. Done in a single vectorised step.
age_te = test['age'].where(test['age'].isin(['below21', '50plus']), '21to50')

# swap the original age column for the binned series (it ends up as the last column)
test = test.drop(columns='age').assign(age=age_te)
test['age'].value_counts()

In [None]:
# checking unique values to separate ordinal and nominal features

#unique

**From above, we separate ordinal features for OrdinalEncoder and nominal features for OneHotEncoder**

    Indices for 
        Ordinal = 4, 5, 6, 7, 10, 11, 12, 13, 14

In [None]:
# Positions (within the alphabetically sorted categorical list) of the features
# that have a natural order; everything else is treated as nominal.
ORDINAL_POSITIONS = (4, 5, 6, 7, 10, 11, 12, 13, 14)

sorted_categorical = sorted(categorical)
ordinal = [sorted_categorical[pos] for pos in ORDINAL_POSITIONS]
nominal = [col for col in categorical if col not in ordinal]

print(len(ordinal), len(nominal), "\n", ordinal)

In [None]:
# Total number of distinct values across all nominal features.
count = sum(len(features[col].unique()) for col in nominal)

print("unique values in nominal features =", count)

**47 unique columns from nominal features and 9 from ordinal features should finally give us 56 columns for categorical features**

In [None]:
# Distinct values of each ordinal feature, used to write out the category orderings.
unique = {col: features[col].unique() for col in ordinal}
pd.DataFrame(unique.items(), columns=['features', 'unique values'])

**Specifying categories for ordinal features for OrdinalEncoding**

In [None]:
# Explicit category order for every ordinal feature.
# The keys are later used as the ordinal column names and the value lists are passed
# as `categories` to OrdinalEncoder, so the position of a value in its list is the
# integer it is encoded as.
# NOTE(review): 'offer expiration' lists '2days' before '10hours' (longer duration
# first) - confirm this is the intended order.
category = {
    'income_range': ['Less than ₹12500', '₹12500 - ₹24999', '₹25000 - ₹37499', '₹37500 - ₹49999', '₹50000 - ₹62499', '₹62500 - ₹74999', '₹75000 - ₹87499', '₹87500 - ₹99999', '₹100000 or More'],
    'Restaur_spend_greater_than20': ['never', 'less1', '1~3', '4~8', 'gt8'],
    'no_visited_Cold drinks': ['never', 'less1', '1~3', '4~8', 'gt8'],
    'Restaur_spend_less_than20': ['never', 'less1', '1~3', '4~8', 'gt8'], 
    'age': ['below21', '21to50', '50plus'],
    'no_visited_bars': ['never', 'less1', '1~3', '4~8', 'gt8'],
    'no_Take-aways' : ['never', 'less1', '1~3', '4~8', 'gt8'],
    'Qualification' : ['Some High School', 'High School Graduate', 'Associates degree', 'Some college - no degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
    'offer expiration': ['2days', '10hours']
}

## ColumnTransformer

**Combining categorical encoders and numerical scalers in ColumnTransformer**

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Sanity check before encoding adds new columns: the ordinal, nominal and numerical
# lists together must have as many entries as there are feature columns.
len(features.columns) == len(ordinal) + len(nominal) + len(numerical)

In [None]:
# Encode and scale all features in one ColumnTransformer:
#   * ordinal features -> OrdinalEncoder with the explicit orderings from `category`
#   * nominal features -> OneHotEncoder
#   * numerical features -> MinMaxScaler
feature = list(category.keys())   # ordinal column names
cols = list(category.values())    # matching category orderings, same order as `feature`

ct = ColumnTransformer([
    ("ordenc", OrdinalEncoder(categories=cols), feature),
    # handle_unknown="ignore": a nominal value that appears only in the test data is
    # encoded as all zeros instead of raising when the fitted transformer is applied
    ("onehotenc", OneHotEncoder(handle_unknown="ignore"), nominal),
    ("minmax", MinMaxScaler(), numerical),
], sparse_threshold=0)  # sparse_threshold=0 forces a dense output array

# Fit on the training data only ...
features = pd.DataFrame(ct.fit_transform(features))
features.columns = [str(i) for i in range(1, features.shape[1] + 1)]

# ... and apply the already fitted transformer to the test data, so that the test
# columns, category codes and scaling ranges are exactly those learned from train.
test = pd.DataFrame(ct.transform(test))
test.columns = [str(i) for i in range(1, test.shape[1] + 1)]
features.head(1)

In [None]:
# Turn the string target into integer class codes and show the class balance.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
labels = pd.Series(encoded_labels)
labels.value_counts()

## Summary

    Loaded data
    Separated features and labels
    Dropped uninformative columns (mostly empty, single-valued)
    Imputed missing values (only in categorical in this data)
    Listed ordinal, nominal and numerical features
    Applied OrdinalEncoder and OneHotEncoder for ordinal and nominal features respectively
    Scaled Numerical features using MinMaxScaler (alt. StandardScaler)
    Encoded labels using LabelEncoder
    Simultaneously pre-processed train and test data to avoid errors

### Phase 2
# Dimensionality reduction

In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, GenericUnivariateSelect, mutual_info_regression, chi2

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Variance of each encoded training column, highest first.
features.var().sort_values(ascending=False)

In [None]:
# Project the features onto 3 principal components.
# The components are learned from the training features only; the test data is
# projected with that same fitted PCA so both live in the same component space.
pca = PCA(n_components=3)

pca_features = pca.fit_transform(features)
pca_test = pca.transform(test)

### Phase 3

# Model selection

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% of the data as the test set, then take 20% of the
# remainder as the validation set. Both splits use the same fixed seed.
split_kwargs = dict(test_size=0.2, random_state=42)

x, X_test, y, y_test = train_test_split(features, labels, **split_kwargs)
X_train, X_val, y_train, y_val = train_test_split(x, y, **split_kwargs)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [None]:
def get_score(model):
    """Print the F1 score of a fitted `model` on the validation and test splits.

    Scores are rounded to 4 decimals. Reads the notebook-level X_val / y_val and
    X_test / y_test. Nothing is returned; the report is only printed.
    """
    val_f1 = round(f1_score(y_val, model.predict(X_val)), 4)
    test_f1 = round(f1_score(y_test, model.predict(X_test)), 4)
    print("val:", val_f1, "\ntest:", test_f1)

def make_submission(clf, path):
    """Predict on the notebook-level `test` frame and write a submission CSV.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    path : str
        Prefix that is concatenated directly with "submissions/submission.csv",
        so it should be empty or end with a path separator.

    The CSV has two columns: 'id' (0 .. n-1) and 'Offer Accepted', where the
    predicted class codes 1 / 0 are written as "Yes" / "No".
    """
    pred = pd.DataFrame(clf.predict(test)).replace({1: "Yes", 0: "No"})

    submission = pd.concat([pd.DataFrame(list(range(len(pred)))), pred], axis=1)
    submission.columns = ['id', 'Offer Accepted']

    # make sure the target directory exists, otherwise to_csv fails on a fresh run
    out_dir = path + "submissions"
    os.makedirs(out_dir, exist_ok=True)

    submission.to_csv(out_dir + "/submission.csv", index=False)

## Baseline Model

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline: a dummy classifier with default settings, as a reference for the F1 scores below.
clf = DummyClassifier().fit(X_train, y_train)
get_score(clf)

In [None]:
# Distinct classes that the current `clf` predicts on the held-out test split.
pd.Series(clf.predict(X_test)).unique()

## SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Linear model trained with stochastic gradient descent.
# SGD shuffles the training data, so a fixed seed is needed for the reported
# scores to be reproducible from one run to the next.
clf = SGDClassifier(random_state=42)
clf.fit(X_train, y_train)

get_score(clf)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Compare logistic regression across regularisation penalties (saga supports all of them).
# NOTE(review): recent scikit-learn releases expect penalty=None rather than the
# string 'none' - confirm against the installed version.
penalties = ['elasticnet', 'l1', 'l2', 'none']

for penalty in penalties:
    # l1_ratio is only passed for the elastic-net penalty
    extra = {'l1_ratio': 0.5} if penalty == 'elasticnet' else {}
    log_reg = LogisticRegression(solver='saga', penalty=penalty, **extra)
    log_reg.fit(X_train, y_train)

    # get_score prints its own report and returns nothing, so it is called on its
    # own line instead of being passed to print() (which would display "None")
    print(penalty)
    get_score(log_reg)
    print()

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Grid-search the number of neighbours (1..19) for KNN with 3-fold CV, optimising F1.
params = {"n_neighbors": [k for k in range(1, 20)]}

grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
grid.best_score_, grid.best_params_

In [None]:
# Take the best KNN configuration from the grid search, fit it on the training
# split and report its validation / test F1 via the get_score helper.
clf = grid.best_estimator_.fit(X_train, y_train)
get_score(clf)

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Grid-search a polynomial-kernel SVC.
# scoring='f1' makes the search optimise the same metric that get_score reports
# (the GridSearchCV default for classifiers is accuracy).
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'degree': [2, 3]}
grid = GridSearchCV(SVC(kernel='poly'), param_grid, scoring='f1', refit=True, n_jobs=-1)
grid.fit(X_train, y_train)

# refit=True already retrained the best configuration on the full training split,
# so best_estimator_ can be scored directly without fitting it again
clf = grid.best_estimator_
get_score(clf)

In [None]:
# Grid-search an RBF-kernel SVC.
# scoring='f1' makes the search optimise the same metric that get_score reports
# (the GridSearchCV default for classifiers is accuracy).
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, scoring='f1', refit=True, n_jobs=-1)
grid.fit(X_train, y_train)

# refit=True already retrained the best configuration on the full training split,
# so best_estimator_ can be scored directly without fitting it again
clf = grid.best_estimator_
get_score(clf)

In [None]:
# Hyper-parameters selected by the most recently run grid search.
grid.best_params_

## CART

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Grid-search depth and split/leaf size limits for a decision tree (3-fold CV, F1).
params = dict(
    max_depth=list(range(1, 10)),
    min_samples_split=[2, 5, 7, 10],
    min_samples_leaf=[1, 2, 5],
)

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
grid.best_score_, grid.best_params_

In [None]:
# Take the best decision tree from the grid search, fit it on the training split
# and report its validation / test F1 via the get_score helper.
clf = grid.best_estimator_.fit(X_train, y_train)
get_score(clf)

## Boosting

In [None]:
# ShuffleSplit (used here) and cross_val_score (used by the boosting cells below)
# are not imported anywhere else in the notebook, so bring them into scope here.
from sklearn.model_selection import ShuffleSplit, cross_val_score

# 10 random 80/20 splits with a fixed seed, shared by all boosting models
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with default settings: report validation / test F1, then show the
# sorted F1 scores obtained by cross-validating on the training split.
ada = AdaBoostClassifier().fit(X_train, y_train)

ada_report = get_score(ada)
ada_cv_scores = cross_val_score(ada, X_train, y_train, cv=cv, scoring='f1')
ada_report, sorted(ada_cv_scores)

### GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default settings: report validation / test F1, then show
# the sorted F1 scores obtained by cross-validating on the training split.
gbc = GradientBoostingClassifier().fit(X_train, y_train)

gbc_report = get_score(gbc)
gbc_cv_scores = cross_val_score(gbc, X_train, y_train, cv=cv, scoring='f1')
gbc_report, sorted(gbc_cv_scores)

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost with default settings: report validation / test F1, then show the
# sorted F1 scores obtained by cross-validating on the training split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

xgb_report = get_score(xgb)
xgb_cv_scores = cross_val_score(xgb, X_train, y_train, cv=cv, scoring='f1')
xgb_report, sorted(xgb_cv_scores)

### Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Grid-search forest size and tree complexity for a random forest, optimising F1.
params = {'n_estimators': [1, 2, 5, 10, 20, 40, 100, 200, 500], 'max_leaf_nodes': [2, 5, 10, 20, 50, 100]}

# random_state fixes the bootstrap sampling / feature sub-sampling so the search
# picks the same configuration on every run; n_jobs=-1 runs the candidate fits in parallel
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=params, scoring='f1', n_jobs=-1)

grid.fit(X_train, y_train)

In [None]:
# Take the best random forest from the grid search, fit it on the training split
# and report its validation / test F1 via the get_score helper.
clf = grid.best_estimator_.fit(X_train, y_train)
get_score(clf)

## MLP

In [None]:
# scikit-learn's multi-layer perceptron classifier lives in sklearn.neural_network
from sklearn.neural_network import MLPClassifier

# End

In [None]:
# Total wall-clock time elapsed since `start` was recorded, rounded to 2 decimals.
print(round((time.time()-start),2), "sec")