# Import libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
pd.set_option('display.max_columns',300)

In [None]:
import os
# Sets the KMP_DUPLICATE_LIB_OK flag for this process before the ML libraries
# are imported. NOTE(review): presumably a workaround for the "duplicate OpenMP
# runtime" abort seen when several native libraries are loaded — confirm it is
# still needed in the target environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Load datasets and check info

In [None]:
# Labelled training rows; later cells rely on its id, campaign_id, coupon_id,
# customer_id and redemption_status columns.
train = pd.read_csv('../input/amexpert-2019/train.csv')
train.head()

In [None]:
train.info()

In [None]:
# Number of distinct coupons referenced by the training rows.
train.coupon_id.nunique()

In [None]:
# Campaign metadata (joined later on campaign_id; carries start_date / end_date).
campaign = pd.read_csv('../input/amexpert-2019/campaign_data.csv')
campaign.head()

In [None]:
campaign.info()

In [None]:
# Coupon-to-item mapping (joined later to the item table on item_id).
coupon = pd.read_csv('../input/amexpert-2019/coupon_item_mapping.csv')
coupon.head()

In [None]:
coupon.info()

In [None]:
# Customer transactions; they are summed per customer_id further below.
transaction = pd.read_csv('../input/amexpert-2019/customer_transaction_data.csv')
transaction.head()

In [None]:
# NOTE(review): this prints the shape of `train`, not of the `transaction`
# frame loaded just above — confirm which one was intended.
train.shape

In [None]:
transaction.info()

In [None]:
# Item attributes (item_id plus the brand_type and category used later).
item = pd.read_csv('../input/amexpert-2019/item_data.csv')
item.head()

In [None]:
item.category.unique()

In [None]:
# Customer demographics (joined later on customer_id).
demograph = pd.read_csv('../input/amexpert-2019/customer_demographics.csv')
demograph.head()

In [None]:
# Fraction of missing values per column. Using .mean() instead of dividing the
# null count by a literal keeps the ratio correct whatever the row count is.
demograph.isnull().mean()

In [None]:
demograph.info()

# Data preprocessing

In [None]:
# Convert the campaign start/end columns from strings to datetimes.
# NOTE(review): no explicit format or dayfirst flag is passed, so ambiguous
# day/month strings are resolved by pandas' default inference — confirm
# against the raw CSV.
campaign['start_date'] = pd.to_datetime(campaign['start_date'])
campaign['end_date'] = pd.to_datetime(campaign['end_date'])

In [None]:
# Campaign length in whole days. abs() forces a non-negative value even when
# end_date is earlier than start_date. NOTE(review): such rows may be a symptom
# of mis-parsed dates rather than real data — verify before trusting `duration`.
campaign['duration'] = abs((campaign['end_date'] -  campaign['start_date']).dt.days)

In [None]:
demograph.income_bracket.unique()

In [None]:
def _fill_with_group_mode(values):
    """Fill nulls in one group with that group's most frequent value.

    Groups whose values are all null have no mode and are returned unchanged
    (they are imputed with the global mode later in the notebook).
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# Impute marital_status within each (family_size, age_range) group.
# transform() returns a result indexed like `demograph`, so the assignment
# aligns row-for-row regardless of how groupby.apply labels its output.
demograph['marital_status'] = (
    demograph.groupby(['family_size', 'age_range'])['marital_status']
    .transform(_fill_with_group_mode)
)

In [None]:
demograph.marital_status.unique()

In [None]:
# Discard the no_of_children column; it is not used as a feature.
demograph = demograph.drop(columns=['no_of_children'])

### Check whether the target classes are balanced

In [None]:
# Bar chart of the target class counts. The column is passed explicitly as
# `x=` because newer seaborn releases treat the first positional argument as
# `data`, not as the vector to count.
sns.countplot(x=train['redemption_status'])

## Merge dataframes

In [None]:
# Attach the campaign attributes to every training row; the left join keeps
# all rows of `train`.
mtc = pd.merge(train,campaign,on='campaign_id',how='left')
mtc.head()

In [None]:
mtc.shape

In [None]:
# Attach the item attributes to every coupon/item pair of the mapping table.
mci = pd.merge(coupon,item,on='item_id',how='left')
mci.head()

In [None]:
mci.coupon_id.nunique()

In [None]:
mci.shape

In [None]:
mci.groupby('coupon_id').count().reset_index()[['coupon_id','item_id']].head()

In [None]:
# One row per coupon, built in a single aggregation so that every column is
# computed from the same grouping (no positional pairing of separate results):
#   category_count - number of item rows mapped to the coupon (the name is
#                    historical; it counts items, not distinct categories)
#   brand_type     - largest brand_type value among the coupon's items
#   category       - largest category value among the coupon's items
# For string columns "largest" means last in lexicographic order.
mci_group = (
    mci.groupby('coupon_id')
    .agg(
        category_count=('item_id', 'count'),
        brand_type=('brand_type', 'max'),
        category=('category', 'max'),
    )
    .reset_index()
)

In [None]:
mci_group.head()

In [None]:
#tgroup = transaction.groupby(['customer_id','item_id','date']).sum().reset_index()
tgroup = transaction.groupby(['customer_id']).sum().reset_index()

In [None]:
tgroup.head()

In [None]:
tgroup.drop('item_id',axis=1,inplace=True)

In [None]:
tgroup.shape

In [None]:
# Customer-level table: transaction totals plus demographics. The outer join
# keeps customers that appear in only one of the two sources, so either side
# may contain nulls afterwards.
mdtg = pd.merge(tgroup,demograph,on='customer_id',how='outer')
mdtg.head()

In [None]:
mdtg.shape

In [None]:
# Attach the customer-level features to each training row (left join on
# customer_id keeps every row of `mtc`).
mergeddata = pd.merge(mtc,mdtg,on=['customer_id'],how='left')
mergeddata.head()

In [None]:
mergeddata.shape

In [None]:
mergeddata.info()

In [None]:
# Fraction of missing values per column. .mean() replaces the division by a
# hard-coded row count, so the ratio stays correct if the data changes.
mergeddata.isnull().mean()

## fill null values

In [None]:
# Impute missing demographic values. The result is assigned back to the column
# rather than calling fillna(..., inplace=True) on a column selection, which
# may operate on a temporary copy and leave the frame unchanged under pandas
# copy-on-write.
# Categorical-like columns take the most frequent value ...
for col in ['marital_status', 'age_range', 'family_size', 'rented']:
    mergeddata[col] = mergeddata[col].fillna(mergeddata[col].mode()[0])

# ... and the numeric income bracket takes the median.
mergeddata['income_bracket'] = mergeddata['income_bracket'].fillna(
    mergeddata['income_bracket'].median()
)

In [None]:
# Bring the per-coupon aggregates onto each row, keeping every existing row.
mergeddata = mergeddata.merge(mci_group, how='left', on=['coupon_id'])
mergeddata.head()

In [None]:
mergeddata.info()

# Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# A single encoder instance is reused for both columns below.
lc = LabelEncoder()

In [None]:
# Replace the age_range and family_size labels with integer codes.
# NOTE(review): fit_transform refits `lc` on every call, so after this cell it
# only remembers the family_size classes; the age_range mapping is not kept
# for reuse on other data.
mergeddata['age_range'] = lc.fit_transform(mergeddata['age_range'])
mergeddata['family_size'] = lc.fit_transform(mergeddata['family_size'])

In [None]:
mergeddata.info()

In [None]:
item.category.unique()

In [None]:
# Item categories that mapCategory labels as 'consumable'; every other value
# is labelled 'non-consumable'.
cat_list = ['Bakery', 'Packaged Meat', 'Seafood', 'Dairy, Juices & Snacks',
            'Prepared Food','Meat','Salads', 'Alcohol','Vegetables (cut)']

In [None]:
def mapCategory(x):
    """Collapse an item category into 'consumable' or 'non-consumable'.

    A value is 'consumable' when it is listed in the module-level `cat_list`;
    anything else (including missing values) is 'non-consumable'.
    """
    return 'consumable' if x in cat_list else 'non-consumable'

In [None]:
# Reduce the coupon's category to the two-level consumable / non-consumable flag.
mergeddata['category'] = mergeddata['category'].apply(mapCategory)

In [None]:
# Single price feature: sum of the per-customer selling price and both
# discount totals. NOTE(review): adding the discounts only makes sense if they
# are stored with the intended sign — confirm against the transaction data.
mergeddata['final_price'] = mergeddata['selling_price']+ mergeddata['other_discount'] + mergeddata['coupon_discount']

In [None]:
# The three source columns are now folded into final_price.
mergeddata.drop(['selling_price','other_discount','coupon_discount'],axis=1,inplace=True)

In [None]:
mergeddata.income_bracket.unique()

In [None]:
# Distribution of income brackets after imputation. The column is passed as
# `x=` because newer seaborn releases treat the first positional argument as
# `data`, not as the vector to count.
sns.countplot(x=mergeddata['income_bracket'])

In [None]:
def mapIncome(x):
    """Return the income band label for a numeric income bracket.

    below 4 -> 'low', 4 to 7 -> 'middle', above 7 up to 10 -> 'upper-middle',
    above 10 -> 'high'. Values that satisfy none of the comparisons (NaN)
    yield None.
    """
    if x < 4:
        return 'low'
    if x <= 7:
        return 'middle'
    if x <= 10:
        return 'upper-middle'
    if x > 10:
        return 'high'
    return None

In [None]:
mergeddata['income_bracket'] = mergeddata['income_bracket'].apply(mapIncome)

In [None]:
inc_dict = {'low':1,'middle':2,'upper-middle':3,'high':4}
inc_dict

In [None]:
mergeddata['income_bracket'] = mergeddata['income_bracket'].map(inc_dict)

# One hot encoding

In [None]:
# Feature matrix: drop the target, the identifier columns and the raw dates,
# then one-hot encode whatever non-numeric columns remain. No dummy level is
# dropped here, so each encoded column keeps an indicator for every level.
dummydata = pd.get_dummies(mergeddata.drop(['redemption_status','coupon_id','customer_id','id','campaign_id','start_date','end_date'],axis=1))
dummydata.head()

# RobustScaler

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
rc = RobustScaler()

In [None]:
# Fit the scaler on the full training feature matrix and keep the column names.
# NOTE(review): the fit happens before the train/validation split further
# below, so the validation rows contribute to the scaling statistics.
scaledData = pd.DataFrame(rc.fit_transform(dummydata),columns=dummydata.columns)
scaledData.head()

In [None]:
# Model inputs (scaled features) and target.
x = scaledData
y = mergeddata['redemption_status']

In [None]:
test = pd.read_csv('../input/amexpert-2019/test_QyjYwdj.csv')
test.head()

# Test data processing

In [None]:
# Apply the training feature pipeline to the test rows.
mtc_test = pd.merge(test,campaign,on='campaign_id',how='left')

mdtg_test = pd.merge(tgroup,demograph,on='customer_id',how='outer')
mergeddata_test = pd.merge(mtc_test,mdtg_test,on=['customer_id'],how='left')

# Impute by assignment (not inplace on a column selection) so the frame is
# really updated under pandas copy-on-write.
for col in ['marital_status', 'age_range', 'family_size', 'rented']:
    mergeddata_test[col] = mergeddata_test[col].fillna(mergeddata_test[col].mode()[0])
mergeddata_test['income_bracket'] = mergeddata_test['income_bracket'].fillna(
    mergeddata_test['income_bracket'].median()
)

mergeddata_test = pd.merge(mergeddata_test,mci_group,on=['coupon_id'],how='left')

# NOTE(review): the encoder is refit on the test values, so the integer codes
# match the training codes only if both sets contain the same labels — confirm.
mergeddata_test['age_range'] = lc.fit_transform(mergeddata_test['age_range'])
mergeddata_test['family_size'] = lc.fit_transform(mergeddata_test['family_size'])

mergeddata_test['category'] = mergeddata_test['category'].apply(mapCategory)
mergeddata_test['final_price'] = mergeddata_test['selling_price']+ mergeddata_test['other_discount'] + mergeddata_test['coupon_discount']
mergeddata_test.drop(['selling_price','other_discount','coupon_discount'],axis=1,inplace=True)
mergeddata_test['income_bracket'] = mergeddata_test['income_bracket'].apply(mapIncome)
mergeddata_test['income_bracket'] = mergeddata_test['income_bracket'].map(inc_dict)

dummydata_test = pd.get_dummies(mergeddata_test.drop(['coupon_id','customer_id','id','campaign_id','start_date','end_date'],axis=1))
# Give the test matrix exactly the training columns, in the training order;
# a dummy level absent from the test set becomes an all-zero column.
dummydata_test = dummydata_test.reindex(columns=dummydata.columns, fill_value=0)
# Scale with the statistics learned from the training data (transform, not
# fit_transform) so train and test features share one scale.
scaledData_test = pd.DataFrame(rc.transform(dummydata_test),columns=dummydata_test.columns)
scaledData_test.head()

In [None]:
mergeddata_test.info()

# Model Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a validation set. stratify=y keeps the share of redeemed coupons the
# same in both parts, which matters because the target is heavily imbalanced.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=0,stratify=y)

In [None]:
from sklearn.metrics import accuracy_score,roc_auc_score

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## LogisticRegression

In [None]:
# Baseline: logistic regression with default settings.
lor = LogisticRegression()

In [None]:
# Fit on the training split and predict hard labels for the validation split.
lor.fit(xtrain,ytrain)
ypredlor = lor.predict(xtest)

In [None]:
# Accuracy of the hard labels on the validation split.
accuracy_score(ytest,ypredlor)

In [None]:
# Predicted probability from the second class column, used for ROC AUC below.
yproba_yes = lor.predict_proba(xtest)[:,1]

In [None]:
roc_auc_score(ytest,yproba_yes)

## RandomForestClassifier

In [None]:
# Random forest with default settings: fit on the training split, then report
# validation accuracy (printed) and ROC AUC (cell output).
rf = RandomForestClassifier()
rf.fit(xtrain,ytrain)
ypredrf = rf.predict(xtest)
print(accuracy_score(ytest,ypredrf))
# Probability from the second class column feeds the AUC computation.
yproba_yes = rf.predict_proba(xtest)[:,1]
roc_auc_score(ytest,yproba_yes)

In [None]:
# Refit the logistic regression on all training rows and predict hard labels
# for the test set.
lor.fit(x,y)
ypredlor = lor.predict(scaledData_test)
ypredlor

In [None]:
# Sanity check: ids, predictions and test features should have the same length.
print(test.id.shape)
print(ypredlor.shape)
print(scaledData_test.shape)

In [None]:
transaction.shape

In [None]:
#mergeddata_test[mergeddata_test.duplicated()]

In [None]:
dummydata_test.shape

In [None]:
# Submission with hard class labels (not probabilities).
submission = pd.DataFrame({'id':test['id'],'redemption_status':ypredlor})
submission.head()

In [None]:
submission.to_csv('rahul_amex_lor.csv',index=False)

In [None]:
# Same procedure with a default random forest fit on all training rows.
# NOTE: 'rahul_amex_rf.csv' is written again later in this notebook, which
# overwrites this file.
rf = RandomForestClassifier()
rf.fit(x,y)
ypredrf = rf.predict(scaledData_test)
submission = pd.DataFrame({'id':test['id'],'redemption_status':ypredrf})
submission.head()
submission.to_csv('rahul_amex_rf.csv',index=False)

In [None]:
# Search space for the random-forest grid search.
# 'auto' was removed from max_features: for classifiers it was an alias of
# 'sqrt' (a duplicate grid point) and current scikit-learn rejects it.
param_rf = {
    'max_depth': [2, 3, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy']
}

In [None]:
#from sklearn.model_selection import GridSearchCV
# gridRf = GridSearchCV(rf, cv = 10,param_grid=param_rf,scoring='roc_auc')
# gridRf.fit(x,y)
# gridRf.best_params_

# XGBClassifier

In [None]:
from xgboost import XGBClassifier

In [None]:
xg = XGBClassifier()

In [None]:
# Default XGBoost classifier: validation accuracy (printed) and ROC AUC
# (cell output) on the hold-out split.
xg.fit(xtrain,ytrain)
ypredxg = xg.predict(xtest)
print(accuracy_score(ytest,ypredxg))
yproba_yes = xg.predict_proba(xtest)[:,1]
roc_auc_score(ytest,yproba_yes)

In [None]:
# Refit on all training rows and write a hard-label submission.
# NOTE: 'rahul_amex_xgb.csv' is written again later in this notebook.
xg = XGBClassifier()
xg.fit(x,y)
ypredxg = xg.predict(scaledData_test)
submission = pd.DataFrame({'id':test['id'],'redemption_status':ypredxg})
submission.head()
submission.to_csv('rahul_amex_xgb.csv',index=False)

# AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada = AdaBoostClassifier()

In [None]:
# Default AdaBoost: validation accuracy (printed) and ROC AUC (cell output).
ada.fit(xtrain,ytrain)
ypredada = ada.predict(xtest)
print(accuracy_score(ytest,ypredada))
yproba_yes = ada.predict_proba(xtest)[:,1]
roc_auc_score(ytest,yproba_yes)

# GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Default gradient boosting: validation accuracy (printed) and ROC AUC
# (cell output) on the hold-out split.
gba = GradientBoostingClassifier()
gba.fit(xtrain,ytrain)
ypredgba = gba.predict(xtest)
print(accuracy_score(ytest,ypredgba))
yproba_yes = gba.predict_proba(xtest)[:,1]
roc_auc_score(ytest,yproba_yes)

In [None]:
# Refit on all training rows and write a hard-label submission.
# NOTE: 'rahul_amex_gba.csv' is written again later in this notebook.
gba = GradientBoostingClassifier()
gba.fit(x,y)
ypredgba = gba.predict(scaledData_test)
submission = pd.DataFrame({'id':test['id'],'redemption_status':ypredgba})
submission.head()
submission.to_csv('rahul_amex_gba.csv',index=False)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# GaussianNB

In [None]:
# Gaussian naive Bayes on the hold-out split. The model gets its own names
# (gnb / ypredgnb) instead of reusing the gradient-boosting variables, so `gba`
# always refers to a gradient-boosting model.
gnb = GaussianNB()
gnb.fit(xtrain,ytrain)
ypredgnb = gnb.predict(xtest)
print(accuracy_score(ytest,ypredgnb))
# Probability from the second class column feeds the AUC computation.
yproba_yes = gnb.predict_proba(xtest)[:,1]
roc_auc_score(ytest,yproba_yes)

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours on the hold-out split. The model gets its own names
# (knn / ypredknn) instead of reusing the gradient-boosting variables, so `gba`
# always refers to a gradient-boosting model.
knn = KNeighborsClassifier()
knn.fit(xtrain,ytrain)
ypredknn = knn.predict(xtest)
print(accuracy_score(ytest,ypredknn))
# Probability from the second class column feeds the AUC computation.
yproba_yes = knn.predict_proba(xtest)[:,1]
roc_auc_score(ytest,yproba_yes)

In [None]:
# gba = SVC()
# gba.fit(xtrain,ytrain)
# ypredgba = gba.predict(xtest)
# print(accuracy_score(ytest,ypredgba))
# yproba_yes = gba.predict_proba(xtest)[:,1]
# roc_auc_score(ytest,yproba_yes)

# downsampling NearMiss

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
nm = NearMiss()

In [None]:
# Under-sample the reduced feature set with NearMiss.
# fit_resample is the supported imbalanced-learn API; the older fit_sample
# alias has been removed from the library.
downx,downy = nm.fit_resample(x.drop(['age_range','rented','campaign_type_Y','marital_status_Single',\
 'category_non-consumable','brand_type_Established','income_bracket'],axis=1),y)

In [None]:
xtrain2,xtest2,ytrain2,ytest2 = train_test_split(downx,downy,random_state=0)

In [None]:
# Each cell below fits one model on the xtrain2/ytrain2 split and reports
# accuracy (printed) and ROC AUC (cell output) on xtest2/ytest2.
# NOTE(review): the split was taken from already-resampled data, so the
# validation class balance differs from the original data.
rf = RandomForestClassifier(n_estimators=300)
rf.fit(xtrain2,ytrain2)
ypredrf2 = rf.predict(xtest2)
print(accuracy_score(ytest2,ypredrf2))
yproba_yes2 = rf.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# XGBoost.
xg = XGBClassifier(n_estimators=150,learning_rate=0.2)
xg.fit(xtrain2,ytrain2)
ypredxg2 = xg.predict(xtest2)
print(accuracy_score(ytest2,ypredxg2))
yproba_yes2 = xg.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# AdaBoost.
ada = AdaBoostClassifier(n_estimators=150,learning_rate=0.4)
ada.fit(xtrain2,ytrain2)
ypredada2 = ada.predict(xtest2)
print(accuracy_score(ytest2,ypredada2))
yproba_yes2 = ada.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# Gradient boosting.
gba = GradientBoostingClassifier(n_estimators=150,learning_rate=0.2)
gba.fit(xtrain2,ytrain2)
ypredgba2 = gba.predict(xtest2)
print(accuracy_score(ytest2,ypredgba2))
yproba_yes2 = gba.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

# upsampling SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
sm = SMOTE()

In [None]:
upx,upy = sm.fit_sample(x.drop(['age_range','rented','campaign_type_Y','marital_status_Single',\
 'category_non-consumable','brand_type_Established','income_bracket'],axis=1),y)

In [None]:
scaledData_test_drop = scaledData_test.drop(['age_range','rented','campaign_type_Y','marital_status_Single',\
                                'category_non-consumable','brand_type_Established','income_bracket'],axis=1)

In [None]:
# Split the SMOTE-resampled data; this rebinds xtrain2/xtest2/ytrain2/ytest2,
# replacing the split of the under-sampled data.
xtrain2,xtest2,ytrain2,ytest2 = train_test_split(upx,upy,random_state=0)

In [None]:
# Each cell below fits one model on the new split and reports accuracy
# (printed) and ROC AUC (cell output) on xtest2/ytest2.
rf = RandomForestClassifier(n_estimators=150)
rf.fit(xtrain2,ytrain2)
ypredrf2 = rf.predict(xtest2)
print(accuracy_score(ytest2,ypredrf2))
yproba_yes2 = rf.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# XGBoost.
xg = XGBClassifier(n_estimators=250,learning_rate=0.8)
xg.fit(xtrain2,ytrain2)
ypredxg2 = xg.predict(xtest2)
print(accuracy_score(ytest2,ypredxg2))
yproba_yes2 = xg.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# Gradient boosting.
gba = GradientBoostingClassifier(n_estimators=250,learning_rate=0.8)
gba.fit(xtrain2,ytrain2)
ypredgba2 = gba.predict(xtest2)
print(accuracy_score(ytest2,ypredgba2))
yproba_yes2 = gba.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# AdaBoost.
ada = AdaBoostClassifier(n_estimators=150,learning_rate=0.4)
ada.fit(xtrain2,ytrain2)
ypredada2 = ada.predict(xtest2)
print(accuracy_score(ytest2,ypredada2))
yproba_yes2 = ada.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

# Feature Selection

In [None]:
# Horizontal bar charts of feature importances, labelled with the reduced
# column set. `rf`, `gba` and `xg` are whichever models were fit most recently
# under those names when these cells run.
pd.Series(rf.feature_importances_,scaledData_test_drop.columns).plot.barh()

In [None]:
pd.Series(gba.feature_importances_,scaledData_test_drop.columns).plot.barh()

In [None]:
pd.Series(xg.feature_importances_,scaledData_test_drop.columns).plot.barh()

In [None]:
import statsmodels.api as sms

In [None]:
# Copy of the full scaled feature matrix with an explicit intercept column.
temp = x.copy()
temp['constant'] = 1

# Logit

In [None]:
# Fit a statsmodels logit on all features and show the coefficient table.
# NOTE(review): `x` presumably still holds every dummy level of each encoded
# column, which together with the constant would be perfectly collinear —
# confirm that the fit converges and that the summary is meaningful.
sms.Logit(y,temp).fit().summary()

In [None]:
from sklearn.ensemble import VotingClassifier

Voting classifier on the upsampled (SMOTE) data

In [None]:
# vf = VotingClassifier(estimators=estimator,voting='soft')
# vf.fit(xtrain2,ytrain2)
# ypredada2 = vf.predict(xtest2)
# print(accuracy_score(ytest2,ypredada2))
# yproba_yes2 = vf.predict_proba(xtest2)[:,1]
# roc_auc_score(ytest2,yproba_yes2)

Voting classifier on the downsampled (NearMiss) data

In [None]:
# vf = VotingClassifier(estimators=estimator,voting='soft')
# vf.fit(xtrain2,ytrain2)
# ypredada2 = vf.predict(xtest2)
# print(accuracy_score(ytest2,ypredada2))
# yproba_yes2 = vf.predict_proba(xtest2)[:,1]
# roc_auc_score(ytest2,yproba_yes2)

In [None]:
# vf = VotingClassifier(estimators=estimator,voting='soft')
# vf.fit(upx,upy)
# ypredvf = vf.predict(scaledData_test)
# submission = pd.DataFrame({'id':test['id'],'redemption_status':ypredvf})
# submission.head()
# submission.to_csv('rahul_amex_vf.csv',index=False)

In [None]:
# Final random forest: fit on the under-sampled data and submit the predicted
# probability (second class column) instead of hard labels.
# NOTE: this overwrites the 'rahul_amex_rf.csv' written earlier in the notebook.
rf = RandomForestClassifier(n_estimators=150)
rf.fit(downx,downy)
ypredrf = rf.predict(scaledData_test_drop)
yproba_rf = rf.predict_proba(scaledData_test_drop)[:,1]
#submission = pd.DataFrame({'id':test['id'],'redemption_status':ypredrf})
submission = pd.DataFrame({'id':test['id'],'redemption_status':yproba_rf})
submission.to_csv('rahul_amex_rf.csv',index=False)

In [None]:
# Final gradient boosting: fit on the over-sampled data, submit probabilities.
# NOTE: this overwrites the 'rahul_amex_gba.csv' written earlier in the notebook.
gba = GradientBoostingClassifier(n_estimators=250,learning_rate=0.8)
gba.fit(upx,upy)
ypredgba = gba.predict(scaledData_test_drop)
yproba_gba = gba.predict_proba(scaledData_test_drop)[:,1]

submission = pd.DataFrame({'id':test['id'],'redemption_status':yproba_gba})
submission.head()
submission.to_csv('rahul_amex_gba.csv',index=False)

In [None]:
# Final XGBoost: fit on the under-sampled data, submit probabilities. The test
# features are passed as a plain array (.values) here, unlike the cells above.
# NOTE: this overwrites the 'rahul_amex_xgb.csv' written earlier in the notebook.
xg = XGBClassifier(n_estimators=150,learning_rate=0.2)
xg.fit(downx,downy)
ypredxgb = xg.predict(scaledData_test_drop.values)
yproba_xgb = xg.predict_proba(scaledData_test_drop.values)[:,1]

submission = pd.DataFrame({'id':test['id'],'redemption_status':yproba_xgb})
submission.head()
submission.to_csv('rahul_amex_xgb.csv',index=False)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of every feature in the reduced column set.
# The reduced frame is built once (instead of once per feature and again for
# the index) and cast to float so boolean dummy columns form a numeric matrix.
vif_frame = dummydata.drop(['age_range','rented','campaign_type_Y',
                            'marital_status_Single','category_non-consumable',
                            'brand_type_Established','income_bracket'],axis=1)
vif_matrix = vif_frame.astype(float).values
vif = [variance_inflation_factor(vif_matrix,i) for i in range(vif_matrix.shape[1])]
pd.Series(vif,index=vif_frame.columns)

In [None]:
from sklearn.svm import SVC

In [None]:
# svm = SVC()
# svm.fit(xtrain2,ytrain2)
# ypredsvm = svm.predict(xtest2)
# print(accuracy_score(ytest2,ypredsvm))
# yproba_yes2 = svm.predict_proba(xtest2)[:,1]
# roc_auc_score(ytest2,yproba_yes2)

In [None]:
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# PCA

In [None]:
# Project the over-sampled features onto 5 principal components.
pca = PCA(n_components=5)

In [None]:
pcax = pca.fit_transform(upx)

In [None]:
# New train/validation split on the projected data (rebinds the *2 variables).
xtrain2,xtest2,ytrain2,ytest2 = train_test_split(pcax,upy,random_state=0)

In [None]:
# Gradient boosting on the PCA features: accuracy (printed) and ROC AUC.
gba = GradientBoostingClassifier(n_estimators=150,learning_rate=0.4)
gba.fit(xtrain2,ytrain2)
ypredgba2 = gba.predict(xtest2)
print(accuracy_score(ytest2,ypredgba2))
yproba_yes2 = gba.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# XGBoost on the PCA features: accuracy (printed) and ROC AUC.
xg = XGBClassifier(n_estimators=150,learning_rate=0.4)
xg.fit(xtrain2,ytrain2)
ypredxg2 = xg.predict(xtest2)
print(accuracy_score(ytest2,ypredxg2))
yproba_yes2 = xg.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

# Polynomial Features

In [None]:
pl = PolynomialFeatures(degree=3)

In [None]:
polyx = pl.fit_transform(upx)

In [None]:
polyxtest = pl.fit_transform(scaledData_test_drop)

In [None]:
# New train/validation split on the polynomial features (rebinds the *2 variables).
xtrain2,xtest2,ytrain2,ytest2 = train_test_split(polyx,upy,random_state=0)

In [None]:
# Gradient boosting on the polynomial features: accuracy (printed) and ROC AUC.
gba = GradientBoostingClassifier(n_estimators=150,learning_rate=0.5)
gba.fit(xtrain2,ytrain2)
ypredgba2 = gba.predict(xtest2)
print(accuracy_score(ytest2,ypredgba2))
yproba_yes2 = gba.predict_proba(xtest2)[:,1]
roc_auc_score(ytest2,yproba_yes2)

In [None]:
# gba = GradientBoostingClassifier(n_estimators=150,learning_rate=0.4)
# gba.fit(polyx.values,upy)
# ypredgba = gba.predict(polyxtest.values)
# yproba_gba = gba.predict_proba(polyxtest.values)[:,1]

# submission = pd.DataFrame({'id':test['id'],'redemption_status':yproba_gba})
# submission.head()
# submission.to_csv('rahul_amex_gba.csv',index=False)