In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
# Machine-specific data folder. A raw string keeps the Windows backslashes
# literal: "\B", "\P" and "\D" are not valid escape sequences, and newer Python
# versions warn about them in ordinary string literals.
data_dir=r"G:\Banglalink\Python Master\PythonForModellers-master\Data"
# Later cells open "dm.csv" by relative name, so make this the working directory.
os.chdir(data_dir)

In [None]:
# Spellings that should be read as missing values rather than as text.
missing_markers=[""," ","NA","N/A"]
# Load the customer file from the working directory set earlier.
data=pd.read_csv("dm.csv",na_values=missing_markers)

In [None]:
# Quick look at the first rows right after loading.
data.head()

In [None]:
## Assume people who spend more than the average are good customers
# Compute the average once; calling .mean() inside the lambda rescans the whole
# column for every row.
average_spend=data['AmountSpent'].mean()
# 1 = spent more than the average, 0 = otherwise. A missing amount compares
# False against the average and therefore also gets 0.
data['target']=data['AmountSpent'].map(lambda amount: 1 if amount>average_spend else 0)

In [None]:
# target was derived from AmountSpent, so remove the raw amount from the frame.
data=data.drop(columns=["AmountSpent"])

In [None]:
# Confirm that AmountSpent is gone and the target column is present.
data.head()

In [None]:
# Frequency of each History level (value_counts leaves out missing values by default).
data['History'].value_counts()

In [None]:
# Number of rows where History is missing.
data['History'].isnull().sum()

In [None]:
## Minimal Data Prep
# Rows with no recorded History get their own "NewCust" level instead of NaN.
history_filled=data['History'].fillna("NewCust")
data['History']=history_filled

In [None]:
# Re-check the frame after filling History.
data.head()

In [None]:
## Split the data into test and train
# Draw 70% of the rows with a fixed seed so the split is repeatable.
train_fraction=0.70
data_train=data.sample(frac=train_fraction,random_state=200)
# Every row that was not sampled becomes the hold-out set.
data_test=data.drop(index=data_train.index)

In [None]:
## Build Model
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# First pass: Binomial GLM on every candidate predictor, fitted on the training
# rows. C(...) marks a term as categorical in the formula.
full_formula="target~C(Age)+C(Gender)+C(OwnHome)+C(Married)+C(Location)+Salary+Children+C(History)+Catalogs"
full_glm=smf.glm(full_formula,data=data_train,family=sm.families.Binomial())
model1=full_glm.fit()

In [None]:
# Print the fitted model's summary table; it informs which terms to keep.
print(model1.summary())

In [None]:
## Variables to exclude
#Age
#Gender
#Ownhome
#Married
## Variables for dummy creation
#Hist_Low
#Hist_Med

# Add the same 0/1 History indicators to both splits so the reduced model can
# be fitted on train and scored on test.
history_dummies={"Hist_Low":"Low","Hist_Med":"Medium"}
for frame in (data_train,data_test):
    for column,level in history_dummies.items():
        # 1 when the row's History equals this level, 0 otherwise.
        frame[column]=frame['History'].map(lambda x: 1 if x==level else 0)

In [None]:
# Reduced model: Children, Catalogs, Salary and the Hist_Med indicator only.
reduced_formula="target~Children+Catalogs+Salary+Hist_Med"
model2=smf.glm(formula=reduced_formula,data=data_train,family=sm.families.Binomial()).fit()

In [None]:
# Print the summary table of the reduced model.
print(model2.summary())

In [None]:
## Let's check confusion matrix and AUC
import sklearn.metrics as metrics

In [None]:
# Actual labels of the hold-out rows ...
y_true=data_test.loc[:,'target']
# ... and model2's raw predictions for the same rows (not yet 0/1 labels).
y_pred=model2.predict(exog=data_test)

In [None]:
# At this point y_pred holds model2's raw predictions, not thresholded labels.
y_pred.head()

In [None]:
y_true=data_test.loc[:,'target']
test_probabilities=model2.predict(data_test)
# Label a row 1 when its predicted value is above 0.5, otherwise 0.
y_pred=test_probabilities.map(lambda p: int(p>0.5))
# Rows of the matrix are the actual classes, columns the predicted classes.
metrics.confusion_matrix(y_true,y_pred)

In [None]:
## ROC curve
# roc_curve is fed the continuous scores, not the thresholded labels.
y_score=model2.predict(data_test)
fpr,tpr,thresholds=metrics.roc_curve(y_true=y_true,y_score=y_score)
# Coordinates of the diagonal reference line drawn next to the curve.
x=np.arange(0,1.1,0.1)
y=np.arange(0,1.1,0.1)

In [None]:
# ROC curve of model2 on the test set, with the diagonal as a chance reference.
plt.plot(fpr,tpr,"-",label="model2")
plt.plot(x,y,'b--',label="chance")
# Label the figure so it can be read without the surrounding code.
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve (test set)")
plt.legend(loc="lower right")
# Render explicitly so the cell shows only the figure, not a Line2D/Legend repr.
plt.show()

In [None]:
## AUC
# Area under the ROC curve, computed from the same labels and scores as the plot.
metrics.roc_auc_score(y_true=y_true,y_score=y_score)

In [None]:
## Gains
# Keep each test row's model2 prediction next to its features.
test_scores=model2.predict(data_test)
data_test['prob']=test_scores

In [None]:
# Spot-check the stored predictions.
data_test['prob'].head()

In [None]:
# Cut the predictions into ten quantile bins; each row is tagged with its bin's interval.
decile_bins=pd.qcut(data_test['prob'],10)
data_test['prob_deciles']=decile_bins

In [None]:
# Check that the prob and prob_deciles columns were added.
data_test.head()

In [None]:
# Test rows with the highest predictions first.
data_test.sort_values('prob',ascending=False).head()

In [None]:
# Events (sum of target) and row count per prediction decile, best decile first.
# Group with the default as_index=True and call reset_index() exactly once.
# Combining as_index=False with reset_index() yields an extra "index" column on
# pandas versions that honour as_index for list aggregations, which breaks the
# three-name column assignment applied to this table afterwards.
# observed=False spells out the long-standing "keep every bin" behaviour.
gains=(
    data_test.groupby("prob_deciles",observed=False)['target']
    .agg(['sum','count'])
    .reset_index()
    .sort_values("prob_deciles",ascending=False)
)

In [None]:
# Positional rename: decile interval, events in the decile, rows in the decile.
gains_column_names=["Deciles","TotalEvents","NumberObs"]
gains.columns=gains_column_names

In [None]:
# Each decile's share of all events in the test set.
all_events=gains['TotalEvents'].sum()
gains["PercEvents"]=gains['TotalEvents']/all_events

In [None]:
# Running share of events, accumulated in the table's current row order.
gains["CumulativeEvents"]=gains["PercEvents"].cumsum()

In [None]:
# Show the finished gains table, one row per decile.
gains

In [None]:
## These are the people to target: the 90 test rows with the highest predictions.
# NOTE(review): 90 is hard-coded; presumably a chosen share of the test set - confirm.
ranked_customers=data_test.sort_values(by="prob",ascending=False)
ranked_customers[['Cust_Id']].head(90)