**Marketing Campaign Dataset**

In [607]:
#Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import KFold 
from sklearn.model_selection import RepeatedKFold

In [608]:
#Load data
marketing_train = pd.read_csv("marketing_tr.csv")

**Checking the shape **

In [609]:
marketing_train.shape

(7414, 21)

**Checking the Head**

In [610]:
marketing_train.head(5)

Unnamed: 0,custAge,profession,marital,schooling,housing,loan,contact,month,day_of_week,campaign,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,pmonths,pastEmail,responded
0,55.0,admin.,single,university.degree,no,no,cellular,nov,mon,1,...,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,999.0,0,no
1,,blue-collar,married,,no,no,cellular,jul,mon,1,...,0,nonexistent,1.4,93.918,-42.7,4.96,5228.1,999.0,0,no
2,42.0,technician,married,high.school,no,no,telephone,may,mon,1,...,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,999.0,0,no
3,55.0,management,divorced,,yes,yes,cellular,jul,wed,2,...,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,999.0,0,no
4,,admin.,divorced,university.degree,yes,no,cellular,may,tue,5,...,1,failure,-1.8,92.893,-46.2,1.291,5099.1,999.0,1,no


**Exploratory Data Analysis**

In [611]:
marketing_train['schooling'] = marketing_train['schooling'].replace("illiterate", "unknown")
marketing_train['schooling'] = marketing_train['schooling'].replace(["basic.4y","basic.6y","basic.9y","high.school","professional.course"], "high.school")

In [612]:
marketing_train['month'] = marketing_train['month'].replace(["oct","nov","dec"], "dec")
marketing_train['month'] = marketing_train['month'].replace(["aug","jul","sep"], "sep")
marketing_train['month'] = marketing_train['month'].replace(["apr","jun","may","mar"], "jun")

**Replacing Yes & No with 1's & 0's **

In [613]:
marketing_train['marital'] = marketing_train['marital'].replace("unknown", "married")
marketing_train['loan'] = marketing_train['loan'].replace("unknown", "no")
marketing_train['housing'] = marketing_train['housing'].replace("unknown", "no")
marketing_train['housing'] = marketing_train['housing'].replace("yes",1)
marketing_train['housing'] = marketing_train['housing'].replace("no", 0)
marketing_train['loan'] = marketing_train['loan'].replace("yes", 1)
marketing_train['loan'] = marketing_train['loan'].replace("no", 0)

In [614]:
marketing_train['profession'] = marketing_train['profession'].replace(["management","unknown","unemployed","admin."], "admin.")
marketing_train['profession'] = marketing_train['profession'].replace(["blue-collar","housemaid","services","self-employed","entrepreneur","technician"], "blue-collar")

**Missing Value Analysis**

In [615]:
#Create dataframe with missing percentage
missing_val = pd.DataFrame(marketing_train.isnull().sum())

#Reset index
missing_val = missing_val.reset_index()

#Rename variable
missing_val = missing_val.rename(columns = {'index': 'Variables', 0: 'Missing_percentage'})

#Calculate percentage
missing_val['Missing_percentage'] = (missing_val['Missing_percentage']/len(marketing_train))*100

#descending order
missing_val = missing_val.sort_values('Missing_percentage', ascending = False).reset_index(drop = True)

In [616]:
missing_val.head()

Unnamed: 0,Variables,Missing_percentage
0,schooling,29.066631
1,custAge,24.332344
2,day_of_week,9.589965
3,poutcome,0.0
4,pastEmail,0.0


In [617]:
marketing_train['schooling'].value_counts()

high.school          3473
university.degree    1554
unknown               232
Name: schooling, dtype: int64

**Imputing Missing Values**

In [618]:
#imputation method
#Actual value = 29
#Mean = 40.01
#Median = 38

In [619]:
#Checking value
marketing_train['custAge'].loc[70]

29.0

In [620]:
#create missing value
marketing_train['custAge'].loc[70] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [621]:
marketing_train['custAge'].loc[70]

nan

In [622]:
#Impute with median - CustAge.
marketing_train['custAge'] = marketing_train['custAge'].fillna(marketing_train['custAge'].median())

In [623]:
marketing_train['custAge'].loc[70]

38.0

**#Imputing Schooling**

In [624]:
marketing_train['schooling'] = marketing_train.groupby("profession")['schooling'].transform(lambda x: x.fillna(x.mode()[0]))

** #Imputing "day_of_week" **

In [625]:
marketing_train['day_of_week'] = marketing_train.groupby(["profession",'month'])['day_of_week'].transform(lambda x: x.fillna(x.mode()[0]))

In [626]:
missing_val = pd.DataFrame(marketing_train.isnull().sum())

In [627]:
missing_val

Unnamed: 0,0
custAge,0
profession,0
marital,0
schooling,0
housing,0
loan,0
contact,0
month,0
day_of_week,0
campaign,0


In [628]:
#Converting Categorical to Numerical
marketing_train=pd.get_dummies(marketing_train,columns=['profession',
            'marital','schooling','contact','month','day_of_week','poutcome'])

In [629]:
marketing_train.shape

(7414, 37)

In [630]:
marketing_train.head()

Unnamed: 0,custAge,housing,loan,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,month_jun,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,55.0,0,0,1,999,0,-0.1,93.2,-42.0,4.191,...,0,0,0,1,0,0,0,0,1,0
1,38.0,0,0,1,999,0,1.4,93.918,-42.7,4.96,...,0,1,0,1,0,0,0,0,1,0
2,42.0,0,0,1,999,0,1.1,93.994,-36.4,4.857,...,1,0,0,1,0,0,0,0,1,0
3,55.0,1,1,2,999,0,1.4,93.918,-42.7,4.962,...,0,1,0,0,0,0,1,0,1,0
4,38.0,1,0,5,999,1,-1.8,92.893,-46.2,1.291,...,1,0,0,0,0,1,0,1,0,0


In [631]:
def vif_cal(input_data, dependent_col):
    import statsmodels.formula.api as sm
    x_vars = input_data.drop([dependent_col],axis=1)
    xvar_names = x_vars.columns
    for i in range(0,len(xvar_names)):
        y = x_vars[xvar_names[i]]
        x = x_vars[xvar_names.drop(xvar_names[i])]
        rsq = sm.ols("y~x",x_vars).fit().rsquared
        vif = round(1/(1-rsq),2)
        print(xvar_names[i], "VIF: ", vif)
        
vif_cal(marketing_train,'responded')

custAge VIF:  1.41
housing VIF:  1.02
loan VIF:  1.01
campaign VIF:  1.05
pdays VIF:  88765.85
previous VIF:  7.54
emp.var.rate VIF:  61.77
cons.price.idx VIF:  9.61
cons.conf.idx VIF:  3.83
euribor3m VIF:  129.37
nr.employed VIF:  42.81
pmonths VIF:  88068.52
pastEmail VIF:  2.8


  if __name__ == '__main__':


profession_admin. VIF:  inf
profession_blue-collar VIF:  inf
profession_retired VIF:  inf
profession_student VIF:  inf
marital_divorced VIF:  inf
marital_married VIF:  inf
marital_single VIF:  inf
schooling_high.school VIF:  inf
schooling_university.degree VIF:  inf
schooling_unknown VIF:  inf
contact_cellular VIF:  inf
contact_telephone VIF:  inf
month_dec VIF:  inf
month_jun VIF:  inf
month_sep VIF:  inf
day_of_week_fri VIF:  inf
day_of_week_mon VIF:  inf
day_of_week_thu VIF:  inf
day_of_week_tue VIF:  inf
day_of_week_wed VIF:  inf
poutcome_failure VIF:  inf
poutcome_nonexistent VIF:  inf
poutcome_success VIF:  inf


In [632]:
# Avoiding the Dummy Variable Trap
marketing_train = marketing_train.drop(['profession_blue-collar'],1)
marketing_train = marketing_train.drop(['poutcome_success'],1)
marketing_train = marketing_train.drop(['month_jun'],1)
marketing_train = marketing_train.drop(['poutcome_nonexistent'],1)
marketing_train = marketing_train.drop(['marital_divorced'],1)
marketing_train = marketing_train.drop(['contact_cellular'],1)
marketing_train = marketing_train.drop(['pmonths'],1)
marketing_train = marketing_train.drop(['euribor3m'],1)
marketing_train = marketing_train.drop(['nr.employed'],1)
marketing_train = marketing_train.drop(['day_of_week_thu'],1)
marketing_train = marketing_train.drop(['schooling_unknown'],1)
marketing_train = marketing_train.drop(['schooling_high.school'],1)
marketing_train = marketing_train.drop(['previous'],1)

In [633]:
vif_cal(marketing_train,'responded')

custAge VIF:  1.4
housing VIF:  1.01
loan VIF:  1.01
campaign VIF:  1.04
pdays VIF:  1.81
emp.var.rate VIF:  4.39
cons.price.idx VIF:  3.88
cons.conf.idx VIF:  1.58
pastEmail VIF:  2.19
profession_admin. VIF:  1.57
profession_retired VIF:  1.25
profession_student VIF:  1.1
marital_married VIF:  2.54
marital_single VIF:  2.8
schooling_university.degree VIF:  1.57
contact_telephone VIF:  3.13
month_dec VIF:  1.36
month_sep VIF:  2.66
day_of_week_fri VIF:  1.54
day_of_week_mon VIF:  1.72
day_of_week_tue VIF:  1.57
day_of_week_wed VIF:  1.54
poutcome_failure VIF:  1.83


In [634]:
X = marketing_train.drop('responded',axis=1)
X=np.array(X)
y = marketing_train['responded']

**Train test split**

In [635]:
from sklearn.model_selection import train_test_split
y = marketing_train['responded']
X = marketing_train.drop('responded',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

**Applying StandardScalar**

In [636]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train_scale = scale.fit_transform(X_train)
X_train = pd.DataFrame(X_train_scale)
X_test_scale =scale.fit_transform(X_test)
X_test = pd.DataFrame(X_test_scale)

In [637]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,-0.702128,0.967785,-0.422156,-0.570718,0.205363,-0.07928,-0.625309,-0.293429,-0.293992,1.312055,...,1.567709,1.479214,-0.744564,2.656684,-0.70044,2.216956,-0.595354,-0.476552,-0.462286,-0.351368
1,-1.892188,-1.033288,-0.422156,-0.570718,0.205363,-1.158267,-1.152548,-1.192616,-0.293992,-0.762163,...,1.567709,-0.676035,-0.744564,-0.376409,-0.70044,-0.451069,-0.595354,-0.476552,-0.462286,-0.351368
2,-0.161191,0.967785,-0.422156,0.189146,0.205363,0.872767,0.607779,-0.443294,-0.293992,-0.762163,...,-0.637873,-0.676035,-0.744564,-0.376409,1.427673,-0.451069,1.679674,-0.476552,-0.462286,-0.351368
3,1.461618,0.967785,-0.422156,2.468737,0.205363,0.682358,0.738301,0.905486,-0.293992,-0.762163,...,-0.637873,-0.676035,1.343068,-0.376409,-0.70044,2.216956,-0.595354,-0.476552,-0.462286,-0.351368
4,-0.161191,0.967785,-0.422156,-0.570718,0.205363,0.872767,1.547193,-0.250611,-0.293992,-0.762163,...,-0.637873,-0.676035,1.343068,-0.376409,-0.70044,-0.451069,1.679674,-0.476552,-0.462286,-0.351368


In [316]:
#Applying PCA
#from sklearn.decomposition import PCA
#pca=PCA()
#X_train=pca.fit_transform(X_train)

In [317]:
#np.cumsum(pca.explained_variance_ratio_)

array([0.12752921, 0.22169254, 0.29476343, 0.35895667, 0.42120141,
       0.47928334, 0.53226119, 0.58411533, 0.6344186 , 0.68088729,
       0.72610798, 0.76738678, 0.80734317, 0.84455354, 0.88133735,
       0.91454443, 0.93857445, 0.95594253, 0.96707922, 0.97804645,
       0.98733671, 0.99443926, 1.        ])

**Building the Model**

**Decision Tree Model**

In [638]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

**Prediction**

In [639]:
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))     

             precision    recall  f1-score   support

         no       0.92      0.89      0.91      1664
        yes       0.25      0.31      0.28       190

avg / total       0.85      0.83      0.84      1854



In [640]:
print(confusion_matrix(y_test,predictions))
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

[[1489  175]
 [ 131   59]]


0.8349514563106796

In [None]:
#n-estimator =900
#F-score of "yes"-0.31
#n-estimator =600
#F-score of "yes"-0.34

**Random Forest model**

In [641]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=600)

In [642]:
#Fit the model
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [643]:
predictions = rfc.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

         no       0.92      0.98      0.95      1664
        yes       0.55      0.27      0.36       190

avg / total       0.88      0.90      0.89      1854



In [644]:
print(confusion_matrix(y_test,predictions))
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

[[1623   41]
 [ 139   51]]


0.9029126213592233

**Logistic regression model**

In [645]:
#Fit the model
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [646]:
#prediction
predictions = logmodel.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

         no       0.92      0.98      0.95      1664
        yes       0.58      0.23      0.33       190

avg / total       0.88      0.90      0.88      1854



In [647]:
accuracy_score(y_test, predictions)
print(confusion_matrix(y_test,predictions))

[[1633   31]
 [ 147   43]]


***Naive Bayes Model ***


In [648]:
from sklearn.naive_bayes import GaussianNB
classifier=GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [649]:
#Prediction
y_pred=classifier.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

         no       0.92      0.98      0.95      1664
        yes       0.58      0.23      0.33       190

avg / total       0.88      0.90      0.88      1854



In [650]:
accuracy_score(y_test, predictions)
print(confusion_matrix(y_test,predictions))

[[1633   31]
 [ 147   43]]


**SVM Model**

In [651]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

**Model Evaluation**

In [652]:
predictions = svc_model.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

         no       0.91      0.98      0.95      1664
        yes       0.58      0.18      0.28       190

avg / total       0.88      0.90      0.88      1854

