In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# Location of the bank-marketing CSV, relative to this notebook.
url = "../Data/bank.csv"
# Parse the file into a DataFrame and preview its first five rows.
BankData = pd.read_csv(filepath_or_buffer=url)
BankData.head(n=5)

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
BankData.describe()

Unnamed: 0,age,balance,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,3025.0,50.0,871.0,25.0


In [4]:
BankData['y'].unique()  # unique() lists the distinct values a column takes; here, the possible outcomes of y

array(['no', 'yes'], dtype=object)

For the data dictionary, please refer to https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

#### Our goal is to build a model that best predicts the outcome y - the success of the marketing campaign

First let's create dummy variables for default, marital, housing, and loan

In [5]:
# Encode each categorical predictor as 0/1 indicator columns, then delete the
# level(s) that act as the baseline so one indicator remains per variable.
Default_dummy  = pd.get_dummies(BankData['default'], prefix = 'default')
del Default_dummy['default_no']

# Only marital_single is kept: married and divorced are pooled as the baseline.
marital_dummy  = pd.get_dummies(BankData['marital'], prefix = 'marital')
del marital_dummy['marital_married']
del marital_dummy['marital_divorced']

housing_dummy  = pd.get_dummies(BankData['housing'], prefix = 'housing')
del housing_dummy['housing_no']

loan_dummy = pd.get_dummies(BankData['loan'], prefix = 'loan')
del loan_dummy['loan_no']

dummy_frames = [marital_dummy, Default_dummy, housing_dummy, loan_dummy]
dummy_names = [name for frame in dummy_frames for name in frame.columns]

# Strip any indicator columns appended by an earlier execution of this cell
# before concatenating; otherwise every re-run adds another copy of each
# indicator and later column selections return duplicated columns.
BankData_base = BankData.loc[:, ~BankData.columns.isin(dummy_names)]
BankData = pd.concat([BankData_base] + dummy_frames, axis=1)
BankData.head()



Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y,marital_single,default_yes,housing_yes,loan_yes
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no,0,0,0,0
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no,0,0,1,1
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no,1,0,1,0
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no,0,0,1,1
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no,0,0,1,0


In [6]:
# Predictors: the six numeric columns followed by the four indicator columns.
feature_names = [
    'age', 'balance', 'duration', 'campaign', 'pdays', 'previous',
    'marital_single', 'default_yes', 'housing_yes', 'loan_yes',
]
X = BankData[feature_names]
# Response: the campaign outcome column.
y = BankData['y']

#### Fit a logistic regression model on your inputs and output

In [10]:
# Fit a logistic regression of the campaign outcome y on the predictors in X.
logreg = LogisticRegression()
logreg.fit(X,y)
# Take the labels from X itself so each coefficient is paired with the column
# it was actually fitted on, instead of a hand-copied list that can drift.
cols = list(X.columns)
# A single pre-formatted string prints the same text under both the Python 2
# print statement and the Python 3 print function.
print("Intercept %s" % (logreg.intercept_,))
# list(...) materializes the (name, coefficient) pairs so the cell output shows
# them on Python 3, where zip returns a lazy iterator.
list(zip(cols,logreg.coef_[0]))

Intercept [-3.05725819]


[('age', 0.0051429265832905122),
 ('balance', 1.0957510862913199e-05),
 ('duration', 0.0037811246979828645),
 ('campaign', -0.088491740844304209),
 ('pdays', 0.0024235422412227825),
 ('previous', 0.094805874502587997),
 ('marital_single', 0.26441639441819964),
 ('default_yes', 0.20313288795293041),
 ('housing_yes', -0.9412700412106767),
 ('loan_yes', -0.79258474799341727)]

#### What is your 10-fold cross-validation error?

In [None]:
# cross_val_score returns one score per fold; for a classifier the default
# score is accuracy, so the cross-validation error asked for here is one minus
# the mean of the 10 fold accuracies.
cv_accuracy = cross_val_score(logreg,X,y,cv=10)
1 - cv_accuracy.mean()

#### Construct a confusion matrix.

In [None]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_hat = logreg.predict(X)
# Tabulate observed labels against predicted labels.
confusion_matrix(y_true=y, y_pred=y_hat)

#### Interpret your coefficients. (At least interpret campaign, marital_single, and default_yes.) Do your interpretations make sense?

Answer: 

#### What is your prediction for a person who is 30 years old, has a balance of 1000 dollars, with duration = 210, has been contacted 3 times for this campaign (campaign = 3), with pdays = 100, has previously been contacted 4 times, is single, has never defaulted, is a home owner, and doesn't have any loan?

#### Now standardize your data - you can use the standardization method used for KNN algorithms.

In [None]:
def Standardize(X):
    """Rescale X to the [0, 1] interval with min-max scaling.

    Each value is mapped to (value - min) / (max - min). For a DataFrame the
    minimum and maximum are taken per column; for a Series or array, X.min()
    and X.max() are single values over all of X.

    A zero range (every value identical) is treated as a range of 1, so such
    data maps to 0 instead of producing NaN/inf from a division by zero.

    Parameters
    ----------
    X : object exposing .min() and .max() and supporting arithmetic
        (e.g. pandas DataFrame or Series, numpy array).

    Returns
    -------
    The rescaled data, of the same kind as X.
    """
    X_Min = X.min()
    X_Range = X.max() - X_Min
    if np.ndim(X_Range) == 0:
        # Scalar range (Series or array input).
        if X_Range == 0:
            X_Range = 1
    else:
        # Per-column ranges (DataFrame input): patch only the constant columns.
        X_Range = X_Range.where(X_Range != 0, 1)
    X_Standardized = (X - X_Min) / X_Range
    return X_Standardized



#### Use 10-fold cross validation to find the best tuning parameter - C.

#### Now use the best C you found above and repeat your analysis and look over your coefficients

In [None]:
# It will be easier if you zip the variable names with your coefficients


#### If you would like to drop 3 variables from your analysis, which variables are you going to choose?

Answer: 