In [1]:
# group members: Sammy, Akhtar, Chad, Tony
import os
import statsmodels.api as sm

import numpy as np
import pandas as pd
# Keep rendered DataFrames compact: cap rows and columns at 10 and use the HTML repr.
for option_name, option_value in (
    ('display.max_rows', 10),
    ('display.notebook_repr_html', True),
    ('display.max_columns', 10),
):
    pd.set_option(option_name, option_value)

from sklearn import linear_model, cross_validation
import matplotlib.pyplot as plt
from sklearn import preprocessing, neighbors, grid_search, cross_validation, linear_model, metrics

%matplotlib inline
plt.style.use('ggplot')



In [None]:
# Sammy, Akhtar, Chad, Tony

In [2]:
# read dataset (file name is resolved relative to the notebook's working directory)
df = pd.read_csv('dataset-12-walget-train.csv')

In [3]:
# Columns that must be populated for a row to be kept. AccountHolderImpliedGender
# is deliberately absent, so rows with a missing gender survive this step.
required_columns = [
    'AccountHolderAddress',
    'RecentlyPurchasedPregnancyTest',
    'RecentlyPurchasedBirthControl',
    'RecentlyPurchasedFeminineHygieneProducts',
    'RecentlyPurchasedFolicAcidSupplements',
    'RecentlyPurchasedPrenatalVitamins',
    'RecentlyPurchasedPrenatalYogaDVD',
    'RecentlyPurchasedBodyPillow',
    'RecentlyPurchasedGingerAle',
    'RecentlyPurchasedSeaBands',
    'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedCigarettes',
    'RecentlyPurchasedSmokingCessationProducts',
    'PurchasedWineRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedWine',
    'RecentlyPurchasedMaternityClothing',
    'IsPregnant',
]

# Drop, in place, every row that has NaN in any of the required columns.
df.dropna(subset=required_columns, inplace=True)

In [4]:
# Recode boolean flags as integers (False -> 0, then True -> 1) and display the frame.
# NOTE(review): the rendered output below still shows IsPregnant as True/False;
# confirm whether the recode is expected to reach that column.
df = df.replace(False, 0).replace(True, 1)
df

Unnamed: 0,AccountHolderImpliedGender,AccountHolderAddress,RecentlyPurchasedPregnancyTest,RecentlyPurchasedBirthControl,RecentlyPurchasedFeminineHygieneProducts,...,RecentlyPurchasedSmokingCessationProducts,PurchasedWineRegularlyUntilRecentlyThenStopped,RecentlyPurchasedWine,RecentlyPurchasedMaternityClothing,IsPregnant
0,Female,Apartment,0,1,0,...,0,0,0,0,False
1,Male,Apartment,0,0,1,...,0,0,1,0,False
2,Male,Apartment,0,0,0,...,0,0,0,0,False
3,Male,Apartment,0,0,0,...,0,0,1,0,False
4,Male,Apartment,0,0,1,...,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...
594,Female,Apartment,0,0,0,...,0,0,0,0,True
596,Female,Home,0,0,0,...,0,0,0,0,True
597,Female,Apartment,0,0,0,...,0,0,0,1,True
598,Female,Home,0,0,1,...,0,0,0,0,True


In [5]:
# One-hot encode the two multi-valued categorical columns.
# dummy_na=True also emits a Gender_nan indicator for rows with a missing gender.
Address_df = pd.get_dummies(df['AccountHolderAddress'], prefix='Address')
Gender_df = pd.get_dummies(
    df['AccountHolderImpliedGender'],
    prefix='Gender',
    dummy_na=True,
)

In [6]:
# Append the one-hot indicator columns to the main frame.
encoded_frames = [Address_df, Gender_df]
df = df.join(encoded_frames)

In [None]:
# did not use this block
# 50/50 split: draw a seeded random half of the rows for training and keep the
# remaining rows (those whose index labels were not sampled) for testing.
train_df = df.sample(frac=.5, random_state=0)
test_df = df.drop(train_df.index)

In [None]:
# Show the (rows, columns) shape of the training sample.
train_df.shape # did not use this block

In [None]:
# Show pandas' summary statistics for df.
df.describe()# did not use this block

In [None]:
# did not use this block
class LassoPolynomial(object):
    """Lasso regression fitted on a polynomial expansion of the inputs.

    Pairs ``preprocessing.PolynomialFeatures(n)`` with ``linear_model.Lasso(alpha)``
    and exposes a small fit / predict / score interface over the two.
    """

    def __init__(self, alpha, n):
        """Build the Lasso model (argument ``alpha``) and the feature expander (argument ``n``)."""
        self.model = linear_model.Lasso(alpha)
        self.features = preprocessing.PolynomialFeatures(n)

    def _expand(self, X):
        """Run ``X`` through the (already fitted) polynomial feature transformer."""
        return self.features.transform(X)

    def fit(self, X, y):
        """Fit the expander on ``X``, then the Lasso model on the expanded ``X``.

        The raw ``X`` is kept on ``self.X``. Returns ``self`` so calls can be chained.
        """
        self.X = X
        self.features.fit(X)
        self.model.fit(self._expand(X), y)
        return self

    def predict(self, X):
        """Return the Lasso model's predictions for the expanded ``X``."""
        return self.model.predict(self._expand(X))

    def score(self, X, y):
        """Return the Lasso model's ``score`` of the expanded ``X`` against ``y``."""
        return self.model.score(self._expand(X), y)

    def mean_squared_error(self, X, y):
        """Return the mean squared error between the predictions for ``X`` and ``y``."""
        predictions = self.predict(X)
        return metrics.mean_squared_error(predictions, y)

    def complexity(self):
        """Return the sum of absolute values of the fitted coefficients."""
        return np.abs(self.model.coef_).sum()

In [None]:
# did not use this block
# Columns fed to the model: every purchase flag plus the one-hot address/gender indicators.
test_feature_columns = [
    'RecentlyPurchasedPregnancyTest',
    'RecentlyPurchasedBirthControl',
    'RecentlyPurchasedFeminineHygieneProducts',
    'RecentlyPurchasedFolicAcidSupplements',
    'RecentlyPurchasedPrenatalVitamins',
    'RecentlyPurchasedPrenatalYogaDVD',
    'RecentlyPurchasedBodyPillow',
    'RecentlyPurchasedGingerAle',
    'RecentlyPurchasedSeaBands',
    'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedCigarettes',
    'RecentlyPurchasedSmokingCessationProducts',
    'PurchasedWineRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedWine',
    'RecentlyPurchasedMaternityClothing',
    'Address_Apartment',
    'Address_Home',
    'Address_PO Box',
    'Gender_Female',
    'Gender_Male',
    'Gender_nan',
]

# Held-out feature matrix, passed through statsmodels' add_constant.
test_X = sm.add_constant(test_df[test_feature_columns])

# Held-out target.
test_y = test_df['IsPregnant']


In [None]:
# did not use this block
# Columns fed to the model: every purchase flag plus the one-hot address/gender indicators.
train_feature_columns = [
    'RecentlyPurchasedPregnancyTest',
    'RecentlyPurchasedBirthControl',
    'RecentlyPurchasedFeminineHygieneProducts',
    'RecentlyPurchasedFolicAcidSupplements',
    'RecentlyPurchasedPrenatalVitamins',
    'RecentlyPurchasedPrenatalYogaDVD',
    'RecentlyPurchasedBodyPillow',
    'RecentlyPurchasedGingerAle',
    'RecentlyPurchasedSeaBands',
    'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedCigarettes',
    'RecentlyPurchasedSmokingCessationProducts',
    'PurchasedWineRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedWine',
    'RecentlyPurchasedMaternityClothing',
    'Address_Apartment',
    'Address_Home',
    'Address_PO Box',
    'Gender_Female',
    'Gender_Male',
    'Gender_nan',
]

# Training feature matrix (passed through statsmodels' add_constant) and target.
train_X = sm.add_constant(train_df[train_feature_columns])
train_y = train_df['IsPregnant']

# Lasso penalties to sweep, strongest first.
alphas = [1e-1, 1e-2, 1e-3, 1e-5, 1e-7, 1e-9, 1e-11]

# NOTE(review): a degree-17 polynomial expansion of roughly 22 input columns is
# combinatorially large (on the order of 10^10 terms) -- confirm this degree is
# intended before running the cell.
poly_degree = 17
models = [LassoPolynomial(alpha, poly_degree).fit(train_X, train_y) for alpha in alphas]

# One row per alpha: train/test score and the L1 size of the coefficients.
model_df = pd.DataFrame({'alpha': alphas, 'model': models})
model_df['R^2 (train)'] = model_df.model.apply(lambda fitted: fitted.score(train_X, train_y))
model_df['R^2 (test)'] = model_df.model.apply(lambda fitted: fitted.score(test_X, test_y))
model_df['Complexity'] = model_df.model.apply(lambda fitted: fitted.complexity())

# Only the final expression of a cell is rendered, so the summary table has to be
# printed explicitly; as a bare mid-cell expression its value was discarded.
print(model_df.drop('model', axis = 1).set_index('alpha'))

# Coefficients of the model fitted with the smallest alpha (last entry in `alphas`).
models[-1].model.coef_

In [28]:
# split training and test data 
# 60% of the rows are sampled (seeded, so the split is repeatable) for training;
# the rows whose index labels were not sampled become the test set.
redo_train_df = df.sample(frac=0.6, random_state=0)
redo_test_df = df.drop(redo_train_df.index, axis=0)

In [9]:
# List every column of the training split so candidate features can be picked by name.
redo_train_df.columns # get all the features 

Index([u'AccountHolderImpliedGender', u'AccountHolderAddress',
       u'RecentlyPurchasedPregnancyTest', u'RecentlyPurchasedBirthControl',
       u'RecentlyPurchasedFeminineHygieneProducts',
       u'RecentlyPurchasedFolicAcidSupplements',
       u'RecentlyPurchasedPrenatalVitamins',
       u'RecentlyPurchasedPrenatalYogaDVD', u'RecentlyPurchasedBodyPillow',
       u'RecentlyPurchasedGingerAle', u'RecentlyPurchasedSeaBands',
       u'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedCigarettes',
       u'RecentlyPurchasedSmokingCessationProducts',
       u'PurchasedWineRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedWine', u'RecentlyPurchasedMaternityClothing',
       u'IsPregnant', u'Address_Apartment', u'Address_Home', u'Address_PO Box',
       u'Gender_Female', u'Gender_Male', u'Gender_nan'],
      dtype='object')

In [69]:
# create a list of features that we believe can help predict whether a household is expecting (IsPregnant)
# Feature columns used by the logistic-regression model. The order matters:
# fitted coefficients are reported in this same order.
names_X = [
    'RecentlyPurchasedBodyPillow',
    'RecentlyPurchasedMaternityClothing',
    'RecentlyPurchasedBirthControl',
    'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
    'RecentlyPurchasedPrenatalYogaDVD',
    'RecentlyPurchasedWine',
    'Gender_Male',
    'RecentlyPurchasedFeminineHygieneProducts',
    'Gender_nan',
    'RecentlyPurchasedFolicAcidSupplements',
    'RecentlyPurchasedCigarettes',
    'RecentlyPurchasedPrenatalVitamins',
]

In [70]:
# separate the X variables (features) from y (the IsPregnant target)
def X_c(df, names=None):
    """Split a frame into its feature matrix and the ``IsPregnant`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``IsPregnant`` column and every selected feature column.
    names : list of str, optional
        Feature columns to select. When omitted, the notebook-level ``names_X``
        list is used, which matches the original one-argument behaviour.

    Returns
    -------
    tuple
        ``(X, c)`` where ``X`` is ``df`` restricted to the selected columns and
        ``c`` is ``df['IsPregnant']``.
    """
    # Resolve the global lazily so existing single-argument callers keep working.
    feature_names = names_X if names is None else list(names)
    return df[feature_names], df['IsPregnant']

# Apply the same feature/target split to both partitions (training first, then test).
(redo_train_X, redo_train_c), (redo_test_X, redo_test_c) = map(
    X_c, (redo_train_df, redo_test_df)
)

In [71]:
# runs the model 
# Fit a logistic-regression classifier, with default settings, on the training split.
model = (
    linear_model.LogisticRegression()
    .fit(redo_train_X, redo_train_c)
)

In [72]:
# Fitted intercept, then the coefficient row (one value per entry of names_X, in order).
# Single-argument print(...) produces the same output on Python 2 and is valid on Python 3.
print(model.intercept_)
print(model.coef_)

[ 0.12122817]
[[ 0.78609045  1.73942039 -1.95873236  0.81773687  0.15328004 -1.48152657
  -0.5316515  -1.78544548  0.08140966  2.51231569 -1.10824803  1.84650842]]


In [73]:
# Misclassification rate = 1 - the classifier's score on each split.
train_error = 1 - model.score(redo_train_X, redo_train_c)
test_error = 1 - model.score(redo_test_X, redo_test_c)

# %-formatting inside a single-argument print(...) emits the same text as the
# Python 2 print statement while remaining valid Python 3.
print('training misclassification = %s' % train_error)
print('testing  misclassification = %s' % test_error)

training misclassification = 0.198653198653
testing  misclassification = 0.237373737374


In [74]:
# Pair each feature with exp(coefficient) - 1: the fractional change in the odds
# of IsPregnant associated with that flag being set (e.g. 1.19 means +119%).
# list(...) keeps the pairs visible on Python 3, where zip is a lazy iterator.
list(zip(names_X, np.exp(model.coef_[0]) - 1))

[('RecentlyPurchasedBodyPillow', 1.1947989629959679),
 ('RecentlyPurchasedMaternityClothing', 4.6940421344776535),
 ('RecentlyPurchasedBirthControl', -0.85896290760773963),
 ('PurchasedCigarettesRegularlyUntilRecentlyThenStopped', 1.2653672180736932),
 ('RecentlyPurchasedPrenatalYogaDVD', 0.16565136322814622),
 ('RecentlyPurchasedWine', -0.77270955051810453),
 ('Gender_Male', -0.41236630899998283),
 ('RecentlyPurchasedFeminineHygieneProducts', -0.83227767244810669),
 ('Gender_nan', 0.084815206646369656),
 ('RecentlyPurchasedFolicAcidSupplements', 11.333457495213722),
 ('RecentlyPurchasedCigarettes', -0.66986315438596922),
 ('RecentlyPurchasedPrenatalVitamins', 5.3376524561039078)]