In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm

In [41]:
# Reading the training set
Demographics_reduced = pd.read_csv('Demographics_reduced.csv')
Demographics_reduced.head(5)

print(Demographics_reduced.shape)

(16519, 9)


In [31]:
Demographics_reduced.describe()

Unnamed: 0,TotalChildren,YearlyIncome
count,16519.0,16519.0
mean,2.006235,78122.511532
std,1.682736,39710.985718
min,0.0,9482.0
25%,0.0,47807.5
50%,2.0,76120.0
75%,3.0,105194.5
max,5.0,196511.0


In [40]:
BikeBuyer_reduced = pd.read_csv('BikeBuyer_reduced.csv')
BikeBuyer_reduced.head(10)
BikeBuyer_reduced['Bikebuyer'][32]

nan

In [33]:
BikeBuyer_reduced.describe()
(BikeBuyer_reduced.astype(np.object) == 'nan').any() # ?,NaN, etc

Bikebuyer    False
dtype: bool

## Prepare data for scikit-learn model

Creating the labels for scikitlear

In [35]:
labels = BikeBuyer_reduced.values
print(labels[:50])
np.where(np.isnan(labels) == True)

[[ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [nan]
 [ 0.]
 [nan]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]


(array([   32,    34,   100,   126,   151,   187,   194,   251,   257,
          297,   390,   402,   445,   487,   656,   675,   703,   875,
          903,   928,   986,  1030,  1205,  1247,  1254,  1592,  1717,
         1734,  1905,  1923,  1928,  2057,  2078,  2138,  2188,  2359,
         2397,  2919,  2989,  3121,  3187,  3195,  3222,  3245,  3295,
         3302,  3316,  3393,  3744,  4039,  4057,  4082,  4113,  4128,
         4230,  4326,  4463,  4518,  4525,  4552,  4663,  4711,  4898,
         5135,  5303,  5316,  5508,  5546,  5823,  5865,  5928,  6143,
         6296,  6351,  6434,  6459,  6706,  6713,  6865,  6899,  6946,
         6963,  6993,  7154,  7198,  7221,  7353,  7385,  7493,  7503,
         7639,  8081,  8172,  8313,  8510,  8875,  9194,  9242,  9420,
        10510, 10544, 10684, 10768, 10789, 11196, 11557, 11721, 12303,
        12510, 12732, 12829, 13494, 13520, 13948, 15443]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0

In [22]:
# Define which are the categorical variables
cat_cols = [x  for x in Demographics_reduced.columns if pd.api.types.is_string_dtype(Demographics_reduced[x])]
print(cat_cols)

# Define which are the Numerical variables
numeric_cols = [x  for x in Demographics_reduced.columns if pd.api.types.is_numeric_dtype(Demographics_reduced[x])]
print(numeric_cols)

['Education', 'Occupation', 'Gender', 'MaritalStatus', 'AgeGroup', 'CarGroup', 'Children']
['TotalChildren', 'YearlyIncome']


create the numpy feature array or model matrix. As first step, the categorical variables need to be recoded as binary dummy variables. As discussed in another lesson this is a three step process:

Encode the categorical string variables as integers.
Transform the integer coded variables to dummy variables.
Append each dummy coded categorical variable to the model matrix.

In [6]:
def encode_string(cat_features):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

# Creating the first Feature

Features = encode_string(Demographics_reduced[cat_cols[0]])

for col in cat_cols[1::]:
    temp = encode_string((Demographics_reduced[col]))
    print('***********************************\n')
    print("%s has %s dummy variables"%(col,len(Demographics_reduced[col].unique())))
    print("These are the dummy variables %s"%(Demographics_reduced[col].unique()))
    Features = np.concatenate([Features, temp], axis = 1)

print(Features.shape)
print(Features[:2, :])   

***********************************

Occupation has 5 dummy variables
These are the dummy variables ['Professional' 'Management' 'Skilled Manual' 'Clerical' 'Manual']
***********************************

Gender has 2 dummy variables
These are the dummy variables ['M' 'F']
***********************************

MaritalStatus has 2 dummy variables
These are the dummy variables ['M' 'S']
***********************************

AgeGroup has 4 dummy variables
These are the dummy variables ['Group_2' 'Group_1' 'Group_3' 'Group_4']
***********************************

CarGroup has 3 dummy variables
These are the dummy variables ['No_car' '1-2' '3 or more']
***********************************

Children has 2 dummy variables
These are the dummy variables ['No_children' 'With_Children']
(16519, 23)
[[1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the On

Next the numeric features must be concatenated to the numpy array 

In [7]:
Features = np.concatenate([Features, np.array(Demographics_reduced[numeric_cols])], axis = 1)
print(Features.shape)
print(Features[:2, :])  

(16519, 25)
[[1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00 2.00000e+00
  1.37947e+05]
 [1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 3.00000e+00
  1.01141e+05]]


### Create Test and Sample cases

In [8]:
## Randomly sample cases to create independent training and test data
nr.seed(9988)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 500)
X_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
X_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

### Scaling 

In [9]:
n_numerical = len(numeric_cols)
scaler = preprocessing.StandardScaler().fit(X_train[:,-n_numerical:])
X_train[:,-n_numerical:] = scaler.transform(X_train[:,-n_numerical:])
X_test[:,-n_numerical:] = scaler.transform(X_test[:,-n_numerical:])
X_train[:2,]

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.78201105,  0.70143097],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        , -0.00312027, -0.35945194]])

## Construct the logistic regression model

In [19]:
np.where(np.isnan(y_train)== True)

(array([  200,   423,   477,   617,   751,   846,   896,  1359,  1493,
         1510,  1532,  1570,  1610,  1670,  1912,  2471,  2833,  3010,
         3152,  3363,  3858,  4021,  4202,  4820,  4866,  4939,  5011,
         5059,  5096,  5097,  5102,  5128,  5215,  5264,  5358,  5609,
         5628,  5675,  5700,  6123,  6169,  6442,  6474,  6561,  6628,
         6693,  6874,  6973,  7115,  7136,  7176,  7385,  7457,  7685,
         7861,  8148,  8548,  8811,  8860,  8948,  9019,  9318,  9512,
         9520,  9578,  9913,  9939, 10101, 10199, 10208, 10429, 10667,
        10719, 10738, 10803, 10813, 10919, 10947, 10978, 11085, 11214,
        11270, 11577, 11826, 11933, 12184, 12282, 12566, 12686, 12700,
        12927, 13218, 13242, 13287, 13697, 13976, 14044, 14154, 14210,
        14338, 14390, 14960, 14999, 15282, 15316, 15404, 15481, 15571,
        15583, 15791]),)

In [10]:
logistic_mod = linear_model.LogisticRegression() 
logistic_mod.fit(X_train, y_train)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').