Ver 005, 17/06/2017. 
* RandomForest Included
* DataFrame with Failures included at bottom. Can be seen that models fail to preduct Male behaviour
* convert manually categorical features into numeric to avoid dummy


Ver 003, 16/06/2017. Pending: Cross-validation, compare with solution, compare train vs test scores

In [369]:
import pandas as pd
import numpy as np
import os

In [370]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [371]:
def diff(list1, list2):
    '''Get elements difference between two lists'''
    
    c = set(list1).union(set(list2))
    d = set(list1).intersection(set(list2))
    return list(c - d)

In [372]:
# Use only the following features from the Customers Database
customer_features = ['CustomerID', 'BirthDate', 'Education', 'Occupation', 
                     'Gender','MaritalStatus','HomeOwnerFlag','NumberCarsOwned',
                     'NumberChildrenAtHome','TotalChildren','YearlyIncome','LastUpdated']

# Use the following features from the Sales Database
sales_features = ['CustomerID', 'BikeBuyer', 'AvgmonthSpend']

# Use only the following features for model creation
analysis_features = ['Education',
                     'Occupation',
                     'Gender',
                     'MaritalStatus',
                     #'TotalChildren',
                     'IncomeNorm',
                     #'AgeGroup',
                     'Age',
                     #'YearlyIncome',
                     'NumberCarsOwned',
                     'NumberChildrenAtHome',
                     #'HomeOwnerFlag',
                    ]

# discard unnecesary features, diff between customer features and analysis features
#drop_features = diff(customer_features,analysis_features)

# define features to be treated as numeric and not dummy
numeric_features = ['IncomeNorm','Age']#, 'NumberCarsOwned', NumberChildrenAtHome', 'TotalChildren'

# Use the following features as dummy
dummy_features = diff(numeric_features, analysis_features)

In [373]:
def read_database():
    '''Read Customer and Sales Database'''
    
    path = (r'C:\Home00Ser\Python\MPPDS Project\Datasets')
    filename = 'AWCustomers.csv'
    pathfile= os.path.join(path,filename)  
    df_customers = pd.read_csv(pathfile, sep=',', header=0, usecols=customer_features)
    df_customers.drop_duplicates(['CustomerID'], keep='first', inplace=True)   
    
    filename = 'AWSales.csv'
    pathfile= os.path.join(path,filename)  
    df_sales = pd.read_csv(pathfile, sep=',', header=0, usecols=sales_features)    
    
    df = pd.merge(df_customers, df_sales, how='inner',on='CustomerID') #left_index=True, right_index=True)  
    
    #df.reset_index(inplace=True, drop=True)
    return df

In [374]:
def read_testfile():
    ''' Read  dataset to Classify (AWTest-classification) '''
    
    path = (r'C:\Home00Ser\Python\MPPDS Project\Datasets')
    filename = 'AWTest-Classification.csv'  
    pathfile= os.path.join(path,filename)  
    df_x = pd.read_csv(pathfile, sep=',', header=0, usecols=customer_features)  
 
    filename = 'AWTest-ClassificationLabels.csv'  
    pathfile= os.path.join(path,filename)  
    df_y = pd.read_csv(pathfile, sep=',', header=0, usecols=['CustomerID', 'BikeBuyer'])         
    
    df = pd.merge(df_x, df_y, how='inner',on='CustomerID') #left_index=True, right_index=True) 
       
    df.reset_index(inplace=True, drop=True)     
    return df

In [375]:
def read_solution():
    ''' Read  Classify solution (AWTest-ClassificationLabels) '''
    
    path = (r'C:\Home00Ser\Python\MPPDS Project\Datasets')
    filename = 'AWTest-ClassificationLabels.csv'  
    pathfile= os.path.join(path,filename)  
    df = pd.read_csv(pathfile, sep=',', header=0, usecols=['CustomerID', 'BikeBuyer'])  
    df.reset_index(inplace=True, drop=True)    
    return df

In [376]:
def data_preprocessing(df):
    '''
    Prepare data for Analysis. 
    Merge number of categories within a given feature
    Create derived columns such as IncomeNorm and AgeGroup
    '''
    # Group into [0,1,>1] instead of [0,1,2,3,4,5]
    # df['NumberCarsOwned'].replace([2,3,4,5],'>1',inplace=True)
        
    # Group into [0,>0] instead of [0,1,2,3]
    # df['NumberChildrenAtHome'].replace([1,2,3],'>0',inplace=True) 
    
    # Group into [0,12,3] instead of [0,1,2,3]
    # df['TotalChildren'].replace([1,2],'12', inplace=True)
    
    #Collapse YearlyIncome feature into one single income range from 0 to 12K (see capstone report)
    categories = ['Manual', 'Skilled Manual', 'Clerical', 'Management', 'Professional']
    i=1
    df['IncomeNorm']=0
    for category in categories:
        mask = df['Occupation']==category
        offset = i*25400
        df.loc[mask,'IncomeNorm'] = df.loc[mask,'YearlyIncome']- offset
        i +=1  
    
    # Transform BirthDate onto Age using LastUpdated feature as reference
    df['BirthDate'] = pd.to_datetime(df['BirthDate'])
    df['LastUpdated'] = pd.to_datetime(df['LastUpdated'])
    df['Age'] = df['LastUpdated'].dt.year - df['BirthDate'].dt.year     
    
    # create new feature. Group customers into AgeGroups by Age
    df['AgeGroup'] = 1  
    mask = df['Age']<= 20
    df.loc[mask, 'AgeGroup']= 1    
    mask = (df['Age'] >20) & (df['Age'] <= 25)
    df.loc[mask, 'AgeGroup']= 2
    mask = (df['Age'] >25) & (df['Age'] <= 30)
    df.loc[mask, 'AgeGroup']= 3        
    mask = (df['Age'] >30) & (df['Age'] <= 40)
    df.loc[mask, 'AgeGroup']= 4 
    mask = (df['Age'] >40) & (df['Age'] <= 50)
    df.loc[mask, 'AgeGroup']= 5        
    mask = (df['Age'] >50) & (df['Age'] <= 60)
    df.loc[mask, 'AgeGroup']= 6
    mask = (df['Age'] >60)
    df.loc[mask, 'AgeGroup']= 7        
    return df

In [377]:
def DTree(xtrain, ytrain, xtest, ytest, x_class):
    ''' DECISION TREE '''
    
    model = DecisionTreeClassifier(max_depth=8, random_state=None,criterion='gini',splitter='best')
    model.fit(xtrain,ytrain)
    featureimp=pd.DataFrame(model.feature_importances_,index=x_data.columns,columns=['rank'])
    topfeatures= featureimp.sort_values('rank', ascending=False) 
    Test_score=model.score(xtest,ytest)
    Train_score=model.score(xtrain,ytrain)
    Predict=model.predict(x_class)
    Proba=model.predict_proba(x_class)    
    return topfeatures, Test_score, Train_score, Predict

In [378]:
def RandForest(xtrain, ytrain, xtest, ytest, x_class):
    ''' RANDOMFOREST '''

    model = RandomForestClassifier(n_estimators=200,
                                   oob_score=True, 
                                   random_state=None, 
                                   max_depth=10)
    model.fit(xtrain, ytrain)
    Test_score=model.score(xtest,ytest)
    Train_score=model.score(xtrain,ytrain)
    Predict=model.predict(x_class)
    Proba=model.predict_proba(x_class)    
    return Test_score, Train_score, Predict    

In [379]:
# MAIN

# Create dataframe from customers database
data = read_database()
data = data_preprocessing(data)  
y_data = data['BikeBuyer'].copy()
dropcols = diff(analysis_features, data.columns)
data.drop(dropcols, axis=1, inplace=True)
x_data = pd.get_dummies(data, columns=dummy_features)

# Create dataframe from test file to classify
df_class = read_testfile()
df_class = data_preprocessing(df_class)
df_labels = df_class['BikeBuyer'].copy()
dropcols = diff(analysis_features, df_class.columns)
df_class.drop(dropcols, axis=1, inplace=True)
x_class = pd.get_dummies(df_class, columns=dummy_features)
# Padd with Zeros missing columns in x_class
for col in x_data.columns:
    if col not in x_class.columns:
        x_class[col]=0 
      

In [380]:
# Call to ML Algorithms

(xtrain, xtest, ytrain, ytest) = train_test_split(x_data, y_data, test_size=0.25, random_state=None)
DTfeat, DT_Test_score, DT_Train_score, DTpredict = DTree(xtrain, ytrain, xtest, ytest, x_class)
RF_Test_score, RF_Train_score, RFpredict = RandForest(xtrain, ytrain, xtest, ytest, x_class)

Col1 = DTpredict
Col2 = RFpredict
Col3 = df_labels
Col4 = abs(Col1 - Col3)
Col5 = abs(Col2 - Col3)

dictio = {'DT':Col1, 'RF':Col2, 'Sol': Col3, 'DTx': Col4, 'RFx':Col5 }
df = pd.DataFrame(dictio)
#df = df.reindex_axis(['P0','P1','AB','GB', 'RF', 'DT'], axis=1) # include Probabilities in df
df = df.reindex_axis(['DT','RF','Sol', 'DTx', 'RFx'], axis=1)

In [381]:
# RESULTS

print "Decision Tree Train Score:{}".format(round((DT_Train_score*100),3))
print "Decision Tree Test Score:{}".format(round((DT_Test_score*100),3))

Decision Tree Train Score:80.822
Decision Tree Test Score:79.255


In [382]:
print "Random Forest Train Score:{}".format(round((RF_Train_score*100),3))
print "Random Forest Test Score:{}".format(round((RF_Test_score*100),3))

Random Forest Train Score:81.919
Random Forest Test Score:78.993


In [383]:
# RESULTS
print "Features Rank:{}".format(DTfeat)

Features Rank:                                   rank
NumberChildrenAtHome_0         0.304544
NumberCarsOwned_1              0.199395
Occupation_Manual              0.151759
IncomeNorm                     0.107082
Age                            0.053529
Education_Partial High School  0.049828
MaritalStatus_S                0.035557
Gender_F                       0.031685
NumberCarsOwned_2              0.030061
Gender_M                       0.019031
MaritalStatus_M                0.007258
Occupation_Skilled Manual      0.003273
Occupation_Management          0.001790
NumberChildrenAtHome_1         0.001189
Education_High School          0.001046
Education_Graduate Degree      0.000845
NumberCarsOwned_4              0.000741
Education_Partial College      0.000697
NumberCarsOwned_3              0.000580
Occupation_Professional        0.000112
NumberCarsOwned_5              0.000000
NumberChildrenAtHome_2         0.000000
NumberChildrenAtHome_3         0.000000
Education_Bachelors       

In [384]:
df

Unnamed: 0,DT,RF,Sol,DTx,RFx
0,1,1,1,0,0
1,1,1,1,0,0
2,1,1,0,1,1
3,1,1,0,1,1
4,1,1,0,1,1
5,1,1,1,0,0
6,1,1,0,1,1
7,1,1,1,0,0
8,1,1,1,0,0
9,1,1,1,0,0


In [385]:
# Results
prediction = (50-df.sum()[3])/50.
print "DT Real Score:{}".format(round((prediction*100),3))

DT Real Score:66.0


In [386]:
# Results
prediction = (50-df.sum()[4])/50.
print "RF Real Score:{}".format(round((prediction*100),3))

RF Real Score:68.0


In [387]:
#data

In [388]:
print('instances where prediction fails on DT')
mask = df['DTx']==True
df_class[mask]


instances where prediction fails on DT


Unnamed: 0,Education,Occupation,Gender,MaritalStatus,NumberCarsOwned,NumberChildrenAtHome,IncomeNorm,Age
2,Partial College,Clerical,M,M,3,0,7413,57
3,Graduate Degree,Professional,F,S,2,0,3551,28
4,Graduate Degree,Professional,M,S,2,0,8915,31
6,Graduate Degree,Clerical,F,M,2,0,2034,36
11,High School,Manual,M,S,1,0,10785,31
14,Bachelors,Clerical,F,M,2,2,2141,54
18,Partial College,Clerical,F,M,3,0,9599,36
19,High School,Clerical,M,S,1,0,11499,21
21,High School,Skilled Manual,F,S,2,0,548,42
24,Bachelors,Management,F,M,3,0,2440,24


In [389]:
print('instances where prediction fails on RF')
mask = df['DTx']==True
df_class[mask]

instances where prediction fails on RF


Unnamed: 0,Education,Occupation,Gender,MaritalStatus,NumberCarsOwned,NumberChildrenAtHome,IncomeNorm,Age
2,Partial College,Clerical,M,M,3,0,7413,57
3,Graduate Degree,Professional,F,S,2,0,3551,28
4,Graduate Degree,Professional,M,S,2,0,8915,31
6,Graduate Degree,Clerical,F,M,2,0,2034,36
11,High School,Manual,M,S,1,0,10785,31
14,Bachelors,Clerical,F,M,2,2,2141,54
18,Partial College,Clerical,F,M,3,0,9599,36
19,High School,Clerical,M,S,1,0,11499,21
21,High School,Skilled Manual,F,S,2,0,548,42
24,Bachelors,Management,F,M,3,0,2440,24
