In [649]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import os

In [650]:
# Columns considered for the regression model (some are dropped before fitting):
# ['CustomerID','Education','Occupation','Gender','MaritalStatus','NumberCarsOwned',
#  'NumberChildrenAtHome','TotalChildren','HomeOwnerFlag','Age','IncomeNorm',
#  'BikeBuyer', 'AvgMonthSpend']

In [651]:
# Read the customers database used to train the regression model.
# The first CSV column is loaded as the index.
path = (r'C:\Home\Ser\MOOCs\MSDataProg\Project\AWCustomers\Datasets')
filename = 'CustomersDatabase002.csv'
pathfile = os.path.join(path, filename)
df = pd.read_csv(pathfile, sep=',', index_col=0)

# Group 'NumberCarsOwned' into [0,1,>1] instead of [0,1,2,3,4,5].
# Assign the result back explicitly: calling replace(..., inplace=True) on a
# column selection is not guaranteed to modify the parent frame.
# (The column is currently dropped below; the grouping only matters if it is
# taken out of `dropcols`.)
df['NumberCarsOwned'] = df['NumberCarsOwned'].replace([2, 3, 4, 5], '>1')

# Remove features not to be used in the regression model, such as 'BikeBuyer',
# and features ranked low in the feature_importance output.
dropcols = ['BikeBuyer', 'HomeOwnerFlag', 'NumberCarsOwned', 'TotalChildren']

# reset_index(drop=True) discards the loaded index (CustomerID, per the
# original note) so it cannot leak into the model as a feature.
df = df.reset_index(drop=True).drop(dropcols, axis=1)

# `data` is the same object as `df`; later cells mutate it in place.
data = df

In [652]:
# Customer IDs whose AvgMonthSpend is to be estimated by the model.
SelectedCustomers = [
    11908, 12286, 12892, 14698, 15085,
    15727, 17418, 17723, 18644, 19218,
    20671, 20697, 20734, 20812, 23244,
    23454, 24420, 25193, 25793, 26410,
    26694, 28096, 28676, 29015, 29377,
]

In [653]:
# Read the dataset whose AvgMonthSpend is to be estimated (AWTest-regression).
# The first CSV column is loaded as the index so that rows can be selected
# by the labels listed in SelectedCustomers.
path = (r'C:\Home\Ser\MOOCs\MSDataProg\Project\AWCustomers\Datasets')
filename = 'CustomersTest002.csv'
pathfile = os.path.join(path, filename)
df_reg = pd.read_csv(pathfile, sep=',', index_col=0)

# Label-based row selection. `.loc` replaces the deprecated `.ix` indexer,
# which has been removed from recent pandas releases.
df_reg = df_reg.loc[SelectedCustomers]

# Discard the index so it is not carried into the feature matrix.
df_reg = df_reg.reset_index(drop=True)

# Drop the same low-importance features that are removed from the training data.
dropcols = ['TotalChildren', 'HomeOwnerFlag', 'NumberCarsOwned']
df_reg = df_reg.drop(dropcols, axis=1)

In [654]:
# Split the regression target off the feature frame: `pop` returns the column
# and removes it from `data` in place, leaving only predictor columns behind.
y_data = data.pop('AvgMonthSpend').copy()

In [655]:
# Categorical columns to one-hot encode.
# 'NumberCarsOwned', 'HomeOwnerFlag' and 'TotalChildren' are not listed:
# they were removed due to low importance.
dummies = ['Education',
           'Occupation',
           'Gender',
           'MaritalStatus',
           'NumberChildrenAtHome']

x_data = pd.get_dummies(data, columns=dummies)
x_reg = pd.get_dummies(df_reg, columns=dummies)

# Align x_reg with the training design matrix:
#  - dummy columns absent from x_reg (categories not present among the
#    selected customers) are added and filled with zeros;
#  - columns are put in exactly the same order as x_data, because the model
#    is given the features positionally;
#  - columns that do not exist in x_data are dropped.
# Appending the missing columns one by one would leave them at the end of
# x_reg, in a different order than in x_data.
x_reg = x_reg.reindex(columns=x_data.columns, fill_value=0)

In [656]:
# Select the data normalization method with i:
#   0 -> no scaling, 2 -> MinMaxScaler, 3 -> StandardScaler, 4 -> MaxAbsScaler
# The number next to each option is the value noted by the author for it
# (presumably the model score in % -- TODO confirm).
i = 0

scaler_classes = {
    2: preprocessing.MinMaxScaler,    # 65.45
    3: preprocessing.StandardScaler,  # 65.44
    4: preprocessing.MaxAbsScaler,    # 65.45
}

if i == 0:  # 65.44
    scalername = 'None'
    scaler = None
    xnorm = x_data
elif i in scaler_classes:
    # NOTE: the scaler is fitted on the full x_data, i.e. before the
    # train/test split performed later in the notebook.
    scaler = scaler_classes[i]().fit(x_data)
    scalername = type(scaler).__name__
    xnorm = scaler.transform(x_data)
else:
    # Fail loudly instead of leaving `xnorm` undefined (or stale from a
    # previous run of this cell) when an unsupported option is chosen.
    raise ValueError(
        "Unsupported normalization option i={}; expected one of {}".format(
            i, [0] + sorted(scaler_classes)))

In [657]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for scoring; the fixed seed keeps the split
# identical from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    xnorm,
    y_data,
    test_size=0.3,
    random_state=0,
)

In [658]:
# =DECISION TREE==========
from sklearn import tree

# Fixed random_state: scikit-learn permutes the features at each split even
# with splitter='best', so an unseeded tree can differ between runs.
model = tree.DecisionTreeRegressor(max_depth=7, random_state=0, splitter='best')
model.fit(xtrain, ytrain)

# Importance of every input column, most important first.
featureimp = pd.DataFrame(model.feature_importances_,
                          index=x_data.columns,
                          columns=['rank'])
topfeatures = featureimp.sort_values('rank', ascending=False)

# Score of the fitted tree on the held-out split.
DTscore = model.score(xtest, ytest)

# The customers to predict must go through the same normalization as the
# training data. With i == 0 no scaler was applied; otherwise reuse the
# scaler fitted in the normalization cell.
if i == 0:
    x_reg_norm = x_reg
else:
    x_reg_norm = scaler.transform(x_reg)
DTpredict = model.predict(x_reg_norm)

In [660]:
# Results
# print is called with a single parenthesized argument so the cell produces
# the same output on Python 2 and runs on Python 3 as well.

# Disabled: RFscore / RFpredict are not defined in the visible cells.
#print("RandomForest Score:{}".format(round(RFscore*100, 3)))
print("Decision Tree Score:{}".format(round(DTscore * 100, 3)))
print("Features Rank:{}".format(topfeatures))
#print("Random Forest Prediction:{}".format(RFpredict))
print("Decision Tree Prediction: \n{}".format(DTpredict))


Decision Tree Score:65.448
Features Rank:                                   rank
Age                            0.308195
Occupation_Manual              0.280719
Occupation_Skilled Manual      0.122171
Gender_F                       0.080301
Gender_M                       0.068211
NumberChildrenAtHome_0         0.051384
Occupation_Clerical            0.028239
MaritalStatus_M                0.021813
MaritalStatus_S                0.016184
IncomeNorm                     0.012999
NumberChildrenAtHome_1         0.003447
Occupation_Management          0.003363
Occupation_Professional        0.002077
Education_Bachelors            0.000296
Education_Partial College      0.000250
Education_Graduate Degree      0.000229
Education_High School          0.000122
Education_Partial High School  0.000000
NumberChildrenAtHome_2         0.000000
NumberChildrenAtHome_3         0.000000
Decision Tree Prediction: 
[ 49.05053333  49.05053333  49.05053333  50.31865854  48.36216216
  48.81103448  52.95290323