In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the two Titanic CSV files from the working directory: the first frame is
# used to fit the model, the second only to generate predictions.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Fill missing ages with the mean age of the TRAINING set. The same training
# statistic is reused for the test set so both frames are preprocessed
# identically and nothing is estimated from the rows we later predict on.
# Results are kept as plain NumPy arrays, as the feature-building cells expect.
age_fill_value = df_train.Age.mean()
train_age = df_train.Age.fillna(age_fill_value).to_numpy()
test_age = df_test.Age.fillna(age_fill_value).to_numpy()

In [5]:
# Encoder used to map the values of the Sex column to integer codes.
label_en = preprocessing.LabelEncoder()

In [6]:
# Learn the category -> integer mapping from the training data only, then apply
# that exact mapping to the test data. Re-fitting on the test set could assign
# different codes if its categories differed from training; `transform` instead
# raises a ValueError on a category the encoder has never seen.
train_gender = label_en.fit_transform(df_train.Sex)
test_gender = label_en.transform(df_test.Sex)

In [7]:
# Training design matrix: one row per passenger, columns in the order
# imputed age, fare, encoded gender (default integer column labels).
features = pd.DataFrame(np.column_stack((train_age, df_train.Fare, train_gender)))

In [8]:
# Depth-limited decision tree on the three Titanic features. random_state is
# pinned because scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs when splits tie.
model1 = tree.DecisionTreeClassifier(max_depth=6, random_state=0)
model1.fit(X=features, y=df_train.Survived)

DecisionTreeClassifier(max_depth=6)

In [9]:
# Mean accuracy on the same rows the tree was fitted on: this measures how well
# the tree fits the training data, not how it performs on unseen passengers.
model1.score(features, df_train['Survived'])

0.8256467941507312

In [10]:
# Test design matrix with the same column order as the training matrix:
# imputed age, fare, encoded gender.
features_test = pd.DataFrame(np.column_stack((test_age, df_test.Fare, test_gender)))

In [11]:
# Predicted Survived label for every row of the test design matrix.
pred = model1.predict(features_test)

In [12]:
# Pair each test passenger's id with the label predicted for that passenger.
pred_output = pd.DataFrame({"P_Id": df_test["PassengerId"], "Survived": pred})

In [13]:
# Preview the first rows of the prediction table.
pred_output.head()

Unnamed: 0,P_Id,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [14]:
# Write the fitted Titanic tree to disk in Graphviz DOT format.
# export_graphviz writes directly to the handle given as out_file, so its
# return value is not kept: assigning it back would rebind the name of the
# still-open file handle inside the with-block.
with open('DecisionTree1.dot', 'w') as dot_file:
    tree.export_graphviz(model1, feature_names=['AGE', 'FARE', 'GENDER'], out_file=dot_file)

# Project 2: Build Decision Tree for Attrition

In [17]:
# Load the employee data used for the attrition model and preview its first rows.
df = pd.read_csv('general_data.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [18]:
# Replace missing values in these two numeric columns with the column's own mean.
for col_name in ('NumCompaniesWorked', 'TotalWorkingYears'):
    df[col_name] = df[col_name].fillna(df[col_name].mean())

In [19]:
# Single encoder instance reused (re-fitted) for each column encoded below.
le = preprocessing.LabelEncoder()

In [20]:
# Working copy that receives the integer-encoded columns, so `df` keeps the
# original string categories.
df_temp = df.copy()

In [21]:
# Integer-encode each listed column (including the Attrition target), reading
# from `df` and writing into the working copy `df_temp`. The one encoder is
# re-fitted per column, so afterwards it only holds the classes of the last
# column processed.
categorical_cols = ['Attrition', 'BusinessTravel', 'Department', 'EducationField',
                    'Gender', 'JobRole', 'MaritalStatus', 'Over18']
for col_name in categorical_cols:
    df_temp[col_name] = le.fit_transform(df[col_name])

In [22]:
# Random forest used to rank the attrition features. random_state is fixed so
# the bootstrap samples and per-split feature subsets, and therefore the OOB
# score and importances, are the same on every run.
model2 = RandomForestClassifier(n_estimators=2000, max_features=2, oob_score=True, random_state=0)

In [23]:
# Candidate predictors passed to the random forest for the attrition model.
# NOTE(review): 'Over18' gets an importance of exactly 0.0 in the recorded
# output, and every previewed row shows the same value 'Y' -- presumably the
# column is constant and could be dropped; confirm against the full data.
attr_train_feats = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 
                'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 
                'Over18', 'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 
                'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [24]:
# Fit the forest on every candidate predictor against the encoded Attrition target.
model2.fit(df_temp[attr_train_feats], df_temp['Attrition'])

RandomForestClassifier(max_features=2, n_estimators=2000, oob_score=True)

In [25]:
# Out-of-bag accuracy estimate of the forest (available because it was built
# with oob_score=True).
# NOTE(review): the recorded value is exactly 1.0, which is unusually high for
# an out-of-bag estimate -- check the data for duplicated rows or a feature
# that leaks the target before trusting it.
model2.oob_score_

1.0

In [26]:
# Print every attrition predictor next to its importance in the fitted forest,
# in the same order as the feature list.
importances = model2.feature_importances_
for position, feat_name in enumerate(attr_train_feats):
    print(feat_name, '-->', importances[position])

Age --> 0.09689470120121112
BusinessTravel --> 0.02816396531492494
Department --> 0.026073968608441567
DistanceFromHome --> 0.06969389030184246
Education --> 0.04070425204062446
EducationField --> 0.04125477487410765
Gender --> 0.018055215365472402
JobLevel --> 0.03781892949576132
JobRole --> 0.055826394759764544
MaritalStatus --> 0.039229592919796184
MonthlyIncome --> 0.09299361158582269
NumCompaniesWorked --> 0.05656847422964905
Over18 --> 0.0
PercentSalaryHike --> 0.0658613451422663
StockOptionLevel --> 0.03406115693617814
TotalWorkingYears --> 0.0851337182612995
TrainingTimesLastYear --> 0.044896773206183065
YearsAtCompany --> 0.06968021011784974
YearsSinceLastPromotion --> 0.04319792009557654
YearsWithCurrManager --> 0.053891105543228374


In [27]:
# Reduced feature subset used to fit and score the single decision tree below.
# These are the six predictors with the largest importances in the recorded
# random-forest output. Despite the name, this is not a held-out test set.
attr_test_feat = ['Age','DistanceFromHome', 'MonthlyIncome', 'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany']

In [28]:
# Decision tree for attrition on the reduced feature subset. random_state is
# pinned because scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs when splits tie.
model3 = tree.DecisionTreeClassifier(max_depth=12, random_state=0)

In [29]:
# Fit the attrition tree on the reduced feature subset.
model3.fit(df_temp[attr_test_feat], df_temp['Attrition'])

DecisionTreeClassifier(max_depth=12)

In [30]:
# Mean accuracy on the same rows the tree was fitted on: a measure of training
# fit, not an estimate of accuracy on unseen employees.
model3.score(df_temp[attr_test_feat], df_temp['Attrition'])

0.9480725623582766

# Build Decision Tree for Bank Loan Modelling

In [35]:
# Load the 'Data' sheet of the bank-loan workbook and preview its first rows.
df3 = pd.read_excel('bank_loan.xlsx', sheet_name='Data')
df3.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [36]:
# Random forest used to rank the bank-loan features. random_state is fixed so
# the bootstrap samples and per-split feature subsets, and therefore the OOB
# score and importances, are the same on every run.
model4 = RandomForestClassifier(n_estimators=2000, max_features=2, oob_score=True, random_state=0)

In [37]:
# Candidate predictors for the 'Personal Loan' target: every previewed column
# except the target itself, 'ID' and 'ZIP Code'.
bank_train_feats =[ 'Age', 'Experience', 'Income', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account',
       'CD Account', 'Online', 'CreditCard']

In [38]:
# Fit the forest on every candidate predictor against the Personal Loan target.
model4.fit(df3[bank_train_feats], df3['Personal Loan'])

RandomForestClassifier(max_features=2, n_estimators=2000, oob_score=True)

In [39]:
# Out-of-bag accuracy estimate of the forest (available because it was built
# with oob_score=True).
model4.oob_score_

0.9872

In [40]:
# Print every bank-loan predictor next to its importance in the fitted forest,
# in the same order as the feature list.
importances = model4.feature_importances_
for position, feat_name in enumerate(bank_train_feats):
    print(feat_name, '-->', importances[position])

Age --> 0.0446626235395599
Experience --> 0.04392147578321907
Income --> 0.3418891306526585
Family --> 0.09631919313771431
CCAvg --> 0.18501903492266214
Education --> 0.165659680549485
Mortgage --> 0.04383614721992585
Securities Account --> 0.0054787259219068
CD Account --> 0.05487524880606597
Online --> 0.00840701134637976
CreditCard --> 0.009931728120422572


In [41]:
# Reduced feature subset used to fit and score the single decision tree below.
# Despite the name, this is not a held-out test set.
# NOTE(review): 'Income' has the largest importance in the recorded
# random-forest output (about 0.34) but is not included here, while lower-ranked
# 'Age' and 'Mortgage' are -- confirm the omission is intentional.
bank_test_feats = ['Family', 'CCAvg', 'Education', 'CD Account','Age','Mortgage']

In [42]:
# Decision tree for the bank-loan target on the reduced feature subset.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ between runs when splits tie.
model5 = tree.DecisionTreeClassifier(max_depth=12, random_state=0)

In [43]:
# Fit the bank-loan tree on the reduced feature subset.
model5.fit(df3[bank_test_feats], df3['Personal Loan'])

DecisionTreeClassifier(max_depth=12)

In [44]:
# Mean accuracy on the same rows the tree was fitted on: a measure of training
# fit, not an estimate of accuracy on unseen customers.
model5.score(df3[bank_test_feats], df3['Personal Loan'])

0.9868