# Exploring Decision trees

---

## 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2. Prepare data

In [2]:
#path to dataset
path='D://Downloads/Supervised Machine Learning/car_prices/titanic/train.csv'
#create dataframe
df=pd.read_csv(path)
#head of dataframe
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#brief description of dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
#summary information for columns
print(f"num_cols\n{'-'*50}\n{df.describe().transpose()}\n\n")
print(f"text_cols\n{'-'*50}\n{df.describe(include='object').transpose()}")

num_cols
--------------------------------------------------
             count        mean         std   min       25%       50%    75%  \
PassengerId  891.0  446.000000  257.353842  1.00  223.5000  446.0000  668.5   
Survived     891.0    0.383838    0.486592  0.00    0.0000    0.0000    1.0   
Pclass       891.0    2.308642    0.836071  1.00    2.0000    3.0000    3.0   
Age          714.0   29.699118   14.526497  0.42   20.1250   28.0000   38.0   
SibSp        891.0    0.523008    1.102743  0.00    0.0000    0.0000    1.0   
Parch        891.0    0.381594    0.806057  0.00    0.0000    0.0000    0.0   
Fare         891.0   32.204208   49.693429  0.00    7.9104   14.4542   31.0   

                  max  
PassengerId  891.0000  
Survived       1.0000  
Pclass         3.0000  
Age           80.0000  
SibSp          8.0000  
Parch          6.0000  
Fare         512.3292  


text_cols
--------------------------------------------------
         count unique                      top freq


In [5]:
#create clean dataframe
df_clean=df.copy()
#fill missing values in Age and Embarked columns
mean=df_clean['Age'].mean()
mode=df_clean['Embarked'].mode()[0]
df_clean.replace({'Age':{np.nan: mean}, 'Embarked':{np.nan: mode}}, inplace=True)
#drop Cabin column
df_clean=df_clean.drop('Cabin', axis=1)

In [6]:
#head of df_clean
df_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [7]:
df_clean.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Embarked       False
dtype: bool

## 3. Create column transformer

In [8]:
#import scikit-learn classes
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
#lists of ordinal and nominal columns
cat_ordered=['Pclass']
cat_unordered=['Sex', 'Embarked']
#pipeline for ordinal columns
ordered_pipeline=make_pipeline(OrdinalEncoder())
#pipeline for nominal columns
unordered_pipeline=make_pipeline(OneHotEncoder())

preprocessor=ColumnTransformer([
    ('ordered_pipeline', ordered_pipeline, cat_ordered),
    ('unordered_pipeline', unordered_pipeline, cat_unordered),
], remainder='passthrough')


In [9]:
#transform columns
index=df_clean.columns.drop(['Survived','PassengerId', 'Ticket', 'Name'])
X=df_clean[index]
Y=df_clean['Survived']
X_preprocess=preprocessor.fit_transform(X)

## 4. Create models

In [130]:
#import required classes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

In [58]:
#split data into training and validation sets
X_train, X_val, y_train, y_val=train_test_split(X_preprocess, Y, test_size=0.3, random_state=0)

### Model 1

In [59]:
#train decision tree model
clf=tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [60]:
#print f1 score, precision and recall
y_pred=clf.predict(X_val)
f1=f1_score(y_val, y_pred)
precision=precision_score(y_val, y_pred)
recall=recall_score(y_val, y_pred)
print(f'Precision: {precision:.3f}\tRecall: {recall:.3f}\tf1 Score: {f1:.3f}')

Precision: 0.691	Recall: 0.650	f1 Score: 0.670


In [61]:
#see number of true and false positives and negatives
confusion_matrix(y_val, y_pred)

array([[139,  29],
       [ 35,  65]], dtype=int64)

In [None]:
#Check if model is overfitting or underfitting


### Model 2

In [68]:
#train model
forest=RandomForestClassifier(criterion='entropy')
forest.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy')

In [69]:
#print f1 score, precision and recall
y_pred=forest.predict(X_val)
f1=f1_score(y_val, y_pred)
precision=precision_score(y_val, y_pred)
recall=recall_score(y_val, y_pred)
print(f'Precision: {precision:.3f}\tRecall: {recall:.3f}\tf1 Score: {f1:.3f}')

Precision: 0.791	Recall: 0.720	f1 Score: 0.754


In [70]:
#see number of true and false positives and negatives
confusion_matrix(y_val, y_pred)

array([[149,  19],
       [ 28,  72]], dtype=int64)

### Model 3

In [134]:
#train model
logit=LogisticRegression(C=10, max_iter=10000, solver='saga')
logit.fit(X_train_, y_train)

LogisticRegression(C=10, max_iter=10000, solver='saga')

In [136]:
#print f1 score, precision and recall
y_pred=logit.predict(X_val)
f1=f1_score(y_val, y_pred)
precision=precision_score(y_val, y_pred)
recall=recall_score(y_val, y_pred)
print(f'Precision: {precision:.3f}\tRecall: {recall:.3f}\tf1 Score: {f1:.3f}')

Precision: 0.660	Recall: 0.350	f1 Score: 0.458


In [97]:
#see number of true and false positives and negatives
confusion_matrix(y_val, y_pred)

array([[143,  25],
       [ 28,  72]], dtype=int64)

In [101]:
params={'C':[0.01, 0.1, 1, 10, 100],
       'solver':['lbfgs', 'liblinear', 'newton-cg', 'saga', 'sag']}
cv=GridSearchCV(estimator=LogisticRegression(max_iter=10000, C=0.1),
                param_grid=params,
                scoring='f1'
               )
cv.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegression(C=0.1, max_iter=10000),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'solver': ['lbfgs', 'liblinear', 'newton-cg', 'saga',
                                    'sag']},
             scoring='f1')

In [None]:
#results of gridsearch
cv.cv_results_

In [103]:
#best estimator
cv.best_estimator_

LogisticRegression(C=10, max_iter=10000, solver='saga')

In [131]:
X_train_poly=PolynomialFeatures(degree=2).fit_transform(X_train)
X_val_poly=PolynomialFeatures(degree=2).fit_transform(X_val)

In [133]:
X_train_poly.shape

(623, 66)

### Model 4

In [118]:
#train model
bag_logit=BaggingClassifier(LogisticRegression(C=10, max_iter=10000, solver='saga'), n_estimators=100)
bag_logit.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(C=10, max_iter=10000,
                                                    solver='saga'),
                  n_estimators=100)

In [119]:
#print f1 score, precision and recall
y_pred=bag_logit.predict(X_val)
f1=f1_score(y_val, y_pred)
precision=precision_score(y_val, y_pred)
recall=recall_score(y_val, y_pred)
print(f'Precision: {precision:.3f}\tRecall: {recall:.3f}\tf1 Score: {f1:.3f}')

Precision: 0.739	Recall: 0.680	f1 Score: 0.708


In [120]:
#see number of true and false positives and negatives
confusion_matrix(y_val, y_pred)

array([[144,  24],
       [ 32,  68]], dtype=int64)

### Model 5

In [122]:
#train moddel
voting=VotingClassifier([
    ('logit', bag_logit),
    ('forest', RandomForestClassifier(criterion='entropy'))
], voting='soft')
voting.fit(X_train, y_train)

VotingClassifier(estimators=[('logit',
                              BaggingClassifier(base_estimator=LogisticRegression(C=10,
                                                                                  max_iter=10000,
                                                                                  solver='saga'),
                                                n_estimators=100)),
                             ('forest',
                              RandomForestClassifier(criterion='entropy'))],
                 voting='soft')

In [127]:
#print f1 score, precision and recall
y_pred=voting.predict(X_val)
f1=f1_score(y_val, y_pred)
precision=precision_score(y_val, y_pred)
recall=recall_score(y_val, y_pred)
print(f'Precision: {precision:.3f}\tRecall: {recall:.3f}\tf1 Score: {f1:.3f}')

Precision: 0.795	Recall: 0.700	f1 Score: 0.745


In [126]:
#see number of true and false positives and negatives
confusion_matrix(y_val, y_pred)

array([[150,  18],
       [ 30,  70]], dtype=int64)