# Week7 - Decision Tree Lab

* Train-test split
* Train a decison tree model
* Train a random forest model
* Evaluate the models
* Explain findings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv')
y = df['left_company']
X = df.iloc[:, 1:]


**Changing the values of left_company column from Yes and No to 1 and 0.**

In [2]:
y = np.where(df['left_company'] == 'Yes', 1, 0)

## Train - test split

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

print(f'Training examples: {X_train.shape[0]:,}')
print(f'Test examples: {X_test.shape[0]:,}')

Training examples: 750
Test examples: 250


**Divide the numerical and categorical features.**

In [4]:
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()

# Print the numerical and categorical features
print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

Numerical Features: ['age', 'commuting_distance', 'education', 'satisfaction_with_environment', 'seniority_level', 'satisfaction_with_job', 'last_raise_pct', 'last_performance_rating', 'total_years_working', 'years_at_company', 'years_in_current_job', 'years_since_last_promotion', 'years_with_current_supervisor']
Categorical Features: ['frequency_of_travel', 'department', 'gender', 'position', 'married_or_single']


## Training decision tree model using Pipelines and Grid Search

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

processing_pipeline = ColumnTransformer(transformers=[
    ('numscaling', StandardScaler(), numerical_features),
    ('dummys', OneHotEncoder(drop='first'), categorical_features)]
)

modeling_pipeline = Pipeline([
    ('data_processing', processing_pipeline),
    ('dt', DecisionTreeClassifier())]
)

param_grid = [
  {'dt__max_depth': [2, 5, 10, 15, 20],
   'dt__min_samples_split':[3, 5, 10, 20, 40],
   'dt__min_samples_leaf': [2, 5],
   'dt__class_weight':[None]
  }
 ]

gcv_results = GridSearchCV(estimator=modeling_pipeline, 
                           param_grid=param_grid, scoring='accuracy', refit=True)
gcv_results = gcv_results.fit(X_train, y_train)

y_testp = gcv_results.predict(X_test)
y_testpr_lr = gcv_results.predict_proba(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_testp))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       116
           1       0.84      0.79      0.82       134

    accuracy                           0.81       250
   macro avg       0.81      0.81      0.81       250
weighted avg       0.81      0.81      0.81       250



In [6]:
y_trainp = gcv_results.predict(X_train)
y_trainp_rf = gcv_results.predict_proba(X_train)

from sklearn.metrics import classification_report
print(classification_report(y_train, y_trainp))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       384
           1       0.98      0.94      0.96       366

    accuracy                           0.96       750
   macro avg       0.96      0.96      0.96       750
weighted avg       0.96      0.96      0.96       750



In [7]:
y_testp = gcv_results.predict(X_test)
y_testp_rf = gcv_results.predict_proba(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_testp))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       116
           1       0.84      0.79      0.82       134

    accuracy                           0.81       250
   macro avg       0.81      0.81      0.81       250
weighted avg       0.81      0.81      0.81       250



## Training Random Forest model using Pipelines and Grid Search

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

processing_pipeline = ColumnTransformer(transformers=[
    ('numscaling', StandardScaler(), numerical_features),
    ('dummys', OneHotEncoder(drop='first'), categorical_features)]
)

rf_pipeline = Pipeline([
    ('data_processing', processing_pipeline),
    ('rf', RandomForestClassifier())]
)

param_grid = [{'rf__max_depth': [5, 8, 10, 12],
               'rf__n_estimators': [10, 50, 100],
               'rf__class_weight': [None, 'balanced', 'balanced_subsample'],
               'rf__max_samples': [100,200,300]
              }]

rf_results = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, scoring='recall', refit=True)
rf_results = rf_results.fit(X_train, y_train)
rf_yhat = rf_results.predict(X_test)

rf_results.best_estimator_

In [9]:
y_trainp = rf_results.predict(X_train)
y_trainp_rf = rf_results.predict_proba(X_train)

from sklearn.metrics import classification_report
print(classification_report(y_train, y_trainp))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       384
           1       0.98      0.98      0.98       366

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750



In [10]:
y_testp = rf_results.predict(X_test)
y_testp_rf = rf_results.predict_proba(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_testp))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80       116
           1       0.84      0.81      0.82       134

    accuracy                           0.81       250
   macro avg       0.81      0.81      0.81       250
weighted avg       0.81      0.81      0.81       250



**From the above outputs of both the models, it is clear that random forest fits better for this data than decision tree because the training accuracy of random forest is 98 percent and that of testing accuracy is 81 whereas for decision tree, training accuracy is 96 and testing accuracy is 81. To my knowledge the random tree is the collection of decision trees together and the results are generated randomly, which means the values change often. Both of these models have training accuracy higher than testing accuracy which means that these models lead to overfitting. These overfitting can be overcomed by looking into the data, doing some feature extraction, dimensionality reduction, and so on.** 