In [9]:
# Import our libraries

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization
import seaborn as sns
sns.set()
# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Helper function for hyper-parameter tuning.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Import our Random Forest
from sklearn.ensemble import RandomForestClassifier
# Use inline so our visualizations display in notebook
%matplotlib inline
#testing model import
from sklearn.model_selection import train_test_split

from sklearn.tree import plot_tree, export_text

## Main Steps when building a Machine Learning Model. 
1. Inspect and explore data.
2. Select and engineer features.
3. Build and train model.
4. Evaluate model.

# #1 Inspect and explore data.
* Load titanic data
* Visualize all the data using sns.pairplot
* Check for null values

In [10]:
# Read the Titanic passenger records from disk and preview the first rows.
titanic_data_path = 'data/titanic.csv'
df = pd.read_csv(filepath_or_buffer=titanic_data_path)
df.head(n=5)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Pairwise scatter/histogram grid over the columns of the frame.
# The trailing semicolon hides the returned object's repr in the notebook.
sns.pairplot(data=df);

In [12]:
# Count the missing values in each column.
df.isna().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

# #2 Select and engineer features.
1. Fill age null values with -999
1. Convert to numerical values if need be by using `pd.get_dummies()`
1. Create a list of the features you are going to use.  In this case use as many or as few as you would like.
1. Define our `X` and `y`
1. Split our data into training and testing sets.

In [13]:
# Replace missing ages with the sentinel value -999 so the column has no nulls.
df['age'] = df['age'].fillna(value=-999)

In [14]:
# 1. Convert the categorical columns to numerical indicator columns with
#    `pd.get_dummies()`; `drop_first=True` drops the first level of each column.
# Only encode the columns that are still present, so re-running this cell after
# `df` has already been encoded is a no-op instead of raising a KeyError.
categorical_columns = [col for col in ['sex', 'pclass', 'embarked'] if col in df.columns]
if categorical_columns:
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [15]:
# 2. Name the column we want to predict and the feature columns fed to the model.
target_variable = 'survived'
selected_features = [
    'age',
    'pclass_2',
    'pclass_3',
]

In [16]:
# Feature matrix (`x`) and target vector (`y`) taken from the prepared frame.
y = df[target_variable]
x = df.loc[:, selected_features]



In [17]:
# Hold out 20% of the rows for testing. `random_state` is the shuffle seed:
# it is an arbitrary number, but the same value (45) always reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

print('Lenght of our Training data:', len(x_train), '\nLength of our Testing data:', len(x_test))

Lenght of our Training data: 712 
Length of our Testing data: 179


# #3 Build and train model.
1. For our first pass, initialize our model with `max_depth=2`.
2. Fit our model with our training data. 
3. Make predictions of our testing data. 
4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores. 
    * To calculate the AUC score you have to get the predicted probabilities for the Survived class using `model.predict_proba(x_test)[:,1]`
5. Visualize our Decision Tree using provided code. 


In [18]:
# For our first pass, initialize our model with `max_depth=2`.
# `random_state` is fixed (same seed as the train/test split) because
# scikit-learn randomly permutes the features at each split, so equally good
# splits can otherwise be chosen differently from one run to the next.
model = DecisionTreeClassifier(max_depth=2, random_state=45)

In [19]:
# Train the tree on the training features and labels.
model.fit(X=x_train, y=y_train)


In [20]:
# Predict a class label for every row of the held-out test set.
y_pred = model.predict(X=x_test)

In [21]:
# 4. Evaluate the model on the test set: accuracy, precision, recall, f1 and auc.

# AUC needs scores rather than labels: take column 1 of `predict_proba`,
# i.e. the predicted probability of the positive class.
y_pred_proba = model.predict_proba(x_test)[:, 1]

# Compute every metric first...
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# ...then report them in one place, followed by the per-class breakdown.
print("Accuracy Score: %f" % accuracy)
print("Precision Score: %f" % precision)
print("Recall Score: %f" % recall)
print('F1 Score: %f' % f1)
print('AUC Score: %f' % auc)
print(classification_report(y_test, y_pred))

Accuracy Score: 0.703911
Precision Score: 0.621622
Recall Score: 0.370968
F1 Score: 0.464646
AUC Score: 0.752826
              precision    recall  f1-score   support

           0       0.73      0.88      0.80       117
           1       0.62      0.37      0.46        62

    accuracy                           0.70       179
   macro avg       0.67      0.63      0.63       179
weighted avg       0.69      0.70      0.68       179



# Visualize your tree

In [22]:
# Draw the fitted tree with feature names and class labels on the nodes.
# The trailing semicolon hides the list of annotation objects plot_tree returns.
class_names = ['Died', 'Survived']
plot_tree(
    model,
    feature_names=selected_features,
    class_names=class_names,
    filled=True,
);

# Picking the right parameters...

# Parameter tuning of your Decision Tree using GridSearch or RandomizedSearch

### For assistance on this, look at Steve's TA Tips code in `TA-Tips/random_forest_tuning.ipynb`


1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initialize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Visualize your best tree.
1. Which feature was your most important feature?

```python
tree.DecisionTreeClassifier(
    *,
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,
)
```


[Tips on how to customize / set the parameters in the decision tree.](https://scikit-learn.org/stable/modules/tree.html#tips-on-practical-use)

In [23]:
# 1. Make a dictionary of parameters, each with a list of candidate values,
#    for the decision-tree grid search.
params = {
    # Maximum depth of the tree (None leaves the depth unlimited)
    'max_depth': [None, 5, 10, 15],
    # Minimum samples required to split a node
    'min_samples_split': [2, 5, 10],
    # Function to measure the quality of a split
    'criterion': ['gini', 'entropy'],
}

In [24]:
# 1. Initialize GridSearchCV with a DecisionTreeClassifier, the param grid, and
#    the metric to optimize (one of accuracy, precision, recall, f1, or roc_auc).
# The estimator is seeded so the cross-validated search gives the same result on
# every run; without it, ties between equally good splits are broken randomly.
dt_model = DecisionTreeClassifier(random_state=45)
grid_search_cv = GridSearchCV(estimator=dt_model, param_grid=params, cv=5, scoring='accuracy')

In [28]:
# 1. Run the cross-validated grid search on the training data.
grid_search_cv.fit(X=x_train, y=y_train)

Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5}


In [29]:
# 1. Report the parameter combination that scored best in the grid search.
print("Best Parameters:", grid_search_cv.best_params_)

Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5}


In [30]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores.

# The best tree found by the grid search. Note this rebinds `model`, which the
# feature-importance cell below reads.
model = grid_search_cv.best_estimator_

# Predict class labels for the held-out test set.
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score: %f' % f1)

# AUC needs scores rather than labels: keep column 1 of `predict_proba`,
# the predicted probability of the positive class.
y_pred_proba = model.predict_proba(x_test)[:,1]

# Compute auc score
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)
print(classification_report(y_test, y_pred))

Accuracy Score: 0.726257
Precision Score: 0.696970
Recall Score: 0.370968
F1 Score: 0.484211
AUC Score: 0.734905
              precision    recall  f1-score   support

           0       0.73      0.91      0.81       117
           1       0.70      0.37      0.48        62

    accuracy                           0.73       179
   macro avg       0.71      0.64      0.65       179
weighted avg       0.72      0.73      0.70       179



In [31]:
# 1. Which feature was your most important feature?
# Pair each feature name with its importance and list the largest first.
feature_imp = pd.DataFrame(
    {
        'feature_importance': model.feature_importances_,
        'feature': selected_features,
    }
)
feature_imp = feature_imp.sort_values(by='feature_importance', ascending=False)
feature_imp

Unnamed: 0,feature_importance,feature
2,0.46235,pclass_3
0,0.4527,age
1,0.084949,pclass_2


# Now onto Random Forests...
We're going to do the same thing, but this time with a random forest. Remember... Repetition is the father of learning.

1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initialize your GridSearchCV with a RandomForestClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Which feature was your most important feature?


# Parameters of the Random Forest Classifier

```python
RandomForestClassifier(
    n_estimators=100,
    *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
```

In [32]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each
#    for the random-forest grid search.
params = {
    # Number of trees in the forest
    'n_estimators': [50, 100, 150],
    # Maximum depth of the trees (None leaves the depth unlimited)
    'max_depth': [None, 10, 20],
    # Minimum number of samples required to split a node
    'min_samples_split': [2, 5, 10],
}

In [35]:
# 1. Initialize GridSearchCV with a RandomForestClassifier, the param grid, and
#    the metric to optimize (one of accuracy, precision, recall, f1, or roc_auc).
# A random forest is stochastic (bootstrap samples and random feature subsets),
# so the estimator is seeded to make the search and its scores reproducible.
rf_model = RandomForestClassifier(random_state=45)
grid_search_cv = GridSearchCV(estimator=rf_model, param_grid=params, cv=5, scoring='accuracy')

In [36]:
# 1. Run the cross-validated grid search on the training data.
grid_search_cv.fit(X=x_train, y=y_train)

In [37]:
# 1. Report the parameter combination that scored best in the grid search.
print("Best Parameters:", grid_search_cv.best_params_)




Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}


In [38]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores.

# The best forest found by the grid search. The search above was configured
# with scoring='accuracy', so "best" means highest cross-validated accuracy
# (not f1). Note this rebinds `model`, which the feature-importance cell reads.
model = grid_search_cv.best_estimator_


# Predict class labels for the held-out test set.
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score: %f' % f1)

# AUC needs scores rather than labels: keep column 1 of `predict_proba`,
# the predicted probability of the positive class.
y_pred_proba = model.predict_proba(x_test)[:,1]

# Compute auc score
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)
print(classification_report(y_test, y_pred))

Accuracy Score: 0.726257
Precision Score: 0.644444
Recall Score: 0.467742
F1 Score: 0.542056
AUC Score: 0.758754
              precision    recall  f1-score   support

           0       0.75      0.86      0.80       117
           1       0.64      0.47      0.54        62

    accuracy                           0.73       179
   macro avg       0.70      0.67      0.67       179
weighted avg       0.72      0.73      0.71       179



In [39]:
# 1. Which feature was your most important feature?
# Label each importance with its feature name and list the largest first.
feature_imp = pd.Series(data=model.feature_importances_, index=selected_features)
feature_imp = feature_imp.sort_values(ascending=False)
feature_imp

age         0.697481
pclass_3    0.264909
pclass_2    0.037610
dtype: float64

# Build a random forest using the ny-vs-sf-houses.csv data. 
* Your target variable, aka the column you are trying to predict, aka your `y` variable is `in_sf`. 
* Can you get an accuracy above 88.8889%?
* What was your most important feature?


In [41]:
# Load the NY-vs-SF housing data (this rebinds `df`) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/ny-vs-sf-houses.csv')
df.head(n=5)

Unnamed: 0,in_sf,beds,bath,price,year_built,sqft,price_per_sqft,elevation
0,0,2.0,1.0,999000,1960,1000,999,10
1,0,2.0,2.0,2750000,2006,1418,1939,0
2,0,2.0,2.0,1350000,1900,2150,628,9
3,0,1.0,1.0,629000,1903,500,1258,9
4,0,0.0,1.0,439000,1930,500,878,10


In [55]:
# BUILD, TRAIN, AND EVALUATE A RANDOM FOREST MODEL BELOW.
# NOTE(review): an earlier `df.isnull().sum()` check reportedly found no null
# values in this data set, so no imputation is done here -- confirm if the data changes.

# Feature columns and the target column (`in_sf`).
features = ['year_built','sqft','price_per_sqft','elevation']
target = ['in_sf']
x = df[features]
# `df[target]` is a one-column frame; flatten it to the 1-D array sklearn expects.
y = df[target].values.ravel()

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=45)

params = {
    'n_estimators': [100, 200, 300, 500],          # Number of trees in the forest
    'max_depth': [None, 5, 10, 15,20],             # Maximum depth of the trees
    'min_samples_split': [10, 15, 20, 30]          # Minimum number of samples required to split a node
}

# Seed the forest so the search and the reported scores are reproducible.
model = RandomForestClassifier(random_state=45)
grid_search_cv = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')

# GridSearchCV fits clones of `model` and (with the default refit=True) refits
# the best configuration on the whole training set, so fitting `model` itself
# beforehand is wasted work and is intentionally not done.
grid_search_cv.fit(x_train, y_train)
print("Best Parameters:", grid_search_cv.best_params_)
# Use the best model from grid search
best_model = grid_search_cv.best_estimator_

# Make predictions and evaluate on test data
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best Model Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score: %f' % f1)

# AUC needs scores rather than labels: keep column 1 of `predict_proba`,
# the predicted probability of the positive class.
y_pred_proba = best_model.predict_proba(x_test)[:,1]

# Compute auc score
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)
print(classification_report(y_test, y_pred))


Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
Best Model Accuracy Score: 0.909091
Precision Score: 0.940000
Recall Score: 0.886792
F1 Score: 0.912621
AUC Score: 0.957752
              precision    recall  f1-score   support

           0       0.88      0.93      0.91        46
           1       0.94      0.89      0.91        53

    accuracy                           0.91        99
   macro avg       0.91      0.91      0.91        99
weighted avg       0.91      0.91      0.91        99



# Awesome difficult extra credit below:
Build a classifier using the adult_income.csv data.  
* The target variable is 'class'
* Start with just using these features `selected_features = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']`
* You have to include the pos_label in your precision, recall, and f1 scores. It just tells the classifier which one is the positive label.  I provided the proper way below.

* See if you can get above 50% f1 score.  
* See some [super tricks and tips here](https://www.kaggle.com/code/jieyima/income-classification-model)

In [None]:
# Load the adult-income data (this rebinds `df`) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/adult_income.csv')
df.head(n=5)