# Random Forests Classifier

Will use previously used loan data here for the model with comparing old SVC and Logisitic Regression Model

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics
import math

In [9]:
df = pd.read_csv("loan_data.csv")

# cleanup and category conversion

df.person_gender =[1 if value == "male" else 0 for value in df.person_gender]
# ordinal categories
category_mapper = {'Doctorate': 4, 'Master': 3, 'Bachelor': 2, 'Associate': 1, 'High School': 0}
df['person_education'] = df['person_education'].map(category_mapper)

# convert binary categories
df.previous_loan_defaults_on_file =[1 if value == "Yes" else 0 for value in df.previous_loan_defaults_on_file]

# One-hot encode nominal variables
from sklearn.preprocessing import OneHotEncoder

variables = ['person_home_ownership', 'loan_intent']

# use encoder
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)
df = pd.concat([df,one_hot_encoded],axis=1).drop(columns=variables)

df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,...,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,22.0,0,3,71948.0,0,35000.0,16.02,0.49,3.0,561,...,0,0,0,1,0,0,0,0,1,0
1,21.0,0,0,12282.0,0,1000.0,11.14,0.08,2.0,504,...,0,0,1,0,0,1,0,0,0,0
2,25.0,0,0,12438.0,3,5500.0,12.87,0.44,3.0,635,...,1,0,0,0,0,0,0,1,0,0
3,23.0,0,2,79753.0,0,35000.0,15.23,0.44,2.0,675,...,0,0,0,1,0,0,0,1,0,0
4,24.0,1,3,66135.0,1,35000.0,14.27,0.53,4.0,586,...,0,0,0,1,0,0,0,1,0,0


In [17]:
# logistic regression model

# Prepare the model
y = df["loan_status"] # our target variable
X = df.drop(["loan_status"], axis=1) # our predictors

from sklearn.preprocessing import StandardScaler

# Create a scaler object
scaler = StandardScaler()

# Fit the scaler to the data and transform the data
X_scaled = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)

from sklearn.linear_model import LogisticRegression

# Create logistic regression model
lr = LogisticRegression()

# Train the model on the training data
lr.fit(X_train, y_train)

# Predict the target variable on the test data
y_pred = lr.predict(X_test)

# print the classification report based on true values and predictions
print(classification_report(y_test, y_pred))

# get overall accuracy of the model and print it
acc = accuracy_score(y_test, y_pred)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")

              precision    recall  f1-score   support

           0       0.93      0.94      0.93     10493
           1       0.77      0.75      0.76      3007

    accuracy                           0.89     13500
   macro avg       0.85      0.84      0.85     13500
weighted avg       0.89      0.89      0.89     13500


Model overall accuracy: 89.43%




In [16]:
# SVC Model
model = make_pipeline(StandardScaler(), svm.SVC(probability=True))
model.fit(X_train, y_train)
# Get test predictions for evaluation metrics
predictions = model.predict(X_test)

# Classification metrics
# print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = accuracy_score(y_test, predictions)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")


              precision    recall  f1-score   support

           0       0.93      0.96      0.95     10493
           1       0.85      0.76      0.80      3007

    accuracy                           0.91     13500
   macro avg       0.89      0.86      0.87     13500
weighted avg       0.91      0.91      0.91     13500


Model overall accuracy: 91.50%




# Random Forest Classifier

In [18]:
# classifier
from sklearn.ensemble import RandomForestClassifier

In [19]:
# classification
model = make_pipeline(StandardScaler(), RandomForestClassifier())
model.fit(X_train, y_train)

In [20]:
# Get test predictions for evaluation metrics
predictions = model.predict(X_test)

# Classification metrics
# print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = accuracy_score(y_test, predictions)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")

              precision    recall  f1-score   support

           0       0.94      0.97      0.95     10493
           1       0.89      0.77      0.82      3007

    accuracy                           0.93     13500
   macro avg       0.91      0.87      0.89     13500
weighted avg       0.93      0.93      0.93     13500


Model overall accuracy: 92.70%




In [21]:
# trying with different parameters

from sklearn.model_selection import GridSearchCV

#using minimal paraameters to search

param_grid = {'n_estimators': [100, 200], 'class_weight': [None, 'balanced'], 'ccp_alpha': [0.0, 0.01]}

grid = GridSearchCV( RandomForestClassifier(random_state=42),param_grid,scoring='accuracy',verbose=2,n_jobs=-1)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=100; total time=   5.1s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=100; total time=   5.2s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=100; total time=   5.3s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=100; total time=   5.4s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=100; total time=   5.5s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=200; total time=   9.9s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=200; total time=  10.0s
[CV] END .ccp_alpha=0.0, class_weight=None, n_estimators=200; total time=  10.1s
[CV] END ccp_alpha=0.0, class_weight=balanced, n_estimators=100; total time=   4.7s
[CV] END ccp_alpha=0.0, class_weight=balanced, n_estimators=100; total time=   4.7s
[CV] END ccp_alpha=0.0, class_weight=balanced, n_estimators=100; total time=   4.7s
[CV] END .ccp_alpha=0.0, class_weight=No

In [22]:
print(grid.best_params_)

{'ccp_alpha': 0.0, 'class_weight': None, 'n_estimators': 200}


In [23]:
# classification
model = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=200,class_weight=None,ccp_alpha=0.0,random_state=42))
model.fit(X_train, y_train)

In [24]:
# Get test predictions for evaluation metrics
predictions = model.predict(X_test)

# Classification metrics
# print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = accuracy_score(y_test, predictions)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")

              precision    recall  f1-score   support

           0       0.94      0.97      0.95     10493
           1       0.89      0.77      0.83      3007

    accuracy                           0.93     13500
   macro avg       0.91      0.87      0.89     13500
weighted avg       0.93      0.93      0.93     13500


Model overall accuracy: 92.80%




Got slightly better model and I think GridSearchCV doesn't make that much of difference here

# Model Comparison:
https://tabletomarkdown.com/convert-spreadsheet-to-markdown/

| Model               | Accuracy | Precision (Class 1) | Recall (Class 1) | F1-Score (Class 1) |
| ------------------- | -------- | ------------------- | ---------------- | ------------------ |
| Logistic Regression | 91.50%   | 0.85                | 0.76             | 0.80               |
| SVC (RBF Kernel)    | 92.70%   | 0.89                | 0.77             | 0.82               |
| Random Forest       | 92.80%   | 0.89                | 0.77             | 0.83               |


# Analysis:

Random Forest had the highest overall accuracy (92.8%) and best F1-score for Class 1, making it the best performer overall. SVC came very close here, especially in precision and balanced class performance and it was slightly better than Logistic Regression. Logistic Regression, while fast and interpretable, had lower recall for class 1 here.


# Conclusion:

Random Forest is the most balanced and accurate on this dataset, maybe because it handles non-linear patterns and interactions well.


# Advanced task


## Different Classification Dataset

Will use similar dataset that I've used earlier like from breast_cancer cancer dataset from sklearn.

https://how.dev/answers/what-is-sklearndatasetsloadbreastcancer-in-python

In [25]:
from sklearn.datasets import load_breast_cancer

# getting the data and load dataset
cancer = load_breast_cancer()
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [26]:
cancer_df = pd.DataFrame(cancer.data, columns = cancer.feature_names)
cancer_df['target'] = cancer.target
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [27]:
# checking the features and target
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [28]:
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [29]:
cancer_df.target.value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [30]:
# train-test split

X = cancer_df.drop('target', axis=1)
y = cancer_df.target

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [31]:
# classification
cancer_model = make_pipeline(StandardScaler(), RandomForestClassifier())
cancer_model.fit(X_train, y_train)

In [32]:
# Get test predictions for evaluation metrics
predictions = cancer_model.predict(X_test)

# Classification metrics
# print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = accuracy_score(y_test, predictions)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")

              precision    recall  f1-score   support

           0       0.96      0.94      0.95        54
           1       0.97      0.98      0.97        89

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.97      0.97      0.96       143


Model overall accuracy: 96.50%




### Conclusion:

Appliyed Random Forest to the breast cancer dataset to classify tumors as malignant or benign. This dataset has numeric features like texture, area, and smoothness of cell nuclei. The model achieved high accuracy 96.50% , showing Random Forest is effective model in classification tasks. 

Compared to the loan dataset, this one was cleaner and more balanced, which helped the model generalize well without tuning more.

## Random Forest Regressor

Will use here insurance_dataset for the regression model that used in exercise 1

In [35]:
# load the data and cleaning process with categories convert and all
csv_path = "insurance.csv"
df_2 = pd.read_csv(csv_path)

# Using Dictionary Mapping for the smoker and sex columns
df_2['smoker'] = df_2['smoker'].map({"yes":1, "no":0})
df_2['sex'] = df_2['sex'].map({"male":1, "female":0})

# importing OneHotEncoder for coverting the categorical values for regions.
from sklearn.preprocessing import OneHotEncoder
variables = ['region']

# use encoder
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encodded= encoder.fit_transform(df_2[variables]).astype(int)
df_2= pd.concat([df_2, one_hot_encodded], axis=1).drop(columns=variables)

In [36]:
# First we do split between X and y.
X = df_2.drop("charges", axis=1)

# our target variable is y
y = df_2["charges"]

In [37]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [44]:
# Linear regression model.

from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
print('Mean Absolute Error:',metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error:',metrics.mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:',math.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R-Squared:', metrics.r2_score(y_test, predictions))

Mean Absolute Error: 4145.4505556275935
Mean Squared Error: 33780509.57479168
Root Mean Squared Error: 5812.100272258874
R-Squared: 0.7696118054369008


In [45]:
# SVR Model

# Feature scaling is essential for SVR!
sc_X = StandardScaler()
sc_y = StandardScaler()

X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.transform(X_test)

y_train_scaled = sc_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = sc_y.fit_transform(y_test.values.reshape(-1, 1)).ravel()

# connect standard scaler for X-values
model = make_pipeline(StandardScaler(), svm.SVR())
model.fit(X_train_scaled, y_train_scaled)

predictions_scaled = model.predict(X_test_scaled)

# inverse transform predictions and true y as we predicted the model on scaled values
predictions = sc_y.inverse_transform(predictions_scaled.reshape(-1, 1))

print('Mean Absolute Error:',metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error:',metrics.mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:',math.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R-Squared:', metrics.r2_score(y_test, predictions))

Mean Absolute Error: 2151.9289815423995
Mean Squared Error: 21018755.940760996
Root Mean Squared Error: 4584.621679131332
R-Squared: 0.8566488991993187


### Random Forest Regressor Model Fit

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [63]:
regressor = make_pipeline(StandardScaler(), RandomForestRegressor())

In [64]:
regressor.fit(X_train, y_train)

In [65]:
predictions = regressor.predict(X_test)

In [66]:
print('Mean Absolute Error:',metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error:',metrics.mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:',math.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R-Squared:', metrics.r2_score(y_test, predictions))

Mean Absolute Error: 2562.3505215686573
Mean Squared Error: 21075179.940025207
Root Mean Squared Error: 4590.7711705142965
R-Squared: 0.856264078973569


### Regression Model Comparison:

| Model                  | RMSE   | R² Score  |
|------------------------|--------|---------- |
| Linear Regression      | 5812.10 | 0.77     |
| SVR Regressor          | 4584.62 | 0.87     |
| Random Forest Regressor| 4590.77 | 0.86     |

SVR Regressor had the best R² score (0.87), capturing the most variance in the dataset. Random Forest Regresser may be failed here as compared to SVR Regressor due to outliers or maybe the hypertune parameters used here as default which maybe causing this to not get the best score here.