# ***BANA 6350: Quantitative Methods***

**Tree Models: Bagging, Boosting, Random Forest Classifiers, and XGBoost Models**

University of Dallas, Irving, TX

Credits: "Practical Statistics for Data Scientists" 2nd Edition


## **Mounting Google Drive**

In [None]:
# Mount Google Drive so that files stored there are reachable from this Colab runtime.

from google.colab import drive
# The drive is mounted under /content/gdrive/; force_remount=True asks Colab to mount again
# even if a mount already exists at that path.
drive.mount('/content/gdrive/',force_remount=True)

In [None]:
# In the Colab file browser, right-click the BANA6350 > Data folder, choose "Copy path",
# and paste the result below so that `path` points at the folder holding all the course data.

import os

path = "/content/gdrive/MyDrive/BANA6350/Data"

os.chdir(path)

# After this call the current working directory is `path` (the BANA6350/Data folder),
# so the relative file names passed to pd.read_csv in later cells resolve against it.


In [None]:
# Sanity check: read a file from the current working directory and preview its first rows.

import pandas as pd
pd.read_csv('state.csv').head()

## **Setting up formatting**

In [None]:
import matplotlib.pyplot as plt

# Apply the style sheet FIRST. plt.style.use() overwrites every rcParam the sheet defines,
# and 'fivethirtyeight' ships its own lines.linewidth and font.size, so any override set
# before this call would be silently discarded.
# To see the other available style names, inspect plt.style.available or visit:
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
plt.style.use('fivethirtyeight')

# Notebook-specific overrides, applied on top of the style sheet.
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['figure.figsize'] = [14.0, 6.0]
plt.rcParams['font.size'] = 18

In [None]:
# If your plots (e.g. seaborn charts) are not visible in the cell output, run this cell in Colab.

%matplotlib inline

## **Importing commonly used Python packages:**

In [None]:
!pip install dmba

In [None]:
import math
from pathlib import Path
import pandas as pd
import numpy as np

from scipy import stats
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import multivariate_normal
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import from_levels_and_colors
import seaborn as sns
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score)
from statsmodels.stats import power
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from dmba import plotDecisionTree, textDecisionTree

import plotly.express as px

# Weekly Coding


In [None]:
# Load "CC Fraud.csv" from the current working directory and preview its first rows.
creditcard = pd.read_csv("CC Fraud.csv")
creditcard.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) to check what was loaded.
creditcard.info()

In [None]:
# Predictors: the six columns listed below. Target: the `is_fraud` column.
X = creditcard[['gender', 'category', 'state', 'job', 'amt', 'city_pop']]
y = creditcard['is_fraud']

In [None]:
# Show the distinct values present in the target column.
y.unique()

In [None]:
# One-hot encode the predictors with pandas.get_dummies.
# prefix='' together with prefix_sep='' names each dummy column after the bare category value;
# drop_first=True drops the first level of every encoded column.
# NOTE(review): with no prefix, a value that occurs in two different source columns would
# produce duplicate column names - confirm this cannot happen in this data.
X = pd.get_dummies(X, prefix='', prefix_sep='', drop_first=True)

In [None]:
# Display the encoded feature matrix.
X

In [None]:
# Hold out 30% of the rows as the test set; the rows are shuffled with a fixed random_state.
# NOTE(review): the split is not stratified on y - consider stratify=y if is_fraud turns out
# to be imbalanced (the class balance is not visible in this notebook's code).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True)

In [None]:
# Display the training features.
X_train

In [None]:
# Display the training target.
y_train

In [None]:
# Logistic regression classifier; every setting except random_state is left at its default.
lm = LogisticRegression(random_state=0)

In [None]:
# Fit the logistic regression on the training split
lm.fit(X_train, y_train)

# Mean accuracy on the rows the model was fitted on and on the held-out rows
score_train_lm = lm.score(X_train, y_train)
score_test_lm = lm.score(X_test, y_test)

# Class labels predicted for the held-out rows
y_pred_lm = lm.predict(X_test)

# Report train accuracy, test accuracy, then the raw predictions
print(f"Accuracy Train_lm: {score_train_lm}")
print(f"Accuracy Test_lm: {score_test_lm}")
print("Predictions_lm:", y_pred_lm)

In [None]:
# Overall accuracy of the logistic-regression predictions on the test set
accuracy_lm = accuracy_score(y_test, y_pred_lm)

# Precision and recall, each macro-averaged over the classes
precision_lm, recall_lm = (
    metric(y_test, y_pred_lm, average='macro')
    for metric in (precision_score, recall_score)
)

# Confusion matrix of true vs. predicted labels
cm_lm = confusion_matrix(y_test, y_pred_lm)

# Report the matrix first, then the three summary metrics
print('Confusion Matrix_lm:\n', cm_lm)
print(f'Precision_lm: {precision_lm}')
print(f'Recall_lm: {recall_lm}')
print(f'Accuracy_lm: {accuracy_lm}')

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set
confmatrix = confusion_matrix(y_test, y_pred_lm)

# Tick labels shared by both axes
labels = ['Non-Fraudulent', 'Fraudulent']

# Annotated heatmap with integer counts; the colour scale is pinned to the range 0..3000
plt.figure(figsize=(8, 6))
sns.heatmap(
    confmatrix,
    annot=True,
    fmt='d',
    cmap='magma',
    vmin=0,
    vmax=3000,
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix - Logistic Regression',
)
plt.show()

In [None]:
# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the random forest model: 100 trees, with a fixed random_state for reproducibility.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the training split. As the last expression of the cell, the fitted estimator
# is also shown as the cell output.
rfc.fit(X_train, y_train)

In [None]:
# Predict the labels of the test set
y_pred_rfc = rfc.predict(X_test)

# Score each split exactly once. The test accuracy is reused for both printed lines below
# instead of running a second, identical pass of the forest over X_test.
score_test_rfc = rfc.score(X_test, y_test)
score_train_rfc = rfc.score(X_train, y_train)

# Evaluate the model (same lines, in the same order, as before)
print('Accuracy_rfc:', score_test_rfc)
print("Accuracy Train_rfc:", score_train_rfc)
print("Accuracy Test_rfc:", score_test_rfc)
print("Predictions_rfc:", y_pred_rfc)

In [None]:
# Overall accuracy of the random-forest predictions on the test set
accuracy_rfc = accuracy_score(y_test, y_pred_rfc)

# Precision and recall, each macro-averaged over the classes
precision_rfc, recall_rfc = (
    metric(y_test, y_pred_rfc, average='macro')
    for metric in (precision_score, recall_score)
)

In [None]:
# Report the random forest's test-set metrics. The labels carry the _rfc suffix so this
# output is not mistaken for the logistic-regression (_lm) results printed earlier.
print('Precision_rfc:', precision_rfc)
print('Recall_rfc:', recall_rfc)
print('Accuracy_rfc:', accuracy_rfc)

In [None]:
# Confusion matrix of the random-forest predictions on the test set
conf_matrix = confusion_matrix(y_test, y_pred_rfc)

# Define labels for the confusion matrix
# (assumes is_fraud is coded so that the non-fraud class sorts first - TODO confirm)
labels = ['Non-Fraudulent', 'Fraudulent']

# Plot the confusion matrix as a heatmap
# NOTE(review): vmin is 100 here while the logistic-regression heatmap uses vmin=0, so the
# colour scales of the two charts are not directly comparable - confirm this is intended.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='magma', xticklabels=labels, yticklabels=labels, vmin = 100, vmax = 3000)
plt.xlabel('Predicted')
plt.ylabel('Actual')
# Title spelling fixed ("Forrest" -> "Forest") to match the feature-importance chart title.
plt.title('Confusion Matrix - Random Forest')
plt.show()

In [None]:
# Rank the encoded features by the random forest's importance scores, highest first
feature_importances = (
    pd.Series(data=rfc.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Keep and display the ten highest-ranked features
top_10 = feature_importances.head(n=10)
top_10

In [None]:
# Bar chart of the ten highest random-forest feature importances
plt.figure()
top_10.plot.bar()
plt.gca().set(
    title="Random Forest Feature Importance",
    xlabel="Features",
    ylabel="Importance",
)
plt.show()


In [None]:
# XGBoost

In [None]:
from xgboost import XGBClassifier

# Build the XGBoost model (binary logistic objective, fixed random_state) and fit it
# on the training split
xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict the labels of the test set
y_pred_xgb = xgb_model.predict(X_test)

# Score each split exactly once. The test accuracy is reused for both printed lines below
# instead of scoring X_test a second time.
score_test_xgb = xgb_model.score(X_test, y_test)
score_train_xgb = xgb_model.score(X_train, y_train)

# Evaluate the model (same lines, in the same order, as before)
print('Accuracy_xgb:', score_test_xgb)
print("Accuracy Train_xgb:", score_train_xgb)
print("Accuracy Test_xgb:", score_test_xgb)
print("Predictions_xgb:", y_pred_xgb)

In [None]:
# Overall accuracy of the XGBoost predictions on the test set
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Precision and recall, each macro-averaged over the classes
precision_xgb, recall_xgb = (
    metric(y_test, y_pred_xgb, average='macro')
    for metric in (precision_score, recall_score)
)

In [None]:
# Report the XGBoost model's test-set metrics. The labels carry the _xgb suffix so this
# output is not mistaken for the logistic-regression (_lm) results printed earlier.
print('Precision_xgb:', precision_xgb)
print('Recall_xgb:', recall_xgb)
print('Accuracy_xgb:', accuracy_xgb)

In [None]:
# Confusion matrix of the XGBoost predictions on the test set
conf_matrix = confusion_matrix(y_test, y_pred_xgb)

# Tick labels shared by both axes
labels = ['Non-Fraudulent', 'Fraudulent']

# Annotated heatmap with integer counts; the colour scale is pinned to the range 100..3000
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='magma',
    vmin=100,
    vmax=3000,
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix - XGB',
)
plt.show()

In [None]:
# Rank the encoded features by the XGBoost model's importance scores, highest first
feature_importance = (
    pd.Series(data=xgb_model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Keep and display the ten highest-ranked features
top10 = feature_importance.head(n=10)
top10

In [None]:
# Bar chart of the ten highest XGBoost feature importances.
# The title previously read "Random Forest Feature Importance - XGB", which mislabelled
# this chart: `top10` is derived from xgb_model, not from the random forest.
plt.figure()
top10.plot(kind='bar')
plt.title("XGBoost Feature Importance")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.show()