# ***BANA 6350: Quantitative Methods***

**Tree Models: Bagging, Boosting, Random Forest Classifiers, and XGBoost Models**

University of Dallas, Irving, TX

Credits: "Practical Statistics for Data Scientists" 2nd Edition


## **Mounting Google Drive**

In [None]:
# Mount Google Drive at /content/gdrive/ so files stored in Drive can be read
# from this Colab session. force_remount=True is passed so that re-running the
# cell remounts the drive.

from google.colab import drive
drive.mount('/content/gdrive/',force_remount=True)

In [None]:
# In the Colab file browser, right-click the BANA6350 > Data folder and choose
# "Copy path", then paste that path into `path` below. This links the notebook
# to the folder where all of the data files reside.

import os

path = "/content/gdrive/MyDrive/BANA6350/Data"

os.chdir(path)

# The code above changes the current working directory to `path` (the
# BANA6350/Data folder), so later cells can open files by bare file name.


In [None]:
# Sanity check: open a file from the current working directory and show its
# first rows. The result is only displayed, not stored in a variable.

import pandas as pd
pd.read_csv('state.csv').head()

## **Setting up formatting**

In [None]:
import matplotlib.pyplot as plt

# Apply the style sheet FIRST. plt.style.use() overwrites every rcParam that
# the sheet defines, and the FiveThirtyEight sheet defines its own line width
# and font size, so overrides assigned before this call would be silently
# replaced by the sheet's values.
# Other style names are listed at
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
# (or evaluate `plt.style.available` as the last line of a cell to list them).
plt.style.use('fivethirtyeight')

# Notebook-wide overrides, layered on top of the style sheet.
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['figure.figsize'] = [14.0, 6.0]
plt.rcParams['font.size'] = 18

In [None]:
# If your matplotlib/seaborn plots are not showing up below the cells, run this
# line in Colab to have figures rendered inline in the notebook.

%matplotlib inline

## **Importing commonly used Python packages:**

In [None]:
!pip install dmba

In [None]:
import math
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import preprocessing
import matplotlib.pyplot as plt


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from dmba import plotDecisionTree, textDecisionTree
#import  pydotplus
from graphviz import Digraph
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Weekly Coding


In [None]:
# Load adult.csv from the current working directory and display the frame.
df = pd.read_csv('adult.csv')
df

In [None]:
# Distinct values of the salary column (used as the outcome further down).
df['salary'].unique()

In [None]:
# Display the frame again; nothing has modified it since it was loaded.
df

In [None]:
# Columns used as model inputs, followed by the column to predict.
predictors = [
    'age',
    'workclass',
    'census_weight',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
]
outcome = 'salary'

In [None]:
# Echo the name of the outcome column.
outcome

In [None]:
# Feature matrix: every row, restricted to the predictor columns.
X = df.loc[:, predictors]
X

In [None]:
# Target vector: the outcome column with its original labels.
y = df.loc[:, outcome]
y

In [None]:
# One-hot encode the categorical predictors, dropping the first level of each.
# The default "<column>_<level>" naming is kept on purpose: X holds several
# categorical columns, so stripping the prefix (prefix='', prefix_sep='')
# would leave dummy names that no longer say which column they came from and
# that can collide when two columns share a level name. This also matches the
# encoding used for the XGBoost model later in the notebook.
X = pd.get_dummies(X, drop_first=True)
X

In [None]:
# Summarize the encoded feature matrix (columns, dtypes, non-null counts).
X.info()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline random forest with scikit-learn's default hyperparameters.
# random_state pins the bootstrap samples and the per-split feature subsets,
# so the accuracies and feature importances reported below are reproducible
# on a re-run; 42 is the same seed the train/test split uses.
rf_default = RandomForestClassifier(random_state=42)
rf_default.fit(X_train, y_train)

In [None]:
# Print the hyperparameters the fitted forest is using.
print(rf_default.get_params())

In [None]:
# Random-forest predictions for the training rows and the held-out rows.
y_fitted, y_test_pred = (
    rf_default.predict(X_train),
    rf_default.predict(X_test),
)

In [None]:
from sklearn.metrics import accuracy_score

# Share of correct random-forest predictions on the training rows vs. the
# held-out rows.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_fitted)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Summarize the raw frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Three-column numeric view of the data.
# NOTE(review): X is reassigned from the full predictor list a few cells
# below before any model is fit on this selection.
X = df.loc[:, ['age', 'census_weight', 'hours_per_week']]
X

In [None]:
# Target column with its original labels.
y = df.loc[:, 'salary']
y

In [None]:
# Predictor and outcome columns for the XGBoost model (the same lists that
# were used for the random forest).
predictors = [
    'age',
    'workclass',
    'census_weight',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
]
outcome = 'salary'

In [None]:
# Rebuild the feature matrix from the raw frame: all rows, predictor columns.
X = df.loc[:, predictors]
X

In [None]:
# Outcome column with its original labels; the next cell replaces y with a
# 0/1 array.
y = df.loc[:, outcome]

In [None]:

# Binary target for XGBoost: 1 where the salary label, with surrounding
# whitespace removed, equals '>50K', and 0 for every other row.
is_over_50k = df['salary'].str.strip().eq('>50K')
y = np.where(is_over_50k, 1, 0)

y

In [None]:
# One-hot encode the categorical predictors, dropping the first level of each.
X = pd.get_dummies(data=X, drop_first=True)

In [None]:
# Same 70/30 split and seed as before, now on the re-encoded X and the 0/1 y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
# Gradient-boosted trees with default hyperparameters, scored by log loss.
# `use_label_encoder` is intentionally not passed: it is obsolete, current
# XGBoost releases ignore it and emit an "unused parameter" warning, and y is
# already a 0/1 integer array so no label encoding is needed.
# random_state fixes the seed so a re-run reproduces the same model.
xgb_classifier = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_classifier.fit(X_train, y_train)

In [None]:
# XGBoost predictions for the training rows and the held-out rows.
y_fitted, y_test_pred = (
    xgb_classifier.predict(X_train),
    xgb_classifier.predict(X_test),
)

In [None]:
# Share of correct XGBoost predictions on the training rows vs. the held-out
# rows.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_fitted)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Rank the random forest's features by importance.
# The index comes from the column names the forest itself recorded when it
# was fit (feature_names_in_), not from the current X: X has been rebuilt and
# re-encoded since rf_default was trained, so using X.columns would pair the
# importances with a different matrix and stay aligned only by coincidence.
rf_feature_importance = pd.Series(
    rf_default.feature_importances_,
    index=rf_default.feature_names_in_,
).sort_values(ascending=False)
print("Random Forest Feature Importance:")
print(rf_feature_importance)

In [None]:
# Bar chart of the random-forest importances, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
rf_feature_importance.plot(kind='bar', ax=ax)
ax.set_title("Random Forest Feature Importance")
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
plt.show()

In [None]:
# Rank the XGBoost features by importance, labelled with the columns of the
# matrix the booster was trained on.
xgb_feature_importance = pd.Series(
    data=xgb_classifier.feature_importances_,
    index=X.columns,
)
xgb_feature_importance = xgb_feature_importance.sort_values(ascending=False)
print("\nXGBoost Feature Importance:")
print(xgb_feature_importance)

In [None]:
# Bar chart of the XGBoost importances, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
xgb_feature_importance.plot(kind='bar', ax=ax)
ax.set_title("XGBoost Feature Importance")
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
plt.show()