In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# The workbook is read (and printed) by the "#Read the data" cell that follows;
# reading it here as well only repeated the same Excel load, so this cell is
# kept import-only.

In [None]:
#Read the data
# Load the Excel workbook into a DataFrame and echo it for a quick look.
excel_path = 'Final_ACL_Anon_2022-11-09.xlsx'
df = pd.read_excel(excel_path)
print(df)

In [None]:
#Plot correlation among features
# Correlate numeric/bool columns only. Since pandas 2.0, DataFrame.corr() no
# longer silently skips non-numeric columns (numeric_only defaults to False) and
# raises on them instead; selecting the dtypes explicitly works on old and new
# pandas alike.
# NOTE(review): assumes df still carries non-numeric columns at this point (the
# later iloc slice is commented as "excluding non-numeric features") - confirm.
cor = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(50,50))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
#Cleaning the data

def clean_dataset(df):
    """Return the rows of ``df`` that contain no NaN or infinite values, as float64.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is mutated: rows containing NaN are dropped from
        ``df`` itself (``dropna(inplace=True)``). Pass a copy if the original
        must stay intact.

    Returns
    -------
    pd.DataFrame
        A new frame holding only the rows without NaN / +inf / -inf, with every
        column cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # `axis` must be given by keyword: pandas 2.0 made the arguments of
    # DataFrame.any keyword-only, so the positional form `.any(1)` raises
    # TypeError there.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)


In [None]:
# Columns 2..43 only (excluding non-numeric features).
# Two fixes compared with calling clean_dataset(data) and ignoring the result:
#   * the cleaned frame is the function's RETURN value - the inf filtering and the
#     float64 cast never reached `data` when the result was discarded;
#   * .copy() hands clean_dataset an independent frame, so its in-place dropna
#     does not operate on a slice of `df`.
data = clean_dataset(df.iloc[:, 2:44].copy())
print(data)

In [None]:
#Print any missing records
# Outer-join the full frame with the cleaned one on their shared columns; the
# indicator column `_merge` marks rows that are present only in `df`.
merged = df.merge(data, how='outer', indicator=True)
missing = merged[merged['_merge'] == 'left_only']

print(missing)

In [None]:
#Drop "System_Mass"
# Rebind instead of dropping in place, and tolerate the column already being gone:
# the in-place drop raised KeyError as soon as this cell was executed a second time.
df = df.drop(columns=['System_Mass'], errors='ignore')

In [None]:
#Initializing feature matrix and target label
# NOTE(review): `X` is consumed by train_test_split further down but was never
# assigned anywhere in the notebook, so a fresh-kernel run stopped with NameError.
# It is built here from the numeric `data` frame without the target and without
# the dropped 'System_Mass' column - confirm this is the intended feature set.
X = data.drop(columns=['Status', 'System_Mass'], errors='ignore')
# Take the labels for exactly the rows kept in X: `data` has lost its NaN rows,
# and train_test_split requires X and y to have the same length.
y = df.loc[X.index, 'Status']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn import metrics

# 70% of the rows go to training, the remainder to testing; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)
X_train.shape, X_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline forest: 100 trees of depth <= 5, out-of-bag scoring enabled.
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)

classifier_rf.fit(X_train, y_train)
y_pred=classifier_rf.predict(X_test)
# Predict with the classifier fitted above; the name `model` used here before
# is not defined anywhere in the notebook and raised NameError.
y_train_hat = classifier_rf.predict(X_train)


In [None]:
# Print each report under its heading and a divider line.
divider = '-------------------------------------------------------'
for heading, build_report, y_true, y_hat in (
    ('Train performance', classification_report, y_train, y_train_hat),
    ('Test performance', classification_report, y_test, y_pred),
    ('Confusion matrix', confusion_matrix, y_test, y_pred),
):
    print(heading)
    print(divider)
    print(build_report(y_true, y_hat))

In [None]:
# Model accuracy: fraction of test rows whose predicted label equals the true label.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [None]:
#Hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(n_jobs=-1, random_state=42)
# Every combination of these values is evaluated with 4-fold cross-validation,
# scored by accuracy.
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100, 200],
    n_estimators=[10, 25, 30, 50, 100, 200],
)
grid_search = GridSearchCV(
    rf,
    params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

In [None]:
#Printing the best cross-validated score and the estimator that achieved it
rf_best = grid_search.best_estimator_
print(grid_search.best_score_)
print(rf_best)

In [None]:
#Classifier after hyper-parameter tuning
# Take the hyper-parameters straight from the grid search instead of hand-copying
# them from its printed output, so this cell cannot drift from the search result
# when the data or the parameter grid changes.
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1,
                                       **grid_search.best_params_)
classifier_rf.fit(X_train, y_train)
y_pred=classifier_rf.predict(X_test)
# Refresh the train predictions too: without this the "Train performance" report
# printed after this cell still describes the earlier max_depth=5 classifier.
y_train_hat = classifier_rf.predict(X_train)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Each section is emitted as heading / divider / report, one item per line.
divider = '-------------------------------------------------------'

print('Train performance', divider, classification_report(y_train, y_train_hat), sep='\n')

print('Test performance', divider, classification_report(y_test, y_pred), sep='\n')

print('Confusion matrix', divider, confusion_matrix(y_test, y_pred), sep='\n')

In [None]:
#Feature importances
# Raw importance vector first, then the same values paired with their column
# names and listed from most to least important.
importances = rf_best.feature_importances_
print(importances)
imp_df = pd.DataFrame({"Varname": X_train.columns, "Imp": importances})
ranked = imp_df.sort_values(by="Imp", ascending=False)
print(ranked)

In [None]:
#Plotting feature importance

from matplotlib import pyplot as plt
# Horizontal bars ordered by ascending importance (argsort), so the largest bar
# is drawn last.
importances = rf_best.feature_importances_
sorted_idx = importances.argsort()
plt.figure(figsize=(10, 10))
plt.barh(X_train.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")