In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import xgboost
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('aps_failure_training_set.csv', skiprows=20, na_values="na")

## Cleaning and Transforming

In [3]:
df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,neg,33058,,0.0,,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,neg,41040,,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,neg,12,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,neg,60874,,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [4]:
df.describe()

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
count,60000.0,13671.0,56665.0,45139.0,57500.0,57500.0,59329.0,59329.0,59329.0,59329.0,...,59329.0,59329.0,59329.0,59329.0,59329.0,59329.0,59329.0,59329.0,57276.0,57277.0
mean,59336.5,0.713189,356014300.0,190620.6,6.81913,11.006817,221.6364,975.7223,8606.015,88591.28,...,445489.7,211126.4,445734.3,393946.2,333058.2,346271.4,138730.0,8388.915,0.090579,0.212756
std,145430.1,3.478962,794874900.0,40404410.0,161.543373,209.792592,20478.46,34200.53,150322.0,761731.2,...,1155540.0,543318.8,1168314.0,1121044.0,1069160.0,1728056.0,449510.0,47470.43,4.368855,8.830641
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,834.0,0.0,16.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2936.0,1166.0,2700.0,3584.0,512.0,110.0,0.0,0.0,0.0,0.0
50%,30776.0,0.0,152.0,126.0,0.0,0.0,0.0,0.0,0.0,0.0,...,233796.0,112086.0,221518.0,189988.0,92432.0,41098.0,3812.0,0.0,0.0,0.0
75%,48668.0,0.0,964.0,430.0,0.0,0.0,0.0,0.0,0.0,0.0,...,438396.0,218232.0,466614.0,403222.0,275094.0,167814.0,139724.0,2028.0,0.0,0.0
max,2746564.0,204.0,2130707000.0,8584298000.0,21050.0,20070.0,3376892.0,4109372.0,10552860.0,63402070.0,...,77933930.0,37758390.0,97152380.0,57435240.0,31607810.0,119580100.0,19267400.0,3810078.0,482.0,1146.0


In [5]:
#Columns with highest null counts in dataset
null_counts = df.isnull().sum().sort_values(ascending=False)
null_counts[null_counts > 0].head(10)

br_000    49264
bq_000    48722
bp_000    47740
bo_000    46333
ab_000    46329
cr_000    46329
bn_000    44009
bm_000    39549
bl_000    27277
bk_000    23034
dtype: int64

In [6]:
# Drop columns with more than 80% missing data
missing = df.isnull().mean()
drop_cols = missing[missing >= 0.8].index

df = df.drop(columns=drop_cols)

In [7]:
# Impute on missing values for remaining columns
from sklearn.impute import SimpleImputer
numerical_columns = df.drop(columns=['class']).select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='median')
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

In [8]:
#Convert pos and neg class to 1 and 0
df['class'] = df['class'].map({'pos' : 1, 'neg': 0})

In [9]:
df['class'].value_counts()

class
0    59000
1     1000
Name: count, dtype: int64

In [10]:
df.corr()['class'].sort_values(ascending=False).head()

class     1.000000
ci_000    0.550386
aa_000    0.536978
bt_000    0.534286
bb_000    0.530738
Name: class, dtype: float64

## MODELING SECTION (Logistic Regression)

In [11]:


X = df.drop(columns = ['class'])
y = df['class']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)

In [12]:
#Simple Interpretable Model

#I am weighting class 1 as 50 because it shows up only 2% of the time in our training set
classifier = LogisticRegression(class_weight={0: 1, 1: 50}, max_iter=1000)
classifier.fit(X_train_scaled, y_train)

In [13]:
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_val_scaled)

confusionMatrix = confusion_matrix(y_val, y_pred)

In [14]:
#[TP, FP(COST 1)]
#[FN (COST 2), TN]
#Cost_1 = 10 and cost_2 = 500
#Total_cost = Cost_1*No_Instances + Cost_2*No_Instances

tn, fp, fn, tp = confusionMatrix.ravel()
total_cost = (10 * fp) + (500 * fn)

print("Logistic Regression Confusion Matrix:", confusionMatrix)
print("Total Cost (Validation):", total_cost)
print(classification_report(y_val, y_pred))
print("ROC AUC:", roc_auc_score(y_val, y_pred))

Logistic Regression Confusion Matrix: [[11439   361]
 [   18   182]]
Total Cost (Validation): 12610
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     11800
           1       0.34      0.91      0.49       200

    accuracy                           0.97     12000
   macro avg       0.67      0.94      0.74     12000
weighted avg       0.99      0.97      0.98     12000

ROC AUC: 0.9397033898305086


In [15]:
##Evaluation on the test data
test_df = pd.read_csv("aps_failure_test_set.csv", skiprows=20, na_values='na')

test_df = test_df.drop(columns=drop_cols)

test_df[numerical_columns] = imputer.transform(test_df[numerical_columns])
X_test = test_df.drop(columns=['class'])
y_test = test_df['class'].map({'pos': 1, 'neg': 0})
X_test_scaled = scaler.transform(X_test)

y_test_pred = classifier.predict(X_test_scaled)
confusionMatrix_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusionMatrix_test.ravel()
total_cost_test = (fp * 10) + (fn * 500)
print("Test Confusion Matrix:\n", confusionMatrix_test)
print(f"Test Cost: {total_cost_test}")


Test Confusion Matrix:
 [[15216   409]
 [   35   340]]
Test Cost: 21590


# Using SMOTE library to balance data

In [16]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Original:", y_train.value_counts())
print("Balanced:", pd.Series(y_train_balanced).value_counts())



Original: class
0    47200
1      800
Name: count, dtype: int64
Balanced: class
0    47200
1    47200
Name: count, dtype: int64


In [17]:
clf_smote = LogisticRegression(max_iter=1000)
clf_smote.fit(X_train_balanced, y_train_balanced)

y_val_pred_smote = clf_smote.predict(X_val)
y_val_proba_smote = clf_smote.predict_proba(X_val)[:, 1]

cm_smote = confusion_matrix(y_val, y_val_pred_smote)
tn, fp, fn, tp = cm_smote.ravel()
total_cost_smote = (fp * 10) + (fn * 500)

print("Logistic Regression SMOTE Confusion Matrix:\n", cm_smote)
print("Total Cost (Validation):", total_cost_smote)
print(classification_report(y_val, y_val_pred_smote))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_smote))

Logistic Regression SMOTE Confusion Matrix:
 [[11498   302]
 [   16   184]]
Total Cost (Validation): 11020
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     11800
           1       0.38      0.92      0.54       200

    accuracy                           0.97     12000
   macro avg       0.69      0.95      0.76     12000
weighted avg       0.99      0.97      0.98     12000

ROC AUC: 0.963186652542373


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
y_test_pred_smote = clf_smote.predict(X_test)
y_test_proba_smote = clf_smote.predict_proba(X_test)[:, 1]

cm_test_smote = confusion_matrix(y_test, y_test_pred_smote)
tn, fp, fn, tp = cm_test_smote.ravel()
total_cost_test_smote = (fp * 10) + (fn * 500)

print("Test Confusion Matrix (SMOTE):\n", cm_test_smote)
print("Test Total Cost (SMOTE):", total_cost_test_smote)

Test Confusion Matrix (SMOTE):
 [[15264   361]
 [   35   340]]
Test Total Cost (SMOTE): 21110


## Random Forest (no weighting)

In [19]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_val_pred_rf = rf.predict(X_val)
y_val_proba_rf = rf.predict_proba(X_val)[:, 1]


random_forest_cm = confusion_matrix(y_val, y_val_pred_rf)
tn, fp, fn, tp = random_forest_cm.ravel()

COST_1 = 10
COST_2 = 500
total_cost_rf = (fp * COST_1) + (fn * COST_2)

In [20]:
print("Random Forest Confusion Matrix:", random_forest_cm)
print("Total Cost (Validation):", total_cost_rf)
print(classification_report(y_val, y_val_pred_rf))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_rf))

Random Forest Confusion Matrix: [[11785    15]
 [   58   142]]
Total Cost (Validation): 29150
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11800
           1       0.90      0.71      0.80       200

    accuracy                           0.99     12000
   macro avg       0.95      0.85      0.90     12000
weighted avg       0.99      0.99      0.99     12000

ROC AUC: 0.991927966101695


In [21]:
y_test_pred = rf.predict(X_test)
confusionMatrix_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusionMatrix_test.ravel()
total_cost_test = (fp * 10) + (fn * 500)
print("Test Confusion Matrix:\n", confusionMatrix_test)
print(f"Test Cost: {total_cost_test}")

Test Confusion Matrix:
 [[15608    17]
 [  119   256]]
Test Cost: 59670


## Random Forest (class weighting) 

In [22]:
rf2 = RandomForestClassifier(class_weight={0: 1, 1: 50}, random_state=42)
rf2.fit(X_train, y_train)

y_val_pred_rf2 = rf2.predict(X_val)
y_val_proba_rf2 = rf2.predict_proba(X_val)[:, 1]


random_forest_cm2 = confusion_matrix(y_val, y_val_pred_rf2)
tn, fp, fn, tp = random_forest_cm2.ravel()

COST_1 = 10
COST_2 = 500
total_cost_rf2 = (fp * COST_1) + (fn * COST_2)

In [23]:
print("Random Forest (with weighting) Confusion Matrix:", random_forest_cm2)
print("Total Cost (Validation):", total_cost_rf2)
print(classification_report(y_val, y_val_pred_rf2))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_rf2))

Random Forest (with weighting) Confusion Matrix: [[11785    15]
 [   79   121]]
Total Cost (Validation): 39650
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     11800
           1       0.89      0.60      0.72       200

    accuracy                           0.99     12000
   macro avg       0.94      0.80      0.86     12000
weighted avg       0.99      0.99      0.99     12000

ROC AUC: 0.9907023305084746


In [24]:
y_test_pred = rf2.predict(X_test)
confusionMatrix_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusionMatrix_test.ravel()
total_cost_test = (fp * 10) + (fn * 500)
print("Test Confusion Matrix:\n", confusionMatrix_test)
print(f"Test Cost: {total_cost_test}")

Test Confusion Matrix:
 [[15609    16]
 [  165   210]]
Test Cost: 82660


## Random Forest (SMOTE)

In [25]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)

y_val_pred_rf_smote = rf_smote.predict(X_val)
y_val_proba_rf_smote = rf_smote.predict_proba(X_val)[:, 1]

cm_rf_smote = confusion_matrix(y_val, y_val_pred_rf_smote)
tn, fp, fn, tp = cm_rf_smote.ravel()
cost_rf_smote = (fp * 10) + (fn * 500)

print("Random Forest + SMOTE Confusion Matrix:\n", cm_rf_smote)
print("Total Cost (Validation):", cost_rf_smote)
print(classification_report(y_val, y_val_pred_rf_smote))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_rf_smote))



Random Forest + SMOTE Confusion Matrix:
 [[11715    85]
 [   33   167]]
Total Cost (Validation): 17350
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     11800
           1       0.66      0.83      0.74       200

    accuracy                           0.99     12000
   macro avg       0.83      0.91      0.87     12000
weighted avg       0.99      0.99      0.99     12000

ROC AUC: 0.992845127118644


In [26]:
y_test_pred_rf_smote = rf_smote.predict(X_test)
cm_rf_test_smote = confusion_matrix(y_test, y_test_pred_rf_smote)
tn, fp, fn, tp = cm_rf_test_smote.ravel()
cost_rf_test_smote = (fp * 10) + (fn * 500)
print("Random Forest + SMOTE Test Cost:", cost_rf_test_smote)

Random Forest + SMOTE Test Cost: 32340


# XGBoost

In [27]:
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

y_val_pred_xgb = xgb.predict(X_val)
y_val_proba_xgb = xgb.predict_proba(X_val)[:, 1]

cm_xgb = confusion_matrix(y_val, y_val_pred_xgb)
tn, fp, fn, tp = cm_xgb.ravel()

total_cost_xgb = (fp * COST_1) + (fn * COST_2)

print("XGBoost Confusion Matrix:\n", cm_xgb)
print("Total Cost (Validation):", total_cost_xgb)
print(classification_report(y_val, y_val_pred_xgb))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_xgb))

XGBoost Confusion Matrix:
 [[11779    21]
 [   45   155]]
Total Cost (Validation): 22710
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11800
           1       0.88      0.78      0.82       200

    accuracy                           0.99     12000
   macro avg       0.94      0.89      0.91     12000
weighted avg       0.99      0.99      0.99     12000

ROC AUC: 0.9933161016949152


In [28]:
y_test_pred = xgb.predict(X_test)
confusionMatrix_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusionMatrix_test.ravel()
total_cost_test = (fp * 10) + (fn * 500)
print("Test Confusion Matrix:\n", confusionMatrix_test)
print(f"Test Cost: {total_cost_test}")

Test Confusion Matrix:
 [[15610    15]
 [   80   295]]
Test Cost: 40150


# XGBoost (with weighting)

In [29]:
pos_weight = sum(y_train == 0) / sum(y_train == 1)

xgb2 = XGBClassifier(eval_metric='logloss', scale_pos_weight=pos_weight, random_state=42)
xgb2.fit(X_train, y_train)

y_val_pred_xgb2 = xgb2.predict(X_val)
y_val_proba_xgb2 = xgb2.predict_proba(X_val)[:, 1]

cm_xgb2 = confusion_matrix(y_val, y_val_pred_xgb2)
tn, fp, fn, tp = cm_xgb2.ravel()

total_cost_xgb2 = (fp * COST_1) + (fn * COST_2)

print("XGBoost Confusion (with weighting) Matrix:\n", cm_xgb2)
print("Total Cost (Validation):", total_cost_xgb2)
print(classification_report(y_val, y_val_pred_xgb2))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_xgb2))

XGBoost Confusion (with weighting) Matrix:
 [[11756    44]
 [   35   165]]
Total Cost (Validation): 17940
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11800
           1       0.79      0.82      0.81       200

    accuracy                           0.99     12000
   macro avg       0.89      0.91      0.90     12000
weighted avg       0.99      0.99      0.99     12000

ROC AUC: 0.9927156779661017


In [30]:
y_test_pred = xgb2.predict(X_test)
confusionMatrix_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusionMatrix_test.ravel()
total_cost_test = (fp * 10) + (fn * 500)
print("Test Confusion Matrix:\n", confusionMatrix_test)
print(f"Test Cost: {total_cost_test}")

Test Confusion Matrix:
 [[15579    46]
 [   69   306]]
Test Cost: 34960


# XGBoost (SMOTE)

In [31]:
xgb_smote = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_smote.fit(X_train_smote, y_train_smote)

y_val_pred_xgb_smote = xgb_smote.predict(X_val)
y_val_proba_xgb_smote = xgb_smote.predict_proba(X_val)[:, 1]

cm_xgb_smote = confusion_matrix(y_val, y_val_pred_xgb_smote)
tn, fp, fn, tp = cm_xgb_smote.ravel()
cost_xgb_smote = (fp * 10) + (fn * 500)

print("XGBoost + SMOTE Confusion Matrix:\n", cm_xgb_smote)
print("Total Cost (Validation):", cost_xgb_smote)
print(classification_report(y_val, y_val_pred_xgb_smote))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_xgb_smote))

XGBoost + SMOTE Confusion Matrix:
 [[11747    53]
 [   33   167]]
Total Cost (Validation): 17030
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11800
           1       0.76      0.83      0.80       200

    accuracy                           0.99     12000
   macro avg       0.88      0.92      0.90     12000
weighted avg       0.99      0.99      0.99     12000

ROC AUC: 0.9911021186440678


In [32]:
y_test_pred_xgb_smote = xgb_smote.predict(X_test)
cm_xgb_test_smote = confusion_matrix(y_test, y_test_pred_xgb_smote)
tn, fp, fn, tp = cm_xgb_test_smote.ravel()
cost_xgb_test_smote = (fp * 10) + (fn * 500)
print("XGBoost + SMOTE Test Cost:", cost_xgb_test_smote)

XGBoost + SMOTE Test Cost: 31120
