In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report 

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Features and target are read from two separate CSV files.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/the-broken-machine/xtrain.csv',
        '/kaggle/input/the-broken-machine/ytrain.csv',
    )
)

In [None]:
# Keep only the first rows of both frames so the rest of the notebook runs on a smaller sample.
SUBSET_ROWS = 10000
X, y = X.iloc[:SUBSET_ROWS], y.iloc[:SUBSET_ROWS]

In [None]:
# Work on a copy so the loaded feature frame is left untouched.
X_copy = X.copy()

# Map the numeric target (0 / 1) to readable class names.
# `y['x'].replace(..., inplace=True)` is a chained in-place update: under
# pandas Copy-on-Write it modifies a temporary Series and leaves `y`
# unchanged. Building a new frame with `assign` always takes effect, and
# re-running the cell is harmless because no 0/1 values remain to replace.
y = y.assign(x=y['x'].replace({0: "OK", 1: "FAIL"}))

In [None]:
# Fill gaps with the next valid observation, then forward-fill whatever is
# still missing (e.g. trailing NaNs with no later value).
# `fillna(method=...)` is deprecated in recent pandas; `.bfill()` / `.ffill()`
# are the direct equivalents and produce the same result.
X_copy = X_copy.bfill().ffill()
X_copy.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_copy,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

## PCA

In [None]:
n_components = 10

# Fit the scaler on the training split only and reuse that same fitted scaler
# for the test split. Previously each split was standardised with its own
# freshly fit scaler, so the test statistics leaked into preprocessing and the
# two splits were not on a common scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit PCA on the *scaled* training data, i.e. the same representation it is
# later asked to transform. Previously it was fit on the raw features and then
# applied to standardised ones. The seed keeps the projection reproducible if
# a randomized SVD solver is selected.
pca = PCA(n_components=n_components, random_state=1)
pca.fit(X_train_scaled)

pc_columns = ["PC" + str(i) for i in range(1, n_components + 1)]
X_train_reduced = pd.DataFrame(pca.transform(X_train_scaled), columns=pc_columns)
X_test_reduced = pd.DataFrame(pca.transform(X_test_scaled), columns=pc_columns)

In [None]:
def evaluate_model(model, X_test, y_test, labels=("OK", "FAIL")):
    """Print accuracy, plot a confusion matrix and print a classification report.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_test
        Features passed to ``model.score`` and ``model.predict``.
    y_test
        True labels corresponding to ``X_test``.
    labels
        Class labels, in the order used for the confusion-matrix rows/columns,
        the heatmap tick labels and the classification report. Defaults to
        ``("OK", "FAIL")``, which matches the previous hard-coded behaviour.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    labels = list(labels)

    acc = model.score(X_test, y_test)
    print("Acc: {:.2f}%".format(acc*100))

    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    clr = classification_report(y_test, y_pred, labels=labels)

    # One tick per class at 0.5, 1.5, ... so the labels sit in the middle of
    # each heatmap cell (same positions as the former hard-coded [0.5, 1.5]).
    tick_positions = [i + 0.5 for i in range(len(labels))]

    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
    plt.xticks(ticks=tick_positions, labels=labels)
    plt.yticks(ticks=tick_positions, labels=labels)
    plt.xlabel("Predict")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    print(model,"\n", clr)

In [None]:
# Number of training rows per class, before any resampling.
train_class_counts = y_train['x'].value_counts()
train_class_counts

In [None]:
# Baseline: train both classifiers on the PCA-reduced training data as-is.
# NOTE(review): recent xgboost releases may require integer-encoded class
# labels — confirm the installed version accepts the string labels used here.
lr = LogisticRegression(max_iter=10000)
xgb = XGBClassifier(gamma=0, eval_metric='error', n_estimators=100)

# Flatten the single-column label frames to 1-D arrays once, then reuse them.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

for clf in (lr, xgb):
    clf.fit(X_train_reduced, y_train_flat)
    evaluate_model(clf, X_test_reduced, y_test_flat)

## OverSampling

In [None]:
# Resample only the training split; the test split is left as it is.
oversampler = RandomOverSampler(random_state=1)
X_train_os, y_train_os = oversampler.fit_resample(
    X_train_reduced,
    y_train,
)

In [None]:
# Number of training rows per class after oversampling.
oversampled_class_counts = y_train_os['x'].value_counts()
oversampled_class_counts

In [None]:
# Refit both classifiers on the oversampled training set; evaluation still
# uses the test split, which was not resampled.
y_train_os_flat = y_train_os.values.ravel()
y_test_flat = y_test.values.ravel()

for clf in (lr, xgb):
    clf.fit(X_train_os, y_train_os_flat)
    evaluate_model(clf, X_test_reduced, y_test_flat)

## UnderSampling

In [None]:
# Resample only the training split; the test split is left as it is.
undersampler = RandomUnderSampler(random_state=1)
X_train_us, y_train_us = undersampler.fit_resample(
    X_train_reduced,
    y_train,
)

In [None]:
# Number of training rows per class after undersampling.
undersampled_class_counts = y_train_us['x'].value_counts()
undersampled_class_counts

In [None]:
# Refit both classifiers on the undersampled training set; evaluation still
# uses the test split, which was not resampled.
y_train_us_flat = y_train_us.values.ravel()
y_test_flat = y_test.values.ravel()

for clf in (lr, xgb):
    clf.fit(X_train_us, y_train_us_flat)
    evaluate_model(clf, X_test_reduced, y_test_flat)