In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [34]:
# Load the labelled training table and the unlabelled test table from the
# shared data directory (paths are relative to the notebook's location).
train_csv = "../data/synthetic_real_data.csv"
test_csv = "../data/test_modified.csv"

data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

#### Different types of Faults

1. `Pastry`: Small patches or irregularities on the steel surface caused during manufacturing or transport. They affect the smoothness and appearance of the steel surface.
2. `Z_Scratch`: Narrow scratches or marks on the steel surface, parallel to the rolling direction, caused by handling, machining, or contact with abrasive materials.
3. `K_Scratch`: Similar to `Z_Scratch`, but the scratches run perpendicular to the rolling direction. They are caused by the same factors.
4. `Stains`: Refers to discolored areas on the plate. They are caused by rust, grease, oil or other foreign substances that come in contact with steel during processing, storage and handling.
5. `Dirtiness`: Presence of dirt or particulate matter on the steel surface, introduced during the manufacturing, handling and storage processes.
6. `Bumps`: Raised or protruding areas on the surface of the steel plate that are caused by irregularities in the manufacturing process like uneven rolling, cooling or physical damage.
7. `Other_Faults`: Other faults that are not covered like surface imperfections, irregularities or abnormalities.

In [35]:
# Show the column names of the training frame as loaded, before any
# engineered columns are added.
data.columns

Index(['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
       'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index',
       'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index',
       'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index',
       'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas',
       'TypeOfSteel', 'Fault_class'],
      dtype='object')

In [36]:
def add_engineered_features(frame):
    """Return a copy of ``frame`` with the four derived columns appended.

    Derived columns (added in this order, or overwritten if already present):

    * ``X_range``          -- ``X_Maximum - X_Minimum``
    * ``Y_range``          -- ``Y_Maximum - Y_Minimum``
    * ``fault_density``    -- ``Pixels_Areas / (X_Perimeter * Y_Perimeter)``
    * ``luminosity_range`` -- ``Maximum_of_Luminosity - Minimum_of_Luminosity``

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the raw columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    perimeter_product = frame.X_Perimeter * frame.Y_Perimeter
    # A zero perimeter product makes the ratio +/-inf. Map those to NaN so the
    # value is treated as missing instead of as an infinitely dense fault.
    fault_density = (frame.Pixels_Areas / perimeter_product).replace(
        [np.inf, -np.inf], np.nan
    )
    return frame.assign(
        X_range=frame.X_Maximum - frame.X_Minimum,
        Y_range=frame.Y_Maximum - frame.Y_Minimum,
        fault_density=fault_density,
        luminosity_range=frame.Maximum_of_Luminosity - frame.Minimum_of_Luminosity,
    )


# Apply the identical transformation to both tables so the train and test
# features cannot drift apart.
data = add_engineered_features(data)
test_data = add_engineered_features(test_data)

In [37]:
# Keep only the training rows whose X range, Y range and luminosity range are
# all strictly positive. The test table is not filtered here.
has_positive_ranges = (
    (data.X_range > 0)
    & (data.Y_range > 0)
    & (data.luminosity_range > 0)
)
data = data[has_positive_ranges]

In [38]:
# Columns fed to the model. The raw X/Y min/max, perimeter, min/max luminosity
# and log-scaled columns are left out; the four engineered columns are used
# instead.
features = [
    # measurements taken directly from the loaded table
    'Pixels_Areas',
    'Sum_of_Luminosity',
    'Length_of_Conveyer',
    'Steel_Plate_Thickness',
    # index-style columns from the loaded table
    'Edges_Index',
    'Empty_Index',
    'Square_Index',
    'Outside_X_Index',
    'Edges_X_Index',
    'Edges_Y_Index',
    'Outside_Global_Index',
    'Orientation_Index',
    'Luminosity_Index',
    'SigmoidOfAreas',
    'TypeOfSteel',
    # columns engineered earlier in the notebook
    'X_range',
    'Y_range',
    'fault_density',
    'luminosity_range',
]

# Kept as a one-element list, so selecting with it yields a one-column frame.
target = ['Fault_class']

In [39]:
# Design matrix and label frame for the labelled rows. `target` is a list, so
# y_train is a one-column DataFrame rather than a Series.
X_train, y_train = data[features], data[target]

# Same feature columns for the unlabelled test rows.
X_test = test_data[features]

In [42]:
from sklearn.model_selection import train_test_split
from datetime import datetime

In [41]:
# Hold out 30% of the labelled rows for validation, with a fixed seed so the
# split is repeatable.
# The split is taken straight from `data` instead of from X_train/y_train:
# feeding the cell its own outputs meant every re-run split the already
# reduced training set again and silently shrank it. Splitting from the source
# frame gives the same result on a first run and makes the cell idempotent.
X_train, X_val, y_train, y_val = train_test_split(
    data[features],
    data[target],
    test_size=0.3,
    random_state=37,
)

In [51]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

In [44]:
# Baseline gradient-boosted classifier with the library's default
# hyper-parameters, trained on the training portion of the split.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

In [48]:
# Accuracy of the hard class predictions on the held-out validation rows.
val_predictions = xgb.predict(X_val)
accuracy_score(y_true=y_val, y_pred=val_predictions)

0.6152032999410725

In [54]:
from tensorflow.keras.utils import to_categorical

# Per-class probabilities for the validation rows, plus a one-hot encoding of
# the validation labels for the ROC AUC computed from them.
y_pred = xgb.predict_proba(X_val)
y_true = to_categorical(y_val)

2024-03-26 12:24:45.948916: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [55]:
# ROC AUC of the predicted probabilities against the one-hot validation
# labels, using roc_auc_score's default averaging.
roc_auc_score(y_true=y_true, y_score=y_pred)

0.9109979315697517

In [49]:
# Per-class precision / recall / F1 on the validation rows. The report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
val_report = classification_report(y_true=y_val, y_pred=xgb.predict(X_val))
print(val_report)

              precision    recall  f1-score   support

           0       0.54      0.44      0.49       800
           1       0.69      0.69      0.69       697
           2       0.87      0.90      0.89       947
           3       0.84      0.87      0.86       531
           4       0.61      0.52      0.56       565
           5       0.52      0.50      0.51      1382
           6       0.50      0.56      0.53      1866

    accuracy                           0.62      6788
   macro avg       0.65      0.64      0.64      6788
weighted avg       0.62      0.62      0.61      6788



In [None]:
# Predicted class probabilities for every row of the test table.
test_probabilities = xgb.predict_proba(X_test)

# Reuse an existing submission file as a template: its first column is kept
# and every remaining column is overwritten with the predicted probabilities.
# NOTE(review): assumes the template's rows are in the same order as
# test_data and its columns follow the model's class order -- confirm.
submission = pd.read_csv("../data/submission_basic_model.csv")
submission.iloc[:, 1:] = test_probabilities

output_csv = "../data/submission_xgb_synthetic_data.csv"
submission.to_csv(output_csv, index=False)