In [55]:
import warnings
from numba import NumbaDeprecationWarning
warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)

import numpy as np
import pandas as pd
import wandb
import joblib

# Data Standardization and Encoding
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Modelling
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load data



In [59]:
wandb.init(project='steel')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msaahilkatariads[0m ([33msaahilkatariads-MCKV Institute of Engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [60]:
df_train = pd.read_csv('playground-series-s4e3/train.csv')
df_test = pd.read_csv('playground-series-s4e3/test.csv')


print('Train data set : {}'.format(df_train.shape))
print('Test data set : {}'.format(df_test.shape))



Train data set : (19219, 35)
Test data set : (12814, 28)


In [61]:


# Define numerical features and targets
numerical_features = [
    'Sum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400',
    'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index',
    'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index',
    'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index',
    'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas'
]
target_features = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']




In [62]:
print("Columns in df_train:", df_train.columns.tolist())
print("Columns in df_test:", df_test.columns.tolist())

Columns in df_train: ['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
Columns in df_test: ['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', '

In [63]:

def preprocess_data(df):

    """
    Preprocess the data by engineering features and dropping unnecessary columns.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: The preprocessed DataFrame with engineered features and unnecessary columns removed.
    """

    # Feature Engineering
    columns_in_df = df.columns
    print("Columns in DataFrame:", columns_in_df)
    try:
        df['X'] = df['X_Maximum'] - df['X_Minimum']
        df['Y'] = df['Y_Maximum'] - df['Y_Minimum']
        df['Luminosity'] = df['Maximum_of_Luminosity'] - df['Minimum_of_Luminosity']
        df['Area_Perimeter_Ratio'] = df['Pixels_Areas'] / (df['X_Perimeter'] + df['Y_Perimeter'])
        
        # Drop original columns
        df = df.drop(['X_Maximum', 'X_Minimum', 'Y_Maximum', 'Y_Minimum', 'Maximum_of_Luminosity', 'Minimum_of_Luminosity',
                    'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter'], axis=1)
    except KeyError as e:
        print(f"Missing column: {e}")
    
    return df



In [64]:
def build_pipeline():
    # Preprocessing pipeline

    """
    Build a machine learning pipeline that includes preprocessing, feature selection, and model training.

    Returns:
    Pipeline: A scikit-learn pipeline object that includes preprocessing, feature selection, and the XGBoost classifier.
    """

    numerical_transformer = Pipeline(steps=[
        ('scaler', RobustScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features)
        ]
    )
    
    # Complete pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif, k='all')),
        ('model', XGBClassifier(learning_rate=0.01, n_estimators=300, objective='binary:logistic'))
    ])
    
    return pipeline


In [65]:
def train_and_evaluate(X_train, y_train, X_test, y_test, target_name):
    """
    Train the model and evaluate its performance, logging metrics to W&B.

    Parameters:
    X_train (pd.DataFrame): Training features.
    y_train (pd.Series): Training target.
    X_test (pd.DataFrame): Test features.
    y_test (pd.Series): Test target.
    target_name (str): The name of the target variable for logging purposes.
    """
    # Define the pipeline
    pipeline = build_pipeline()
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Predict
    y_pred = pipeline.predict(X_test)
    
    # Evaluation metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    
    # Log metrics to W&B
    wandb.log({
        f"{target_name} Accuracy": accuracy,
        f"{target_name} F1 Score": f1,
        f"{target_name} Precision": precision,
        f"{target_name} Recall": recall
    })
    
    print(f"Target: {target_name}")
    print("Accuracy: ", accuracy)
    print("F1 Score: ", f1)
    print("Precision: ", precision)
    print("Recall: ", recall)
    

In [66]:
model_saved = False

df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)
    
X = df_train.drop(target_features + ['id'], axis=1)
y = df_train[target_features]

for i, target in enumerate(target_features):
    print(f"Processing target: {target}")
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y[target], test_size=0.3, random_state=42)
    
    # Oversample
    smote = SMOTE(sampling_strategy='auto')
    X_smote, y_smote = smote.fit_resample(X_train, y_train)
    
    # Train and evaluate
    train_and_evaluate(X_smote, y_smote, X_test, y_test, target)
    
    # Save model only once
    if not model_saved:
        joblib.dump(build_pipeline().fit(X_smote, y_smote), 'model.joblib')
        model_saved = True

# Finish W&B run
wandb.finish()

Columns in DataFrame: Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')
Columns in DataFrame: Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'E

0,1
Bumps Accuracy,▁
Bumps F1 Score,▁
Bumps Precision,▁
Bumps Recall,▁
Dirtiness Accuracy,▁
Dirtiness F1 Score,▁
Dirtiness Precision,▁
Dirtiness Recall,▁
K_Scatch Accuracy,▁
K_Scatch F1 Score,▁

0,1
Bumps Accuracy,0.76448
Bumps F1 Score,0.52384
Bumps Precision,0.5134
Bumps Recall,0.53472
Dirtiness Accuracy,0.90947
Dirtiness F1 Score,0.23907
Dirtiness Precision,0.15046
Dirtiness Recall,0.58156
K_Scatch Accuracy,0.95959
K_Scatch F1 Score,0.89066
