In [55]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer
import optuna

import xgboost as xgb
import gc
import math, random
import lightgbm as lgb
from sklearn.metrics import f1_score, classification_report

In [56]:
df=pd.read_csv("../input/techbomb2/main.csv")

df=df[['Date', 'Time',
       'Current', 'Humidity', 'Temperature', 'Flow', 'Job Temp', 'Voltage',
       'Defect']]

# Replacing NaN values
df.loc[266339,'Current']=0.41
df.loc[524124,'Current']=1.07
df.loc[814609,'Current']=-0.6

df.drop([
    'Date', 
         'Time'
        ], axis=1, inplace=True)

# Mapping number to defects
def mapping_defects(elem):
    if 'No Defect' in elem:
        return 0
    elif 'Tungsten Inclusion' in elem:
        return 1
    elif 'Porosity' in elem:
        return 2
    else:
        pass
    
df['Defect'] = df['Defect'].map(lambda x:mapping_defects(x))
df['Temperature']=abs(df['Temperature'])
df['Humidity']=abs(df['Humidity'])



Columns (1,2) have mixed types.Specify dtype option on import or set low_memory=False.



In [57]:
df=df[df.Humidity < np.percentile(df.Humidity,99)]
df=df[df.Temperature < np.percentile(df.Temperature,99)]

df=df[df["Job Temp"] < np.percentile(df["Job Temp"],96)]


In [4]:
index = df.index
defect = df['Defect'].values

In [5]:
# Shifting previous data and making it as a feature for current row

# only previous 4 rows are considered but it can be adjusted 
temp_dict = [{f"{i}": df[f"{i}"].values,
              f"{i}_1": df[f"{i}"].shift(1),
              f"{i}_2": df[f"{i}"].shift(2),
              f"{i}_3": df[f"{i}"].shift(3),
              f"{i}_4": df[f"{i}"].shift(4) ,
#               f"{i}_5": df[f"{i}"].shift(5),
#               f"{i}_6": df[f"{i}"].shift(6),
#               f"{i}_7": df[f"{i}"].shift(7),
#               f"{i}_8": df[f"{i}"].shift(8)
             } for i in df.columns[:-1]]
             

In [6]:
df_Current = pd.DataFrame(temp_dict[0])
df_Humidity = pd.DataFrame(temp_dict[1])
df_Temperature = pd.DataFrame(temp_dict[2])
df_Flow = pd.DataFrame(temp_dict[3])
df_job_temp = pd.DataFrame(temp_dict[4])
df_Voltage = pd.DataFrame(temp_dict[5])

In [7]:
final_df = pd.concat([df_Current, df_Humidity,df_Temperature, df_Flow,df_job_temp, df_Voltage], axis=1)
final_df['Defect']= df['Defect'].values

In [8]:
label = final_df['Defect']

In [9]:
import sklearn
train, test = sklearn.model_selection.train_test_split(final_df, test_size=0.01,stratify=label)

In [12]:
def evaluate_weightedF1_lgb(truth, predictions):  
    pred_labels = predictions.reshape(len(np.unique(truth)),-1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='weighted')
    return ('weightedF1', f1, True)

In [27]:
def objective(trial):
    
    train_x, test_x, train_y, test_y =sklearn.model_selection.train_test_split(train.drop('Defect', axis=1),train['Defect'], test_size=0.1,stratify=train['Defect'])
    

    param = {
        'objective':"multiclass",
        'random_state': 48,
        'n_estimators': 20000,
        'metric':"multi_logloss",

        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'device':'gpu'
    }
    
    model =  lgb.LGBMClassifier(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)

    f1 = f1_score(test_y, preds, average='weighted')
    print(f1)
    return f1

In [28]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2022-12-04 05:53:17,020][0m A new study created in memory with name: no-name-450cda1f-1500-46b2-965e-2475cd248e87[0m
[32m[I 2022-12-04 05:55:53,257][0m Trial 0 finished with value: 0.9944041715157363 and parameters: {'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 100, 'num_leaves': 520, 'min_child_samples': 48}. Best is trial 0 with value: 0.9944041715157363.[0m


0.9944041715157363


[32m[I 2022-12-04 05:59:26,346][0m Trial 1 finished with value: 0.9943651787452139 and parameters: {'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.017, 'max_depth': 20, 'num_leaves': 967, 'min_child_samples': 207}. Best is trial 0 with value: 0.9944041715157363.[0m


0.9943651787452139


[32m[I 2022-12-04 06:02:31,637][0m Trial 2 finished with value: 0.9942314406562343 and parameters: {'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 100, 'num_leaves': 922, 'min_child_samples': 215}. Best is trial 0 with value: 0.9944041715157363.[0m


0.9942314406562343


[33m[W 2022-12-04 06:03:02,243][0m Trial 3 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_23/3333571382.py", line 24, in objective
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
  File "/opt/conda/lib/python3.7/site-packages/lightgbm/sklearn.py", line 972, in fit
    callbacks=callbacks, init_model=init_model)
  File "/opt/conda/lib/python3.7/site-packages/lightgbm/sklearn.py", line 758, in fit
    callbacks=callbacks
  File "/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py", line 292, in train
    booster.update(fobj=fobj)
  File "/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py", line 3023, in update
    ctypes.byref(is_finished)))
KeyboardInterrupt


KeyboardInterrupt: 

In [30]:
print('Best trial:', study.best_trial.params, ' with F1 Score', study.best_value)

Best trial: {'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 100, 'num_leaves': 520, 'min_child_samples': 48}  with F1 Score 0.9944041715157363


In [47]:
params = study.best_params   
params['random_state'] = 48
params['n_estimators'] = 20000 
params['metric'] = 'multi_logloss'
params['objective'] = "multiclass"

In [58]:
import warnings
warnings.filterwarnings("ignore")

In [70]:
columns = [i for i in final_df.columns if i != 'Defect']
preds = np.zeros(test.shape[0])

kf = sklearn.model_selection.StratifiedKFold(n_splits=3,random_state=48,shuffle=True)

F1_SCORES=[] 

n=0
for trn_idx, test_idx in kf.split(train[columns],train['Defect']):
    
    X_tr,X_val=train[columns].iloc[trn_idx],train[columns].iloc[test_idx]
    y_tr,y_val=train['Defect'].iloc[trn_idx],train['Defect'].iloc[test_idx]
    
    model =  lgb.LGBMClassifier(**params)
    
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
                                
    preds+=model.predict(test[columns])/kf.n_splits
                                
    F1_SCORES.append(f1_score(y_val, model.predict(X_val), average='weighted'))
    
    print('F1 SCore', n+1,F1_SCORES[n])
    print("classification Report")
    print(classification_report(model.predict(X_val), y_val))
    n+=1
    break

F1 SCore 1 0.9937687004092228
classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    242438
           1       0.37      0.83      0.52       545
           2       0.00      0.00      0.00         0

    accuracy                           1.00    242983
   macro avg       0.46      0.61      0.50    242983
weighted avg       1.00      1.00      1.00    242983



In [74]:
import joblib

joblib.dump(model, 'lgbm_weldtright.pkl')

['lgbm_weldtright.pkl']