In [34]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score

import optuna

import warnings
warnings.filterwarnings('ignore')

In [35]:
# Read the competition train/test splits from disk.
# The 'id' column is removed from the training frame right after loading;
# the test frame keeps it.
test  = pd.read_csv('../dataset/test.csv')
train = pd.read_csv('../dataset/train.csv').drop(columns=['id'])

# Preview the first rows of the training frame
train.head(3)

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,5448.79,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,936.71,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,1754.01,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False


In [36]:
# Read the original dataset (jm1.csv)
original_data = pd.read_csv('../dataset/jm1.csv')

# All columns should be numerical, but some use "?" for missing values and are
# therefore parsed as dtype object. Re-parse each of those columns as numeric;
# errors='coerce' turns every non-convertible entry into NaN.
object_columns = [name for name, dtype in original_data.dtypes.items() if dtype == 'object']
for name in object_columns:
    original_data[name] = pd.to_numeric(original_data[name], errors='coerce')

# Impute the missing values with the per-column mean
original_data = original_data.fillna(original_data.mean())

# Data augmentation using the original dataset (the Kaggle competition allows this):
# append its rows to the competition training data, drop duplicated rows and
# renumber the index.
train = (
    pd.concat([train, original_data], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Report the remaining duplicated rows, then summarise the training frame
print('Duplication in train set: ', train.duplicated().sum())
print('Duplication in test set: ', test.duplicated().sum())
train.info()

Duplication in train set:  0
Duplication in test set:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110675 entries, 0 to 110674
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   loc                110675 non-null  float64
 1   v(g)               110675 non-null  float64
 2   ev(g)              110675 non-null  float64
 3   iv(g)              110675 non-null  float64
 4   n                  110675 non-null  float64
 5   v                  110675 non-null  float64
 6   l                  110675 non-null  float64
 7   d                  110675 non-null  float64
 8   i                  110675 non-null  float64
 9   e                  110675 non-null  float64
 10  b                  110675 non-null  float64
 11  t                  110675 non-null  float64
 12  lOCode             110675 non-null  int64  
 13  lOComment          110675 non-null  int64  
 14  lOBlank            110675 non-null  int64  

In [37]:
# Feature engineering
def _find_highly_correlated_features(df, threshold=0.93):
    """Return the columns of ``df`` that correlate strongly with an earlier column.

    A column is selected when the absolute value of its correlation (as computed
    by ``DataFrame.corr``) with any column positioned before it in ``df`` is
    strictly greater than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float, default 0.93
        Absolute-correlation cut-off.

    Returns
    -------
    list
        Selected column names, in the column order of ``df``.
    """
    corr_matrix = df.corr().abs()
    # Keep only the strict upper triangle so every pair is inspected once and
    # the diagonal (self-correlation) is ignored.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(upper_mask)
    exceeds = (upper > threshold).any(axis=0)
    return [column for column, hit in exceeds.items() if hit]


def generate_additional_features(df, threshold=0.93, correlated_features=None):
    """Return a copy of ``df`` extended with engineered feature columns.

    The input frame is NOT modified. The returned frame contains, in order:

    1. the original columns;
    2. row-wise statistics: ``mean``, ``median``, ``std_dev``, ``range``, ``variance``;
    3. for every original column ``c``: ``c_squared`` and ``c_feature``
       (``c_feature`` is ``log(c + 1)``);
    4. for every ordered pair ``(a, b)`` of distinct highly correlated columns:
       ``a_x_b`` (product) and ``a_div_b`` (ratio, zeros in ``b`` replaced by 1e-6).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame.
    threshold : float, default 0.93
        Absolute-correlation cut-off used to pick the interaction columns when
        ``correlated_features`` is not supplied.
    correlated_features : iterable of column names, optional
        Explicit columns to build interaction features from. By default the
        selection is recomputed from ``df`` itself, so two different frames
        (e.g. train and test) may end up with different interaction columns.
        Passing the same list for both frames (for instance the result of
        ``_find_highly_correlated_features(train_features, threshold)``)
        guarantees identical output columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original and the engineered columns.

    Raises
    ------
    KeyError
        If ``correlated_features`` names a column that is not in ``df``.
    """
    base = df.copy()

    # Statistical features, computed row-wise over the original columns only
    engineered = {
        'mean': base.mean(axis=1),
        'median': base.median(axis=1),
        'std_dev': base.std(axis=1),
        'range': base.max(axis=1) - base.min(axis=1),
        'variance': base.var(axis=1),
    }

    # Derived features
    for col in base.columns:
        engineered[col + '_squared'] = base[col] ** 2
        engineered[col + '_feature'] = np.log(base[col] + 1)  # Adding 1 to avoid log(0)

    # Columns that drive the interaction features
    if correlated_features is None:
        highly_correlated_features = _find_highly_correlated_features(base, threshold)
    else:
        # De-duplicate while keeping the caller's order
        highly_correlated_features = list(dict.fromkeys(correlated_features))
        missing = [name for name in highly_correlated_features if name not in base.columns]
        if missing:
            raise KeyError(f"correlated_features not found in the frame: {missing}")
    print('Number of highly correlated features: ', len(highly_correlated_features))

    # Zero-safe denominators, computed once per column instead of once per pair
    safe_denominators = {
        name: np.where(base[name] == 0, 1e-6, base[name])
        for name in highly_correlated_features
    }

    # Interaction features for every ordered pair of distinct selected columns
    for feature1 in highly_correlated_features:
        for feature2 in highly_correlated_features:
            if feature1 == feature2:
                continue
            engineered[f"{feature1}_x_{feature2}"] = base[feature1] * base[feature2]
            engineered[f"{feature1}_div_{feature2}"] = base[feature1] / safe_denominators[feature2]

    # Attach all new columns in a single concat rather than inserting them one
    # by one. An input column that shares a name with an engineered column is
    # replaced, so the result never carries duplicated column labels.
    engineered_frame = pd.DataFrame(engineered)
    overlap = [name for name in engineered_frame.columns if name in base.columns]
    return pd.concat([base.drop(columns=overlap), engineered_frame], axis=1)
    

# Split the training frame into the integer-encoded label and the feature matrix
y = train['defects'].astype(int)          # Label
X = train.drop(columns=['defects'])       # Features

# The test frame carries features only once its 'id' column is removed
X_test = test.drop(columns=['id'])

# Apply the same feature engineering to both feature matrices
X = generate_additional_features(X)
X_test = generate_additional_features(X_test)

X.head(3)


Number of highly correlated features:  5
Number of highly correlated features:  5


Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,total_Opnd_x_branchCount,total_Opnd_div_branchCount,branchCount_x_v,branchCount_div_v,branchCount_x_b,branchCount_div_b,branchCount_x_total_Op,branchCount_div_total_Op,branchCount_x_total_Opnd,branchCount_div_total_Opnd
0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,5448.79,...,110.0,4.4,1393.15,0.017945,0.45,55.555556,190.0,0.131579,110.0,0.227273
1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,936.71,...,42.0,4.666667,453.81,0.019832,0.15,60.0,54.0,0.166667,42.0,0.214286
2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,1754.01,...,51.0,5.666667,592.95,0.015178,0.21,42.857143,84.0,0.107143,51.0,0.176471


### Model

In [38]:
# Optional hyper-parameter search. It is disabled by default; set the flag to
# True to re-run the Optuna study.
RUN_HYPERPARAMETER_SEARCH = False

if RUN_HYPERPARAMETER_SEARCH:
    def objective(trial):
        """Return the mean validation AUC over a stratified 10-fold split.

        Only ``colsample_bytree`` is sampled from the trial (range 0.5-1.0);
        every other LightGBM parameter is fixed.
        """
        params = {'boosting_type': 'gbdt',
                  'colsample_bytree': trial.suggest_float('colsample_bytree',0.5,1.0),
                  'learning_rate': 0.09273502446736404,
                  'max_depth': -1,
                  'min_child_samples': 20,
                  'min_child_weight': 0.001,
                  'min_split_gain': 0.0,
                  'n_jobs': -1,
                  'num_leaves': 10,
                  'random_state': None,
                  'reg_alpha': 0.8978200441138784,
                  'reg_lambda': 0.0020343781703193705,
                  'subsample': 1.0,
                  'subsample_for_bin': 200000,
                  'subsample_freq': 0,
                  'verbose': -1,
                  'objective': 'binary',
                  'metric': ['auc'],
                  'num_iterations': 100,
                  'early_stopping_round': None}

        nfolds = 10
        skfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=0)

        lgb_auc_score_avg = 0
        for train_idx, val_idx in skfold.split(X,y):
            # split() yields positional indices, so X and y are both sliced
            # with .iloc (y[train_idx] would be a label-based lookup).
            train_X = X.iloc[train_idx]
            val_X = X.iloc[val_idx]
            train_y = y.iloc[train_idx]
            val_y = y.iloc[val_idx]

            lgb_model = LGBMClassifier(**params)
            lgb_model.fit(train_X,train_y)
            lgb_prediction = lgb_model.predict_proba(val_X)[:,1]

            lgb_auc_score_avg += roc_auc_score(val_y, lgb_prediction)

        lgb_auc_score_avg /=nfolds
        print('The averaged AUC score evaluated on the validation subset using LGB model:', lgb_auc_score_avg)

        return lgb_auc_score_avg

    # AUC is a higher-is-better score, so the study maximizes it directly
    study = optuna.create_study(direction='maximize')
    study.optimize(objective,n_trials=100)
    print(study.best_trial.params)

In [39]:
# Fixed LightGBM parameters used for every fold model
params = {'boosting_type': 'gbdt', 
          'colsample_bytree': 1.0, 
          'learning_rate': 0.09273502446736404, 
          'max_depth': -1, 
          'min_child_samples': 20, 
          'min_child_weight': 0.001, 
          'min_split_gain': 0.0, 
          'n_jobs': -1, 
          'num_leaves': 10, 
          'random_state': None, 
          'reg_alpha': 0.8978200441138784,
          'reg_lambda': 0.0020343781703193705, 
          'subsample': 1.0, 
          'subsample_for_bin': 200000, 
          'subsample_freq': 0, 
          'verbose': -1, 
          'objective': 'binary', 
          'metric': ['auc'], 
          'num_iterations': 100, 
          'early_stopping_round': None}

nfolds = 10 # Group the training data into 10 subgroups (AKA folds)
skfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=0)

lgb_auc_score_avg = 0   # Running sum of the fold AUCs, turned into the mean after the loop
lgb_models = []         # One fitted model per fold, kept for the test-set prediction


for idx, (train_idx, val_idx) in enumerate(skfold.split(X,y)): # For each fold
    # Prepare training and validation data.
    # split() yields positional indices, so X and y are both sliced with .iloc
    # (y[train_idx] would be a label-based lookup that only matches the
    # positions while y has a default 0..n-1 index).
    train_X = X.iloc[train_idx]
    val_X   = X.iloc[val_idx]
    train_y = y.iloc[train_idx]
    val_y   = y.iloc[val_idx]

    # Create & train model
    lgb_model = LGBMClassifier(**params) # Gradient Boost model
    lgb_model.fit(train_X, train_y)

    # Validate model on the held-out fold using the class-1 probabilities
    lgb_prediction = lgb_model.predict_proba(val_X)[:,1]
    lgb_auc_score = roc_auc_score(val_y, lgb_prediction)
    print(f'The AUC (area under curve) score evaluated on the validation, fold {idx}: {lgb_auc_score}')
    
    # Save results
    lgb_auc_score_avg += lgb_auc_score
    lgb_models.append(lgb_model)

lgb_auc_score_avg /=nfolds
print(f'The averaged AUC score evaluated on the validation subset using LGB model: {lgb_auc_score_avg}')

The AUC (area under curve) score evaluated on the validation, fold 0: 0.7911372278633726
The AUC (area under curve) score evaluated on the validation, fold 1: 0.7920725834035451
The AUC (area under curve) score evaluated on the validation, fold 2: 0.7837621765702453
The AUC (area under curve) score evaluated on the validation, fold 3: 0.7843440772099073
The AUC (area under curve) score evaluated on the validation, fold 4: 0.778610618916663
The AUC (area under curve) score evaluated on the validation, fold 5: 0.7821216248709223
The AUC (area under curve) score evaluated on the validation, fold 6: 0.7854676764871444
The AUC (area under curve) score evaluated on the validation, fold 7: 0.7817232310278882
The AUC (area under curve) score evaluated on the validation, fold 8: 0.7866729232914196
The AUC (area under curve) score evaluated on the validation, fold 9: 0.7835961410853349
The averaged AUC score evaluated on the validation subset using LGB model: 0.7849508280726443


### Prediction for the test set

In [40]:
# Average the class-1 probabilities predicted by every fold model for the test set
fold_probabilities = (model.predict_proba(X_test)[:, 1] for model in lgb_models)
prediction = sum(fold_probabilities, np.zeros(len(X_test))) / len(lgb_models)

In [41]:
# Pair every test id with its averaged prediction and write the result in the
# competition submission format.
submission = test[['id']].copy()
submission['defects'] = prediction
submission.to_csv('submission.csv', index=False)
submission.head(3)

Unnamed: 0,id,defects
0,101763,0.249872
1,101764,0.187552
2,101765,0.651635
