In [2]:
import numpy as np 
import pandas as pd
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None
import seaborn as sns

In [3]:
# Load the labelled training table from the competition input directory.
train = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-aug-2022/train.csv",
)
train

In [4]:
# Load the unlabelled test table and keep its ids aside for the submission file.
test = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-aug-2022/test.csv",
)
test_id = test['id']

In [5]:
# Load the sample submission to inspect the expected output layout.
subm = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-aug-2022/sample_submission.csv",
)
subm

In [6]:
# Class balance of the target column.
train.loc[:, 'failure'].value_counts()

In [7]:
# Remove the row identifier so it is not used as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError.
train = train.drop(columns=['id'], errors='ignore')

In [8]:
# Summary statistics of the training frame.
train.describe()

In [10]:
# Columns handled as categorical features; every other non-target column is
# treated as numeric further down.
categorical = [
    'attribute_0',
    'product_code',
    'attribute_1',
    "attribute_2",
    "attribute_3",
]
cate_train = train.loc[:, categorical]
cate_train

In [13]:

from sklearn.preprocessing import OrdinalEncoder
# Ordinal-encode the categorical columns; values not seen while fitting are
# encoded as -1 instead of raising.
encode1 = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)
encoder = encode1.fit(cate_train)
encoded = encoder.transform(cate_train)
encoded

In [14]:
# Number of missing values per column.
train.isnull().sum()

In [15]:
# Everything that is neither categorical nor the target is treated as numeric.
nume_train = train.loc[
    :,
    [col for col in train.columns if col not in (*categorical, 'failure')],
]
nume_train

In [16]:
from sklearn.impute import SimpleImputer
# Fill gaps in the categorical columns with each column's most frequent value.
# NOTE(review): the imputed array produced here is only displayed; the feature
# table assembled later is built from `encoded`, not from this output.
imp_freq = SimpleImputer(
    strategy='most_frequent',
)
imp_freq_cate = imp_freq.fit(cate_train)
transformed_cate_train = imp_freq_cate.transform(cate_train)
transformed_cate_train

In [17]:
# Fill gaps in the numeric columns with the per-column mean; the fitted imputer
# is reused on the test set later.
imp_mean = SimpleImputer(
    strategy='mean',
)
imp_mean_nume = imp_mean.fit(nume_train)
transformed_nume_train = imp_mean_nume.transform(nume_train)
transformed_nume_train

In [18]:
# Put the imputed array back into a DataFrame carrying the original column names.
transformed_nume_train = pd.DataFrame(
    data=transformed_nume_train,
    columns=nume_train.columns,
)
transformed_nume_train.head(5)

In [19]:
# Assemble the modelling table column-wise: imputed numerics, encoded
# categoricals, then the target.
result = pd.concat(
    [
        transformed_nume_train,
        pd.DataFrame(encoded, columns=cate_train.columns),
        train[['failure']],
    ],
    axis='columns',
)
result

In [20]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [21]:
# Separate the feature matrix from the target vector.
y = result.loc[:, 'failure']
X = result.drop(columns='failure')

In [22]:
# Rebalance the classes with the combined SMOTE + Tomek-links resampler
# (fixed seed for repeatability).
smk = SMOTETomek(
    random_state=42,
)
X_res, y_res = smk.fit_resample(X, y)

In [23]:
# Drop intermediates that are not needed once the resampled data exists.
del transformed_cate_train, transformed_nume_train
del cate_train, nume_train
del train

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,auc
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import optuna

In [25]:
# Hold out 10% of the resampled data for validation and train on the other 90%.
# The previous test_size=0.9 did the opposite (fit on 10%, validate on 90%),
# which is inconsistent with the 0.1 hold-out used inside `objective`.
# NOTE(review): the split is taken from the already-resampled X_res/y_res, so
# the validation part also contains resampled rows; scores measured on it may
# be optimistic compared with untouched data.
X_train, X_val, y_train, y_val = train_test_split(
    X_res, y_res, test_size=0.1, random_state=0
)

In [26]:
def objective(trial, X, y, name='higgs'):
    """Optuna objective: fit one LightGBM classifier and score it by validation AUC.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters listed in ``params``.
    X, y : features and target
        Split internally 90/10 (fixed seed) into a fitting part and a
        validation part.
    name : str, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    float
        AUC on the internal validation split, as recorded by LightGBM in
        ``best_score_``. The study maximizes this value.
    """
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 5, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 80),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
    }
    model = LGBMClassifier(objective="binary", **params)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0)

    # NOTE(review): no n_estimators is passed above, so with the library's
    # default number of trees the 500-round patience may never trigger — confirm.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
              eval_metric=['auc'],
              early_stopping_rounds=500,
              verbose=1)

    # Previously this returned the constant True, which gave every trial the
    # same score and made the search meaningless. The single unnamed eval_set
    # is registered by LightGBM under the key 'valid_0'.
    return model.best_score_['valid_0']['auc']

In [27]:
# Seed the sampler so a fresh run proposes the same sequence of hyperparameters.
study = optuna.create_study(
    direction="maximize",
    study_name="LGBM Classifier",
    sampler=optuna.samplers.TPESampler(seed=42),
)


def func(trial):
    """Bind the training split to `objective` so Optuna can call it with a trial only."""
    return objective(trial, X=X_train, y=y_train)


study.optimize(func, n_trials=50)

In [28]:
# The study is created with direction="maximize" and the model is evaluated
# with eval_metric 'auc', so the label is "auc" (the old "rmse" was wrong).
print(f"\tBest value (auc): {study.best_value:.5f}")
print("\tBest params:")

for param_name, param_value in study.best_params.items():
    print(f"\t\t{param_name}: {param_value}")

In [29]:
# Display the assembled training table (features plus the 'failure' column).
result

In [30]:
# Refit with the best hyperparameters found by the study and a large tree
# budget, stopping after 500 rounds without improvement on the validation set.
model = LGBMClassifier(
    n_estimators=10000,
    objective="binary",
    **study.best_params,
)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=['auc'],
    early_stopping_rounds=500,
    verbose=1,
)

In [31]:
# Remove the identifier from the test features (it was saved as test_id earlier).
# Rebinding with errors='ignore' keeps the cell idempotent: re-running it no
# longer raises KeyError once 'id' is already gone.
test = test.drop(columns=['id'], errors='ignore')

In [33]:
# Split the TEST frame into categorical and remaining columns. The *_train
# names are reused here even though the data comes from `test`.
cate_train = test.loc[:, categorical]
nume_train = test.loc[:, [col for col in test.columns if col not in categorical]]
nume_train

In [34]:
# Apply the encoder and imputers that were fitted on the training data to the
# test columns (transform only, no refitting).
encoded = encoder.transform(cate_train)
transformed_cate_train = imp_freq_cate.transform(cate_train)
transformed_nume_train = pd.DataFrame(
    imp_mean_nume.transform(nume_train),
    columns=nume_train.columns,
)
transformed_nume_train.head(5)

In [35]:
# Assemble the test feature table: imputed numerics followed by the encoded
# categorical columns.
result = pd.concat(
    [
        transformed_nume_train,
        pd.DataFrame(encoded, columns=cate_train.columns),
    ],
    axis='columns',
)
result

In [36]:
# The model is tuned and early-stopped on AUC, a ranking metric, so keep the
# predicted probability of the positive class rather than hard 0/1 labels.
# Column 1 of predict_proba is presumably the class labelled 1 — verify
# against model.classes_.
pred = model.predict_proba(result)[:, 1]

In [37]:
# Build the submission table: saved test ids plus the predicted 'failure' column.
submission = test_id.to_frame()
submission['failure'] = pred

In [38]:
# Write the submission without the row index. `index` is documented as a bool,
# so pass False explicitly instead of relying on None being treated as falsy.
submission.to_csv('./submission.csv', index=False)