In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Peek at the submission layout. nrows=2 stops the parser after the two rows
# that are displayed instead of loading the whole file first.
pd.read_csv('../input/higgs-boson/random_submission.zip', nrows=2)

In [None]:
# Read the training and test tables straight from the zipped CSVs.
train_path = '../input/higgs-boson/training.zip'
test_path = '../input/higgs-boson/test.zip'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Sanity check: (rows, columns) of each frame.
print(train_df.shape, test_df.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder,normalize,MinMaxScaler
# Fit the encoder on the current Label column, then overwrite that column with
# the encoded values. LE stays in scope so predictions can be decoded later.
# NOTE: re-running this cell without reloading train_df refits LE on the
# already-encoded values.
LE = LabelEncoder().fit(train_df['Label'])
train_df['Label'] = LE.transform(train_df['Label'])
train_df.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier

import optuna
from functools import partial
from sklearn.metrics import log_loss

import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Display the column names of the training frame (bare expression -> cell output).
train_df.columns

In [None]:
# Everything except the identifier, the event weight and the target is a feature.
non_feature_columns = ['EventId', 'Weight', 'Label']
X_train = train_df.drop(columns=non_feature_columns)
# The encoded Label column is the training target.
y_train = train_df['Label']

In [None]:
def objective(trial, X, y, name='higgs'):
    """Optuna objective: fit one LightGBM binary classifier and score it.

    Hyper-parameters are drawn from ``trial``. The data is split 80/20 with a
    fixed ``random_state`` so every trial is evaluated on the same hold-out
    rows, and the model is trained with early stopping on that hold-out set.

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_uniform`` and
            ``suggest_loguniform`` (an Optuna trial).
        X: Feature table accepted by ``train_test_split`` and ``LGBMClassifier``.
        y: Binary target aligned with ``X``.
        name: Unused; kept so existing callers keep working.

    Returns:
        Log loss of the predicted class probabilities on the hold-out split
        (lower is better).
    """
    params = {'max_depth':trial.suggest_int('max_depth', 5, 50),
              # Upper bound only; early stopping decides the real tree count.
              'n_estimators':200000,
              'subsample': trial.suggest_uniform('subsample', 0.2, 1.0),
              'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.2, 1.0),
              'learning_rate':trial.suggest_uniform('learning_rate', 0.007, 0.02),
              'reg_lambda':trial.suggest_uniform('reg_lambda', 0.01, 50),
              'reg_alpha':trial.suggest_uniform('reg_alpha', 0.01, 50),
              'min_child_samples':trial.suggest_int('min_child_samples', 5, 100),
              'num_leaves':trial.suggest_int('num_leaves', 10, 200),
              'n_jobs' : -1,
              'metric':'binary_logloss',
              'max_bin':trial.suggest_int('max_bin', 300, 1000),
              'cat_smooth':trial.suggest_int('cat_smooth', 5, 100),
              'cat_l2':trial.suggest_loguniform('cat_l2', 1e-3, 100)}

    model = LGBMClassifier(objective="binary", **params)

    # Local names differ from the notebook-level X_train/y_train on purpose,
    # so the inner split cannot be confused with the global frames.
    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)],
              eval_metric=['binary_logloss'],
              early_stopping_rounds=300,
              verbose=1)

    # Log loss is defined on probabilities. Scoring the hard labels from
    # predict() would not measure the metric the model was early-stopped on.
    # labels=model.classes_ keeps the column order explicit and still works
    # if the hold-out split happens to contain a single class.
    val_proba = model.predict_proba(X_val)
    return log_loss(y_val, val_proba, labels=model.classes_)

In [None]:
# Seed the sampler so its proposals are repeatable; the default sampler is
# unseeded and the search would differ on every run.
study = optuna.create_study(
    direction="minimize",
    study_name="LGBM Classifier",
    sampler=optuna.samplers.TPESampler(seed=0),
)
# Bind the data up front so Optuna only has to supply the trial argument.
func = partial(objective, X=X_train, y=y_train)
study.optimize(func, n_trials=20)

In [None]:
# The objective returns validation log loss, so label the best value accordingly.
print(f"\tBest value (log loss): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

In [None]:
# Fixed LightGBM hyper-parameters for the final model; the keys are the same
# ones the Optuna objective tunes.
lgbm_params = dict(
    max_depth=9,
    subsample=0.7730780498269871,
    colsample_bytree=0.6387676696977731,
    learning_rate=0.0123564377640701,
    reg_lambda=46.29907650439835,
    reg_alpha=16.959124157211022,
    min_child_samples=7,
    num_leaves=115,
    max_bin=523,
    cat_smooth=49,
    cat_l2=30.182489403995866,
)

In [None]:
# Split from train_df rather than from X_train/y_train themselves: feeding the
# split its own output would shrink the training set by another 10% every time
# this cell is re-run. Deriving the inputs from train_df keeps the cell
# idempotent and gives the same result as a first run.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(['EventId', 'Weight', 'Label'], axis=1),
    train_df.Label,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Final model: fixed hyper-parameters, up to 10000 trees, with early stopping
# monitored on the hold-out split.
model = LGBMClassifier(objective="binary", n_estimators=10000, **lgbm_params)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=['binary_logloss'],
    early_stopping_rounds=300,
    verbose=1,
)

In [None]:
# The test features are every column except the event identifier.
X_test = test_df.drop(columns=['EventId'])

In [None]:
# Predict a class for every test row with the fitted model; the bare `pred`
# below displays the resulting array as the cell output.
pred = model.predict(X_test)
pred

In [None]:
# Map the encoded predictions back to the label values LE was fitted on,
# then display them.
inv_y_test = LE.inverse_transform(pred)
inv_y_test

In [None]:
# Load the sample submission as a template to fill in, and show its first rows.
submission_template_path = "../input/higgs-boson/random_submission.zip"
subm = pd.read_csv(submission_template_path)
subm.head(2)

In [None]:
# Use item assignment for the column: attribute-style assignment (subm.Class = ...)
# only updates a column that already exists and otherwise sets a plain Python
# attribute, silently leaving the submission without the predictions.
# NOTE(review): assumes the template rows are in the same order as test_df — confirm.
subm['Class'] = inv_y_test

In [None]:
# argsort gives the row order that sorts Class; argsort of that order gives each
# row's position in the sorted sequence. Adding 1 makes the ranks start at 1.
class_sort_order = subm['Class'].argsort()
subm['RankOrder'] = class_sort_order.argsort() + 1

In [None]:
# Write the submission to the working directory without the DataFrame index.
submission_path = "higgssubm4.csv"
subm.to_csv(submission_path, index=False)