In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LassoCV
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from vecstack import stacking

In [None]:
!pip install lightgbm

In [None]:
!pip install catboost

In [None]:
# Load the training dataset.
df = pd.read_csv("train.csv")

# Summary information about the dataset. These are printed explicitly because
# a notebook cell only renders its final expression; bare expressions in the
# middle of a cell are evaluated and then silently discarded.
print(df.head())
print(df.describe().T)

# Count of missing values per column.
print(df.isnull().sum())

# Split into the dependent variable (y) and the independent variables (X).
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 10% of the rows as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics reach training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Base random-forest estimator handed to the grid search. The seed (same value
# as the train/test split) makes the search results repeatable across runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Candidate hyper-parameter values explored by the random-forest grid search.
rf_params = dict(
    n_estimators=[100, 500, 1000],
    max_features=[30, 50, 103],
    min_samples_split=[5, 18, 40],
)

In [None]:
# Exhaustive 5-fold grid search over rf_params, using every available core.
# Candidates are ranked by negated log loss -- the metric this notebook reports
# on the held-out set -- instead of GridSearchCV's default classifier accuracy.
rf_cv_model = GridSearchCV(
    rf, rf_params, cv=5, scoring="neg_log_loss", n_jobs=-1, verbose=True
).fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the random-forest grid search.
rf_cv_model.best_params_

In [None]:
# Refit the forest with the parameters the grid search actually selected.
# Reading them from best_params_ (rather than hand-copying numbers) guarantees
# the tuned model always matches the search result. The seed keeps the fitted
# model, and therefore the reported log loss, repeatable.
rf_tuned = RandomForestClassifier(
    **rf_cv_model.best_params_, n_jobs=-1, verbose=True, random_state=42
).fit(X_train, y_train)

In [None]:
# Class-probability predictions of the tuned forest on the held-out test set.
y_pred = rf_tuned.predict_proba(X_test)

# Measure the error once and report it. Passing the fitted class labels keeps
# the probability columns aligned with the labels even when some class does
# not occur in y_test (log_loss would otherwise reject the shape mismatch).
rf_log_loss = log_loss(y_test, y_pred, labels=rf_tuned.classes_)
print(rf_log_loss)

In [None]:
# Base XGBoost classifier and the hyper-parameter grid to search over.
xgb = XGBClassifier()
xgb_params = {
    "n_estimators": [100, 500],
    "subsample": [0.6, 0.8],
    "max_depth": [30, 50],
    "learning_rate": [0.01, 0.001],
}

# Grid search using every available core. cv=5 is spelled out to match the
# random-forest search, and candidates are ranked by negated log loss -- the
# metric this notebook reports on the held-out set -- instead of the default
# classifier accuracy.
xgb_cv_model = GridSearchCV(
    xgb, xgb_params, cv=5, scoring="neg_log_loss", n_jobs=-1, verbose=2
).fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the XGBoost grid search.
xgb_cv_model.best_params_

In [None]:
# Refit XGBoost with the parameters the grid search actually selected, read
# from best_params_ so the tuned model cannot drift out of sync with the search.
# XGBClassifier has no `verbose` constructor argument; its logging level is
# controlled by `verbosity` (1 = warnings, the library default).
xgb_tuned = XGBClassifier(
    **xgb_cv_model.best_params_, n_jobs=-1, verbosity=1
).fit(X_train, y_train)

In [None]:
# Class-probability predictions of the tuned XGBoost model on the test set.
y_pred = xgb_tuned.predict_proba(X_test)

# Measure the error once and report it. Passing the fitted class labels keeps
# the probability columns aligned with the labels even when some class does
# not occur in y_test (log_loss would otherwise reject the shape mismatch).
xgb_log_loss = log_loss(y_test, y_pred, labels=xgb_tuned.classes_)
print(xgb_log_loss)