In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

<h1 style='color:white;background-color:black' > Table of Contents </h1>

* [Introduction](#introduction)
* [Exploratory Data Analysis (EDA)](#eda)
    - [Data Acquisition](#data_acquisition)
    - [Data Understanding](#data_understanding)
    - [Data Correlation](#data_correlation)
* [Data Splitting](#data_splitting)
* [Model](#model)
    - [Hyperparameters Tuning](#hyper-parameters_tuning)
    - [LGBM Model](#lgbm_model)
    - [RandomizedSearchCV](#randomsearchcv)
* [Evaluation](#evaluation)
* [Make Submission](#make_submission)

<a id="introduction"></a>
## Introduction

The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with predicting the biological response of molecules given various chemical properties. Although the features are anonymized, they have properties relating to real-world features.

Submissions are evaluated on area under the **ROC Curve** between the predicted probability and target.

<a id="eda"></a>
## 2. EDA

<a id="data_acquisition"></a>
### 2.1 Data Acquisition

#### Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datatable as dt

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold

# Importing modelling packages
from lightgbm import LGBMClassifier


# Removes warning
import warnings
warnings.filterwarnings('ignore')

#### Load the Dataset

In [None]:
# Using datatable for faster loading: fread parses each CSV and the result is
# converted straight to a pandas DataFrame so no datatable frame is kept around.

train_df = dt.fread(r'/kaggle/input/tabular-playground-series-oct-2021/train.csv').to_pandas()
test_df = dt.fread(r'/kaggle/input/tabular-playground-series-oct-2021/test.csv').to_pandas()

# Reached only if both files were read without raising.
print("Data is loaded")

## Memory Reduction

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    * Signed-integer columns are converted to the narrowest of
      int8/int16/int32/int64 whose range contains the column's minimum and
      maximum (limits inclusive, so a column spanning exactly -128..127
      becomes int8).
    * Float columns wider than 32 bits are converted to float32 when their
      minimum and maximum fit the float32 range. This trades precision for
      memory, exactly as a plain ``astype(np.float32)`` does.
    * Every other column (object, bool, unsigned integer, datetime,
      categorical, pandas extension dtypes) is left untouched, and a column
      is never widened.

    A one-line before/after memory summary is printed.

    Args:
        df: pandas DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    # Ordered narrowest first so the first fitting type is the smallest one.
    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int ('i') and float ('f') columns are safe to
        # compare against iinfo/finfo limits; anything else would either raise
        # on the comparison or be silently converted to a float.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False, and the column is left as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            # Columns containing +/-inf (or only NaN) fail this test and keep
            # their original width.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a frame that reports zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, saved_pct))
    return df

In [None]:
# Shrink both frames (train first, then test) and rebind the names to the results.
train_df, test_df = (reduce_mem_usage(frame) for frame in (train_df, test_df))

<a id="data_understanding"></a>
### 2.2 Data Understanding

**Data Size**
* Train dataset has 1000000 rows and 286 features, which include the target variable.
* Test dataset has 5000000 rows and 286 features, which do **not** include the target variable.

**Missing Values**
* No missing values were found in the train or test datasets.

**Total Features**
* There are 240 features of type float and 46 of type integer.

**Target Variable**
* The target is binary (1 or 0).
* The target distribution is considered balanced, so there is no need to apply oversampling or undersampling.

In [None]:
# Preview the first rows of the training frame (shown as the cell output).
train_df.head()

In [None]:
# Dtype of every column in the training frame.
train_df.dtypes

In [None]:
# Summary statistics of the training frame's columns.
train_df.describe()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Total number of missing cells in each frame (summed over columns, then overall).
missing_train = train_df.isna().sum().sum()
print('Total missing value in train dataset is:', missing_train)

missing_test = test_df.isna().sum().sum()
print('Total missing value in test dataset is:', missing_test)

In [None]:
# Print the structural summary (columns, dtypes, memory) of the training frame.
train_df.info()

In [None]:
# Print the structural summary (columns, dtypes, memory) of the test frame.
test_df.info()

In [None]:
# Number of training columns per dtype.
print(train_df.dtypes.value_counts())

In [None]:
# Row count per target value, used to judge class balance.
train_df['target'].value_counts()

<a id="data_splitting"></a>
## 3. Data Splitting

In [None]:
# Separate the label column from the remaining columns.
# NOTE(review): every non-target column is kept as a feature. If train_df has an
# 'id' column like test_df does, it is fed to the model as well - confirm intended.
y = train_df['target']
X = train_df.drop(columns='target')

# The combined frame is no longer needed; drop the name to free some memory.
del train_df

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

<a id="model"></a>
## 4. Model

<a id="hyper-parameters_tuning"></a>
### 4.1 Hyper-Parameters Tuning

In [None]:
# Keyword arguments passed through to the estimator's fit() when the search is fitted.
# NOTE(review): (X_test, y_test) is one of the early-stopping evaluation sets and the
# same split is scored again in the Evaluation section, so that hold-out AUC is
# likely optimistic - consider a separate validation split for early stopping.
# NOTE(review): newer LightGBM releases take early stopping / logging via `callbacks`
# rather than these fit() keywords - confirm against the installed version.
fit_params = {
    'early_stopping_rounds': 200,
    'eval_set': [(X_train, y_train), (X_test, y_test)],
    'eval_metric': 'auc',
    'verbose': 200,
}

# Candidate values per hyperparameter from which the randomized search samples.
rs_params = {
    'learning_rate': [0.05],
    'reg_lambda': [0, 20],
    'n_estimators': [5000],
    'max_depth': [7, 10],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [20, 40],
}

<a id="lgbm_model"></a>
### 4.2 LGBM Model

In [None]:
# Base classifier: only the seed is fixed here; all other settings are library defaults.
lgb = LGBMClassifier(random_state = 42)

<a id="randomsearchcv"></a>
### 4.3 RandomizedSearchCV

In [None]:
# Randomized search over rs_params, scored by ROC AUC with 2-fold cross-validation.
# n_iter = 1 means a single parameter combination is sampled and evaluated;
# random_state fixes which one.
rs_lgb = RandomizedSearchCV(estimator = lgb, 
                            param_distributions = rs_params,
                            scoring = 'roc_auc', 
                            cv = 2,
                            n_iter = 1,
                            random_state = 34)

In [None]:
# Train the model with given hyperparameters and train data
rs_lgb.fit(X_train, y_train, **fit_params)

In [None]:
# best_score_ is the search's best cross-validated score; with scoring = 'roc_auc'
# this is an AUC, not an accuracy, despite the variable name.
best_accuracy = rs_lgb.best_score_
print('Best AUC score in train data: {:.2f} %'.format(best_accuracy*100))

In [None]:
# Parameter combination selected by the search.
best_params = rs_lgb.best_params_
print('Best params for the model are:', best_params)

In [None]:
# Estimator exposed by the search for the best parameter combination; it is used
# for all predictions below. The bare name displays it as the cell output.
final_model = rs_lgb.best_estimator_
final_model

<a id="evaluation"></a>
## 5. Evaluation

In [None]:
# predict_proba returns one probability column per class; keep only the last
# column, i.e. the probability of the last class (class 1 for a 0/1 target).
pred_lgbm = final_model.predict_proba(X_test)[:, -1]

In [None]:
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, pred_lgbm)
# Plot ROC curve
# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
# The model's curve on the held-out split.
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

In [None]:
# ROC AUC of the predicted probabilities on the held-out 30% split
# ("test data" here means X_test, not the competition test file).
test_score = roc_auc_score(y_test,pred_lgbm)
print('AUC score for test data: {:.2f} %'.format(test_score*100))

<a id="make_submission"></a>
## 6. Make Submission

In [None]:
# Last-class probability for every row of the competition test frame.
# test_df is passed with all of its columns, as loaded.
y_pred = final_model.predict_proba(test_df)[:, -1]
y_pred

In [None]:
# Build the submission: the test ids alongside the predicted probabilities.
submission = test_df[['id']].assign(target=y_pred)

# Write the file without the index column.
submission.to_csv('submission.csv', index=False)
print("predictions successfully submitted")

In [None]:
# Display the submission frame for a visual check.
submission