In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier, plot_tree
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so deprecation notices raised by the
# libraries used below are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-nov-2021/train.csv",
        "../input/tabular-playground-series-nov-2021/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Extract the samples: 10% of the rows for training, 5% for validation, 5% for testing.
# Each draw is taken from the rows that are still unused, so the three samples are
# disjoint. Sampling all three independently from `train` lets the same rows appear
# in the training sample and in the validation/test samples, which inflates the
# AUC scores computed on them.
train_sample = train.sample(frac=0.1, random_state=42)

_holdout_pool = train.drop(index=train_sample.index)
_holdout_size = round(0.05 * len(train))  # same row count as train.sample(frac=0.05)

validation_sample = _holdout_pool.sample(n=_holdout_size, random_state=50)
test_sample = (
    _holdout_pool
    .drop(index=validation_sample.index)
    .sample(n=_holdout_size, random_state=60)
)

In [None]:
# Count how often each id occurs in the training sample.
train_sample.loc[:, 'id'].value_counts()

**The `id` column is just a unique row identifier, so it is better to drop it.**

In [None]:
# Drop the id column.
# The frame is rebound (no inplace=True) and errors='ignore' is used so that
# re-running this cell is a no-op instead of raising a KeyError.
train_sample = train_sample.drop(columns='id', errors='ignore')

In [None]:
# Preview the first five rows of the training sample.
train_sample.head(n=5)

In [None]:
# Overlay the class-0 (red) and class-1 (green) distribution of every feature,
# one feature per cell of a 10x10 grid of subplots.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm the
# installed version before upgrading the environment.
plt.figure(figsize=(25, 25))
features = train_sample.columns.drop('target')
for position, feature in enumerate(features, start=1):
    ax = plt.subplot(10, 10, position)
    values = train_sample[feature]
    sns.distplot(values[train_sample['target'] == 0], color='red', label='class 0', ax=ax)
    sns.distplot(values[train_sample['target'] == 1], color='green', label='class 1', ax=ax)
    ax.legend()
plt.show()

In [None]:
# Scale the data
# Split each sample into a feature matrix (X_*) and a label vector (y_*).
# `id` is removed from the training features here as well (errors='ignore' because
# an earlier cell may already have dropped it), so the three feature matrices have
# the same columns no matter which earlier cells were executed.
X_train = train_sample.drop(columns='target').drop(columns='id', errors='ignore')
y_train = train_sample['target']

X_val = validation_sample.drop(['id','target'], axis=1)
y_val = validation_sample['target']

X_test = test_sample.drop(['id','target'], axis=1)
y_test = test_sample['target']

# Fit the scaler on the training features only, then apply the same fitted
# transformation to the validation and test features.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_val_scaled = ss.transform(X_val)
X_test_scaled = ss.transform(X_test)

In [None]:
# Recursive feature elimination driven by an XGBoost classifier.
# n_features_to_select is left at its default; verbose=5 prints progress while fitting.
rfe = RFE(XGBClassifier(), verbose=5)
rfe.fit(X_train_scaled, y_train)

In [None]:
# Boolean mask over the input features: True where RFE kept the feature.
rfe.get_support(indices=False)

In [None]:
# Translate the RFE boolean mask into the names of the retained columns.
selected_features = X_train.columns[rfe.support_]
print(selected_features)

In [None]:
# Report how many features survived the elimination.
print("Total number of features selected are", selected_features.size)

**With its default setting (keep half of the features), RFE selects the top 50 features.**

In [None]:
# Prepare the final dataset: keep only the RFE-selected columns and standardise them.
# A fresh scaler is fitted on the selected training columns and then reused,
# unchanged, for the validation and test columns.
ss = StandardScaler()

train_final = ss.fit_transform(X_train[selected_features])
val_final = ss.transform(X_val[selected_features])
test_final = ss.transform(X_test[selected_features])

In [None]:
# Linear baseline: logistic regression with the liblinear solver.
lr = LogisticRegression(solver='liblinear').fit(train_final, y_train)
lr

In [None]:
# Validation results (logistic regression).
# Column 1 of predict_proba is used as the score passed to the AUC metric.
val_predictions = lr.predict_proba(val_final)
val_auc_score = roc_auc_score(y_true=y_val, y_score=val_predictions[:, 1])
print("Validation AUC score {}".format(val_auc_score))

In [None]:
# Test results (logistic regression) on the held-out sample taken from train.csv.
# Column 1 of predict_proba is used as the score passed to the AUC metric.
test_predictions = lr.predict_proba(test_final)
test_auc_score = roc_auc_score(y_true=y_test, y_score=test_predictions[:, 1])
print("Testing AUC score {}".format(test_auc_score))

In [None]:
# Building a bagging-style model: Extra-Trees, an ensemble of randomized trees
# whose predictions are averaged.
# random_state pins the tree randomization so that the AUC scores below, the
# model comparison and the final submission are reproducible across runs.
ext = ExtraTreesClassifier(random_state=42)
ext.fit(train_final, y_train)

In [None]:
# Validation results (Extra-Trees).
# Column 1 of predict_proba is used as the score passed to the AUC metric.
val_predictions = ext.predict_proba(val_final)
val_auc_score = roc_auc_score(y_true=y_val, y_score=val_predictions[:, 1])
print("Validation AUC score {}".format(val_auc_score))

In [None]:
# Test results (Extra-Trees) on the held-out sample taken from train.csv.
# Column 1 of predict_proba is used as the score passed to the AUC metric.
test_predictions = ext.predict_proba(test_final)
test_auc_score = roc_auc_score(y_true=y_test, y_score=test_predictions[:, 1])
print("Testing AUC score {}".format(test_auc_score))

In [None]:
# Boosting model: XGBoost classifier with its default hyper-parameters.
xgb = XGBClassifier().fit(train_final, y_train)
xgb

In [None]:
# Validation results (XGBoost).
# Column 1 of predict_proba is used as the score passed to the AUC metric.
val_predictions = xgb.predict_proba(val_final)
val_auc_score = roc_auc_score(y_true=y_val, y_score=val_predictions[:, 1])
print("Validation AUC score {}".format(val_auc_score))

In [None]:
# Test results (XGBoost) on the held-out sample taken from train.csv.
# Column 1 of predict_proba is used as the score passed to the AUC metric.
test_predictions = xgb.predict_proba(test_final)
test_auc_score = roc_auc_score(y_true=y_test, y_score=test_predictions[:, 1])
print("Testing AUC score {}".format(test_auc_score))

**The bagging model (`ExtraTreesClassifier`) seems to perform best, judging by the validation and test AUC scores above.**

# Train on the Entire Data and Make Submissions

In [None]:
# Train the model on every row of train.csv.
# The selected columns are scaled with the already-fitted `ss` (transform only,
# no refit) and the existing `ext` estimator is fitted again on the result.
target = train['target']
final_data = ss.transform(train[selected_features])
ext.fit(X=final_data, y=target)

In [None]:
# Make Predictions
# Scale the competition test set with the already-fitted scaler.
# NOTE: this rebinds `test_final`, which earlier cells use for the held-out sample,
# so the evaluation cells above must not be re-run after this one.
test_final = ss.transform(test[selected_features])
test_id = test['id']

# Column 1 of predict_proba is written out as the predicted `target`.
ext_preds = ext.predict_proba(test_final)

# Build the submission frame column by column so that `id` keeps an integer dtype
# throughout; stacking ids and probabilities in one NumPy array would force the
# ids through float64 first.
submission_df = pd.DataFrame({
    'id': test_id.values.astype('int'),
    'target': ext_preds[:, 1],
})

# Convert to CSV
submission_df.to_csv('./ext_submissions.csv', index=False)

In [None]:
# Preview the first five rows of the submission file.
submission_df.head(n=5)