In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from sklearn.compose import ColumnTransformer

from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from xgboost import plot_importance
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Never truncate columns when a frame is rendered, so previews show every column.
pd.set_option('display.max_columns', None)

# Read the training CSV of the competition data set and preview its first rows.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-nov-2021/train.csv")
train_df.head()

In [None]:
# (number of rows, number of columns) of the raw training frame.
train_df.shape

In [None]:
# Bar chart of how many rows fall into each value of the `target` column.
ax = sns.countplot(x="target", data=train_df)

In [None]:
# Class balance: number of rows labelled 0 per row labelled 1.
# Summing the boolean masks counts the matching rows directly, and avoids
# binding a variable called `filter`, which would shadow the Python builtin.
positive = (train_df['target'] == 1).sum()
negative = (train_df['target'] == 0).sum()
negative / positive

In [None]:
y = train_df.pop('target')

In [None]:
features_to_drop = ['id']

In [None]:
train_df.columns[train_df.isna().any()].tolist()

In [None]:
X = train_df.drop(columns=features_to_drop)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric feature columns.
X.describe()

In [None]:
# One histogram per feature column, to inspect the shape of each distribution.
# The trailing semicolon stops the notebook from echoing the returned array of Axes.
X.hist(figsize=(16, 20), bins=100, xlabelsize=8, ylabelsize=8);

In [None]:
# Manual grouping of the feature columns; each group is handled differently below.
bimodal_features = ['f1','f3','f5','f6','f7','f8','f10','f11','f13','f14','f15','f17','f18','f22','f25','f26','f29','f34','f37','f38','f40','f41','f43','f45','f47','f50','f54','f55','f57','f65','f66','f67','f70','f71','f74','f77','f80','f82','f85','f86','f91','f96','f97']
normal_features = ['f2','f12','f21','f27','f28','f36','f39','f44','f52','f53','f73','f87','f95']
additional_features_to_drop = ['f9','f16','f19','f20','f23','f31','f42','f46','f48','f49','f51','f58','f59','f64','f69','f75','f76','f78','f84','f88','f89','f90','f92','f93','f94','f98','f99']

# Every column of X that is not named in one of the three lists above.
# np.setdiff1d returns the leftover names as a sorted array of unique values.
other_features = np.setdiff1d(
    X.columns,
    bimodal_features + normal_features + additional_features_to_drop,
)
other_features

In [None]:
# Discard the columns listed in `additional_features_to_drop`.
# X is reassigned here, so running this cell a second time raises KeyError
# (the columns are already gone).
X = X.drop(additional_features_to_drop, axis="columns")

In [None]:
# Absolute pairwise correlation between the remaining feature columns.
corr = X.corr().abs()
threshold = 0.5

# Blank out (NaN) every cell below the threshold so only the strong correlations are drawn.
filtered_corr = corr.where(corr >= threshold)

fig, heatmap_ax = plt.subplots(figsize=(100, 100))
sns.heatmap(filtered_corr, annot=True, cmap="Reds", ax=heatmap_ax)
plt.show()

In [None]:
# Preprocessing + model definition: each feature group gets its own transformer,
# and the transformed columns feed a logistic regression.

# Features in `bimodal_features`: map onto a normal distribution via their quantiles.
# QuantileTransformer randomly subsamples inputs larger than its `subsample` limit
# when estimating the quantiles, so the seed is pinned to keep the fit reproducible.
bimodal_transformer = Pipeline(steps=[
        ('qt', QuantileTransformer(n_quantiles=10, output_distribution="normal", random_state=0))
    ])

# Features in `normal_features`: rescale to [0, 1].
# A StandardScaler in front of MinMaxScaler has no effect on the result, because
# min-max scaling is invariant to the shift/scale that standardization applies,
# so only the MinMaxScaler is kept.
normal_transformer = Pipeline(steps=[
        ('normalization', MinMaxScaler())
    ])

# Features in `other_features`: Yeo-Johnson power transform.
other_transformer = Pipeline(steps=[
        ('pt', PowerTransformer(method="yeo-johnson"))
    ])

# Route each column list to its transformer. Columns that appear in none of the
# lists are dropped, which is ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer(
    transformers = [
        ("bimodal_transformer", bimodal_transformer, bimodal_features),
        ("normal_transformer", normal_transformer, normal_features),
        ("other_transformer", other_transformer, other_features),
    ]
)

# max_iter is raised above the library default to give the solver room to converge.
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=2000))

In [None]:
# Show estimators as an HTML diagram when they are displayed in the notebook.
set_config(display="diagram")

In [None]:
# Stratified 90/10 split: 90% of the rows are used for fitting and 10% are held out,
# with the class proportions of `y` preserved in both parts. The seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    stratify=y,
    random_state=0,
)

In [None]:
# Fit the preprocessing steps and the logistic regression on the training part of the split.
pipe.fit(X_train, y_train)

In [None]:
# Score of the fitted pipeline on the held-out part of the split
# (for a classifier, `score` is the mean accuracy).
holdout_score = pipe.score(X_test, y_test)
print("model score: %.3f" % holdout_score)

In [None]:
# Confusion matrix of the fitted pipeline on the held-out part of the split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)
# and update the import at the top of the notebook accordingly.
plot_confusion_matrix(pipe, X_test, y_test)

In [None]:
# Read the test CSV of the competition data set (the rows to predict) and display it.
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-nov-2021/test.csv")
test_df

In [None]:
X_test = test_df.drop(columns=features_to_drop)
X_test = X_test.drop(columns=additional_features_to_drop)

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
# Submission frame: the test ids next to the predicted labels.
output = test_df[['id']].assign(target=y_pred)
print(output.iloc[:20])
output.to_csv('logistic_regression_submission.csv', index=False)