In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
import seaborn as sns

In [None]:
# Read the three competition CSVs in one pass: the labelled training set,
# the test set, and the sample submission file.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

## EDA

In [None]:
# Per-column summary statistics of the training frame; shown as the cell's output.
train_df.describe()

In [None]:
# Count the missing values in every column, then summarise how those
# per-column counts are distributed.
missing_per_column = train_df.isna().sum()
missing_per_column.describe()

In [None]:
# Features are every column except the first and the last; the last column is
# used as the target.
# NOTE(review): assumes column 0 is a row identifier and the final column is the label - confirm.
X = train_df.iloc[:, 1:-1]
y = train_df.iloc[:, -1]
print(X.shape, y.shape)

# Hold out part of the labelled data for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Sweep the number of selected features k = 1..99. For each k, fit an
# impute -> scale -> SelectKBest(k) -> logistic regression pipeline on the
# training split and record its score on the held-out split as [k, score].
results = []
for n_params in range(1, 100):
    pipe = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler(),
        SelectKBest(f_classif, k=n_params),
        LogisticRegression(random_state=33),
    ).fit(X_train, y_train)  # fit() returns the fitted pipeline itself
    score = pipe.score(X_test, y_test)
    results.append([n_params, score])

In [None]:
# Transpose the [k, score] pairs into two parallel sequences and plot
# held-out score against the number of selected features.
k_values, k_scores = zip(*results)
sns.scatterplot(x=list(k_values), y=list(k_scores))

In [None]:
# Evaluate a neural-network classifier behind the same preprocessing stack
# (mean imputation -> standardisation -> top-k univariate feature selection).
# NOTE(review): k=60 was presumably read off the logistic-regression sweep plot - confirm.
n_params = 60
mpipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    SelectKBest(f_classif, k=n_params),
    MLPClassifier(random_state=33, early_stopping=True)
)
# Fit and score the MLP pipeline built above. Calling `pipe` here instead would
# retrain whatever pipeline an earlier cell left behind and report that model's
# score under this label, leaving the MLP untrained.
mpipe.fit(X_train, y_train)
print(f"score for {n_params} best params: {mpipe.score(X_test, y_test)}")

In [None]:
# Gradient-boosted trees behind the same impute -> scale -> select-k stack:
# many shallow trees combined with a very small learning rate.
n_params = 60
gbc = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.001,
    max_depth=4,
    random_state=33,
)
pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    SelectKBest(f_classif, k=n_params),
    gbc,
).fit(X_train, y_train)  # fit() returns the fitted pipeline itself
print(f"score for {n_params} best params: {pipe.score(X_test, y_test)}")