In [None]:
!pip install -U kaggle xgboost lightgbm catboost imbalanced-learn scikit-learn scikit-plot sweetviz

In [None]:
import pandas as pd

# Read the training CSV, using its `id` column as the row index.
df = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/train.csv',
    index_col='id',
)

# Print the dimensions of the loaded frame.
print(df.shape)


In [None]:
# Share of rows per target value: per-value counts divided by the total row count.
df['target'].value_counts() / len(df)

In [None]:
import numpy as np

# Row-wise mean over the numeric *feature* columns only:
#   * `target` is excluded so the label does not leak into the feature
#     (the test set has no `target`, so including it would also make the
#     train-time and test-time features inconsistent);
#   * an existing `mean` column is excluded so re-running this cell gives the
#     same result instead of averaging the previous mean back in;
#   * object-dtype columns are excluded explicitly, since averaging them is
#     meaningless and raises on recent pandas versions.
mean_source_cols = (
    df.select_dtypes(include='number')
    .columns
    .drop(['target', 'mean'], errors='ignore')
)
df['mean'] = df[mean_source_cols].mean(axis=1)

        

In [None]:
# Display pandas' summary statistics for `df` as this cell's output.
df.describe()

In [None]:
import numpy as np
import pandas as pd

from sklearn import compose as comp
from sklearn import preprocessing as prep
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
from sklearn.feature_selection import SelectKBest, mutual_info_classif

from imblearn.pipeline import Pipeline
from imblearn import over_sampling as over
from imblearn import under_sampling as under


# Numeric branch: fill missing values with the constant -1.
# The optional ('normalize', prep.Normalizer()) step remains disabled.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value=-1, strategy='constant')),
])

# Categorical branch:
#   1. replace NaN with the literal category 'missing',
#   2. replace None with the same 'missing' category,
#   3. map every category to an integer code.
# OrdinalEncoder only supports handle_unknown='error' or 'use_encoded_value'
# ('ignore' belongs to OneHotEncoder), so categories that were not seen during
# fit are explicitly encoded as -1, a code no fitted category can receive.
categorical_transformer_ord = Pipeline(steps=[
    ('imputer_nan', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')),
    ('imputer_none', SimpleImputer(missing_values=None, strategy='constant', fill_value='missing')),
    ('ordinal', prep.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Send every non-object column through the numeric branch and every
# object-dtype column through the ordinal-encoding branch.
preprocessor_ord = comp.ColumnTransformer([
    ('num', numeric_transformer, selector(dtype_exclude="object")),
    ('cat', categorical_transformer_ord, selector(dtype_include="object")),
])

# Split the frame into the label series and the feature columns.
y = df['target']
X = df.drop(columns=['target'])

# Fit the preprocessing on the feature frame and transform it in the same call.
X_ord = preprocessor_ord.fit_transform(X)

In [None]:
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_ord,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# LightGBM classifier with balanced class weights; fit on the training split.
model = LGBMClassifier(class_weight='balanced', n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Classification report on the validation split, based on hard class predictions.
val_labels = model.predict(X_val)
print(classification_report(y_val, val_labels))

# ROC AUC from the column-1 class probabilities: validation split first, then training split.
for features, labels in ((X_val, y_val), (X_train, y_train)):
    positive_proba = model.predict_proba(features)[:, 1]
    print(roc_auc_score(labels, positive_proba, average="micro"))

In [None]:
import numpy as np

def preprocess(df, exclude=('target', 'mean')):
    """Add a row-wise ``mean`` feature built from the numeric feature columns.

    Only numeric columns take part in the average; object-dtype columns are
    skipped explicitly because averaging them is meaningless and raises on
    recent pandas versions. Columns named in ``exclude`` are skipped as well,
    which keeps the label out of the feature and makes repeated calls return
    the same result (a previously added ``mean`` is not averaged back in).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment. It is modified in place.
    exclude : iterable of str, optional
        Column names that must not contribute to the mean. Names that are not
        present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``mean`` column added or overwritten.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    # errors='ignore' lets the same call serve frames with or without these columns.
    feature_cols = numeric_cols.drop(list(exclude), errors='ignore')
    df['mean'] = df[feature_cols].mean(axis=1)
    return df

In [None]:
# Score the test CSV with the fitted model and write the submission file.
df_test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

# `id` is dropped before feature engineering so it is not treated as a feature;
# the remaining columns go through the same `preprocess` step and the already
# fitted `preprocessor_ord` (transform only, no re-fit).
X_test = df_test.drop(['id'], axis=1)
X_test = preprocess(X_test)
X_test = preprocessor_ord.transform(X_test)

# Column 1 of predict_proba is used as the submitted score.
df_test['target'] = model.predict_proba(X_test)[:,1]
sub = df_test[['id', 'target']]
sub.to_csv('submission.csv', index=False)

# Preview the submission. As the last expression of the cell it is rendered;
# a bare `sub.head()` in the middle of the cell produced no output.
sub.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# The model was trained on the ColumnTransformer output, whose columns are
# ordered numeric-first, then object-dtype, which is not X's original column
# order. Rebuild the names in that transformed order so that every importance
# is paired with the feature it actually belongs to.
feature_names = (
    list(X.select_dtypes(exclude='object').columns)
    + list(X.select_dtypes(include='object').columns)
)
# zip() would silently truncate on a length mismatch and mislabel the plot.
assert len(feature_names) == len(model.feature_importances_), \
    "feature names do not line up with the model's importances"

# Keep the ten largest importances (ascending sort, take the tail).
top_features = sorted(zip(model.feature_importances_, feature_names))[-10:]
feature_imp = pd.DataFrame(top_features, columns=['Value', 'Feature'])

plt.figure(figsize=(15, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# Single fitted model, no cross-validation folds are involved.
plt.title('LightGBM Features (top 10 by importance)')

plt.show()

In [None]:
import scikitplot as skplt
import matplotlib.pyplot as plt

# Class-probability predictions for the validation split.
y_probas = model.predict_proba(X_val)

# `plot_roc_curve` is deprecated in scikit-plot; `plot_roc` is its replacement
# and takes the same (y_true, y_probas) arguments.
skplt.metrics.plot_roc(y_val, y_probas)
plt.show()

In [None]:
import sweetviz as sv

# Re-read both CSVs with `id` as the index so the comparison runs on the files
# as stored, without the columns added to the frames earlier in the notebook.
df = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/train.csv',
    index_col='id',
)
df_test = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/test.csv',
    index_col='id',
)

# Build the sweetviz train-vs-test comparison and render it inline.
train_source = [df, "Training Data"]
test_source = [df_test, "Test Data"]
report = sv.compare(train_source, test_source)
report.show_notebook()