In [None]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif

In [None]:
# Read the competition's training and test tables from the Kaggle input
# directory; the first path feeds df_train, the second feeds df_test.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
    )
)

In [None]:
# Column positions in the training table: the first column holds the row ids
# and the last column holds the dependent variable; everything in between is
# treated as a feature.
first_feature_pos, target_pos = 1, -1

# Feature matrix: every row, every column between the ids and the target.
X_train = df_train.iloc[:, first_feature_pos:target_pos].copy()

# Target vector: every row, dependent-variable column only.
y_train = df_train.iloc[:, target_pos].copy()

# Test features: a new frame without the 'id' column. df_test itself is left
# untouched, so its ids remain available afterwards.
X_test = df_test.drop(columns='id')

# Freeing the raw frames is left disabled; df_test is still read later for
# the submission ids.
#del df_train,df_test

### Checking the data distribution
I took a random sample (10 features) to look at how the data is distributed. 
Why does this matter?
Since we are trying out Naive Bayes on this problem, the distribution of the features matters.
Gaussian Naive Bayes gives its best results when the data is normally distributed. Since the plots show that
the data is not normally distributed, we will need to apply some normalization first.

A better approach can be seen in this notebook: https://www.kaggle.com/raahulsaxena/tps-nov-21-extracting-the-power-of-naive-bayes
It tries all the candidate normalizations {none, MinMax, Standard, Robust, Quantile} with pipelines and compares them. This is more reliable, because 10 randomly plotted features might not be a good representation of the data on every run.

In [None]:
# Disabled exploratory plot: histograms of 10 randomly picked columns of
# X_train, laid out on a 5x2 grid.
# To re-enable, uncomment the lines below and first import matplotlib.pyplot
# as plt, seaborn as sns, and random -- none of the visible import lines
# bring those names into scope.
# NOTE(review): random.randint(0,100) includes both endpoints, so it can
# return 100; that position is out of range if X_train has 100 or fewer
# columns -- confirm the column count before re-enabling.
# plt.figure()
# fig, ax = plt.subplots(5, 2,figsize=(20, 22))
# for i in range(1, 11):
#     plt.subplot(5, 2,i)
#     sns.histplot(data=X_train.iloc[:,random.randint(0,100)])
# plt.show()

In [None]:
# Hold-out sweep: how does ROC AUC change with the share of features kept by
# the univariate ANOVA F-test filter (10%, 20%, ..., 100%)?

# The split is seeded, so it is the same for every candidate percentile.
# Compute it once here instead of rebuilding it on every loop iteration.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, test_size=0.33, random_state=42)

percentile = list(range(10, 101, 10))
for p in percentile:
    pipe = Pipeline(
        steps=[
            # QuantileTransformer estimates its quantiles from a random
            # subsample when the input is large; seeding it keeps the
            # printed scores repeatable from run to run.
            ('scaler', QuantileTransformer(random_state=42)),
            ('feature_selection', SelectPercentile(score_func=f_classif, percentile=p)),
            ('nb', GaussianNB()),
        ])
    pipe.fit(X_train_split, y_train_split)
    # Column 1 of predict_proba is the probability of the second class in
    # pipe.classes_, which is what roc_auc_score is given as the score.
    print(f"roc_auc_score with {p}% of the features: ",roc_auc_score(y_test_split,pipe.predict_proba(X_test_split)[:,1]))


In [None]:
# Final model: refit the same three-step pipeline on the full training set
# and score the test features.
# NOTE(review): percentile=90 is presumably the best setting from the
# hold-out percentile sweep -- confirm against its printed scores.
pipe = Pipeline(
        steps =[
                # Seeded so the transformer's random subsampling (and therefore
                # the submitted predictions) is reproducible across runs.
                ('scaler', QuantileTransformer(random_state=42)),
                ('feature_selection', SelectPercentile(score_func = f_classif, percentile=90)),
                ('nb', GaussianNB())])
pipe.fit(X_train,y_train)
# Keep only column 1 of predict_proba: the probability of the second class
# in pipe.classes_.
y_pred = pipe.predict_proba(X_test)[:,1]

In [None]:
# Pair each test id with its predicted probability and write the result to
# disk without the DataFrame index.
submission_columns = {'id': df_test['id'], 'target': y_pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("submission saved")