In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

## Data Exploration

In [None]:
# Load the competition training set into `train` and preview three random rows.
# No random_state is passed to sample(), so the preview differs between runs.
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
train.sample(3)

In [None]:
# Load the competition test set into `test` and preview three random rows.
# No random_state is passed to sample(), so the preview differs between runs.
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
test.sample(3)

#### Train DataFrame information

In [None]:
# Print pandas' structural summary of the training frame.
train.info()

#### DataFrame statistics

In [None]:
# Summary statistics of `train` with the quartile rows ('25%', '50%', '75%') removed.
# `describeTable` is read again further down to plot the per-feature means.
describeTable = train.describe().drop(index=['25%', '50%', '75%'])
describeTable

#### Look for missing values

In [None]:
# Line plot of the per-column count of missing values; the trailing ';' hides the Axes repr.
train.isna().sum().plot(kind='line',color='black', figsize=(27,5));

In [None]:
# Total number of missing cells across the whole training frame.
missing_total = int(train.isna().sum().sum())

# Always report the outcome: the original printed nothing when gaps existed,
# which is indistinguishable from the cell not having been run.
if missing_total == 0:
    print('No Missing Values Found in data frame')
else:
    print(missing_total, 'missing values found in data frame')

#### Observe the distributions of all features (and target)

In [None]:
# Histogram (60 bins) of every column except `id`, i.e. all features plus the target.
train.drop(columns=['id']).hist(figsize=(27,15), bins=60, grid=False, color='black');

#### Observe the means of all features 

In [None]:
# Per-feature means: the 'mean' row of `describeTable`, with `id` and `target` excluded.
features_means = describeTable.drop(columns = ['id', 'target']).T['mean']
# NOTE(review): the pyplot calls below run BEFORE the pandas `.plot(...)` call; this
# appears to rely on pandas drawing onto the axes pyplot has just created, so the
# statement order should be kept as-is — confirm before reordering.
# One tick for every second feature position.
plt.xticks(np.arange(0,len(train.columns)-2, step=2))
plt.xlabel('Feature')
plt.ylabel('mean')
features_means.plot(color='black', figsize=(27,5));

##### Compared to the other features, it seems that feature number 2 and feature number 35 have an unusual distribution of values.
##### What will the plot look like if I ignore them?

In [None]:
# Per-feature means without f2 and f35, so the remaining means are readable on one scale.
# The 'mean' row is taken from the already computed `describeTable` instead of running
# `train.describe()` over the whole frame a second time; the values are identical.
features_means = describeTable.drop(columns = ['id', 'target', 'f2', 'f35']).loc['mean']
# NOTE(review): the pyplot calls run BEFORE the pandas `.plot(...)` call on purpose;
# the statement order of the original cell is preserved.
plt.xticks(np.arange(0,len(train.columns)-2, step=2))
plt.xlabel('Feature')
plt.ylabel('Mean')
features_means.plot(color='black', figsize=(27,5));

In [None]:
# Split the feature names into two lists by whether the feature mean reaches 0.5.
HI_means = features_means.index[features_means.to_numpy() >= 0.5].to_list()
LOW_means = features_means.index[features_means.to_numpy() < 0.5].to_list()

# Report the size of each group.
print(
    'There are',
    len(HI_means),
    'features with a mean >= 0.5 and',
    len(LOW_means),
    'features with a mean < 0.5',
)

#### Look for correlation between features

In [None]:
import plotly.io as pio

# Make the white theme plotly's default template.
pio.templates.default = "plotly_white"

# Pairwise correlations between all columns except `id` (features and target).
corr_df = train.drop(columns=['id']).corr()
# True on and above the diagonal; those cells are blanked so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_df, dtype=bool))

correlogramMap = go.Heatmap(
    z=corr_df.mask(mask),
    x=corr_df.columns,
    y=corr_df.columns,
    colorscale='RdBu',
    xgap=0.5,
    ygap=0.5,
    colorbar=dict(thickness=20, ticklen=3),
)

correlogramLayout = go.Layout(
    title=dict(text='Correlation Matrix', x=0.5),
    width=1750,
    height=1000,
    xaxis=dict(showgrid=False),
    # Reversed y axis puts the first column at the top, matching matrix orientation.
    yaxis=dict(showgrid=False, autorange='reversed'),
)

correlogram_fig = go.Figure(data=[correlogramMap], layout=correlogramLayout)
correlogram_fig.show()

Zoom in on the correlation of each feature with the target column

In [None]:
# Correlation of every feature with `target` (the target's self-correlation row is dropped).
TargetCorrel = corr_df[['target']].drop(index=['target'])

# go.Scatter replaces the deprecated go.Line graph object; with mode='lines+markers'
# it draws the same orange line with black markers.
TargetCorrel_fig = go.Figure(
    data=go.Scatter(
        x=TargetCorrel.index,
        y=TargetCorrel['target'],
        mode='lines+markers',
        line_color='orange',
        marker_color='black',
    )
)
# Title and axis labels so the figure can be read on its own.
TargetCorrel_fig.update_layout(
    plot_bgcolor="white",
    title_text='Correlation with target',
    title_x=0.5,
    xaxis_title='Feature',
    yaxis_title='Correlation',
)

TargetCorrel_fig.show()

### Negative vs. Positive values

In [None]:
# Work on the feature columns only; computed once instead of dropping the same
# columns for every mask below.
feature_values = train.drop(columns = ['id', 'target'])

# Total number of feature cells (rows * columns), the denominator of the percentages.
all_in_percent = feature_values.size

# Each frame keeps the cells matching the condition and NaN elsewhere, so
# `.count()` yields the number of matching cells per column. The masks are built
# from the feature frame itself rather than from the unfiltered `train`.
trainNegative = feature_values[feature_values < 0.0]
trainPositive = feature_values[feature_values > 0.0]
trainZero = feature_values[feature_values == 0.0]

print('There are\n',round(trainNegative.count().sum()* 100/all_in_percent,4), '% negative records,\n' ,
      round(trainPositive.count().sum()* 100/all_in_percent,4), '% positive records,\n',
      round(trainZero.count().sum()* 100/all_in_percent,4), '% zero-values records')

In [None]:
# Per-feature count of negative and positive values, drawn as two bar traces.
negVSposFig = go.Figure(
    data=[
        go.Bar(
            name='Negative Values',
            x=trainNegative.columns,
            y=trainNegative.count().values,
        ),
        go.Bar(
            name='Positive Values',
            x=trainPositive.columns,
            y=trainPositive.count().values,
        ),
    ]
)

negVSposFig.show()

# Data Preprocessing

Feature scaling - is it necessary?

In [None]:
from sklearn.preprocessing import StandardScaler

# Every column except the identifier and the label is a feature.
features = train.columns.drop(['id', 'target']).to_list()

# Fit the scaler on the feature columns and transform them in one step.
scaling = StandardScaler().fit_transform(train[features])

# Store the scaled values in a copy so `train` itself stays unscaled.
scaled_train = train.copy()
scaled_train[features] = scaling
scaled_train.head(3)

Feature reduction

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Candidate features: every column except the identifier and the label.
features = train.drop(columns=['id','target']).columns.to_list()

# Fit the variance filter (threshold 0.25) on the unscaled feature columns.
selector = VarianceThreshold(threshold  = 0.25)
selector.fit(X=train[features])

# get_support() is a boolean mask aligned with `features`; keep the names it marks.
selected_features = [name for name, keep in zip(features, selector.get_support()) if keep]

# The total is taken from `features` itself rather than a hard-coded column offset.
print(len(selected_features) ,
      'out of',
      len(features),
      'features were selected')

In [None]:
# Model inputs: the columns kept by the variance filter; model output: the `target` column.
X = train[selected_features]
y = train['target']

Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=12)

# Models 

In [None]:
from time import time
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

def ModelResults_df(model, X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test):
    """
    Fit a classifier and summarise its performance on a held-out set.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data the scores are computed on.
        The defaults are the notebook-level split and are bound when this
        cell is executed; re-run this cell after re-creating the split.

    Returns
    -------
    pandas.DataFrame
        A single-row report with the fit time in seconds, the accuracy,
        the ROC AUC (from the probability of class 1) and the recall.
    """

    # Time the fit only, so 'Training Time (sec)' does not include prediction.
    startTime = time()
    model.fit(X_train, y_train)
    endTime = time()

    # Hard labels feed accuracy/recall; ROC AUC needs the class-1 probability.
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)[:, 1]

    ReportDict = {
        'Training Time (sec)': np.round(endTime-startTime, 4),
        'Accuracy Score': round(accuracy_score(y_test, y_pred),5),
        'ROC AUC score': round(roc_auc_score(y_test, y_score),5),
        'Recall score': round(recall_score(y_test, y_pred),5),
    }

    return pd.DataFrame(ReportDict, index=[0])

from xgboost import XGBClassifier

# Gradient-boosted trees: 1000 estimators, each up to 10 levels deep.
xgb_model = XGBClassifier(n_estimators=1000, max_depth=10)

# Fit and score on the notebook's default train/test split, then show the report.
xbgClassifierReport = ModelResults_df(model = xgb_model)
xbgClassifierReport

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Widths of the ReLU hidden layers, in order from input to output.
hidden_units = (4, 16, 32, 16)

# Stack the hidden layers and finish with a single sigmoid unit for the binary target.
NNClassifier = Sequential(
    [Dense(units = width, activation = 'relu') for width in hidden_units]
    + [Dense(units = 1, activation = 'sigmoid')]
)
NNClassifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the training split only.
NNClassifier.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=15,
    verbose=1,
)

In [None]:
# Print the layer-by-layer summary of the trained network.
NNClassifier.summary()

In [None]:

# Score the network on the held-out split only. Evaluating on the full X, y would
# include the rows the model was trained on and overstate the accuracy.
_ , accuracy = NNClassifier.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

# Round the sigmoid outputs to hard 0/1 labels and flatten the (n, 1) prediction
# column to 1-D so it lines up with y_test in the metrics below.
y_pred = np.round(NNClassifier.predict(X_test)).ravel()

In [None]:
# Confusion matrix of the network's rounded predictions against the held-out labels.
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): presumably class 0 is regular mail and class 1 is spam — confirm
# against the dataset description; the labels are display-only.
labels = ['Mail','Spam']
ConfusionMatrixDisplay(cm, display_labels=labels).plot(cmap='Reds');

# Submission File

In [None]:
# Predict on the same columns the network was trained on. Selecting
# `selected_features` already excludes `id`, so no separate drop is needed.
# The (n, 1) network output is rounded to 0/1 and flattened to one value per row.
test['target'] = np.rint(NNClassifier.predict(test[selected_features])).astype('int').ravel()

In [None]:
# Preview the first rows of `test`, which now carries the predicted `target` column.
test.head()

In [None]:
# Number of test rows assigned to each predicted class.
test['target'].value_counts()

In [None]:
# Write the id/prediction pairs to the submission file, without the index column.
# to_csv returns None when it is given a path, so the result is not bound to a name.
test[['id', 'target']].to_csv('submission.csv' , index=False)