In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import cufflinks as cf

import plotly.express as px
%matplotlib inline


import os
import random
import time
from datetime import datetime
import gc
import warnings

from tqdm.notebook import tqdm

from sklearn.ensemble import RandomTreesEmbedding
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, QuantileTransformer

import tensorflow as tf
from keras import Sequential
from keras import backend as K
from keras.layers import Dense,Dropout,BatchNormalization,LeakyReLU,Activation
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
import tensorflow_addons as tfa

warnings.filterwarnings("ignore")


In [None]:
# Load the train and test tables (train first, then test) from the Kaggle
# input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-aug-2021/train.csv",
        "../input/tabular-playground-series-aug-2021/test.csv",
    )
)

**pie chart for target value**

1. The target `loss` is dominated by the value 0.
2. The classes are imbalanced.

If anyone has an idea for how to balance the dataset, please comment.

Note: I tried reducing the number of distinct values in `loss`, but the score increased.


In [None]:
# Number of rows (non-null `f0` entries) per distinct loss value.
# The column is selected *before* aggregating so that only one column is
# counted, instead of counting every column and discarding all but `f0`.
df = train.groupby("loss")["f0"].count().reset_index()
px.pie(df, values='f0', names='loss',
       title='share of loss values in dataset',
       color_discrete_sequence=px.colors.sequential.RdBu)



# There are some outliers in loss

The loss values 41 and 42 occur in extremely few rows of the dataset.




In [None]:
# Density histogram of the target with a KDE overlay.
# Only `binwidth` is given: seaborn lets `binwidth` override `bins`, so passing
# both (as an earlier version did with `bins=11`) is misleading and has no effect.
sns.displot(train['loss'], stat='density', binwidth=3, kde=True)





**Most of the values of `loss` lie between 0 and 12.**

Let's compare loss with f1.






In [None]:
# 2-D density heatmap with `loss` on x, `f0` on y and `f1` supplied as the
# z value, plus marginal histograms on both axes.
fig = px.density_heatmap(
    data_frame=train,
    x='loss',
    y='f0',
    z='f1',
    marginal_x="histogram",
    marginal_y="histogram",
)
fig

In [None]:
# Colour names that plotting cells can sample from with `random.choice`.
# (`random` is already imported in the notebook's first cell, so it is not
# re-imported here.)
colors = ["red", "lightblue", "lightgreen", "deeppink", "purple", "orange", "black", "pink", "deepskyblue"]

# NOTE(review): this name shadows the built-in `list`. It is kept as-is because
# other cells may index into it; prefer a descriptive name if those are updated.
list = train.columns

# Display seaborn's current default palette.
sns.color_palette()

**EDA is very interesting; I don't know what new could be found.**

Go through the scatterplots.

Columns f21 and f86 do not follow the pattern shown by the other columns.

If you do a correlation analysis, you will find that there is no correlation;
I mean it is in the range of 0.0005 - 0.00010, which doesn't make any sense.

The columns don't explain the target variable.

In [None]:
# Scatter every feature against the target, one panel per feature.
# Features are selected by name rather than by position so that the `id`
# column and `loss` itself are never plotted and no feature is left out.
feature_cols = [col for col in train.columns if col not in ('id', 'loss')]

n_cols = 4
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division
# 3.6 inches per row reproduces the (20, 90) canvas when there are 25 rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3.6 * n_rows), squeeze=False)

for ax, col in zip(axes.flat, feature_cols):
    sns.scatterplot(ax=ax, x=train[col], y=train['loss'], color=random.choice(colors))

# Hide panels left over when the feature count is not a multiple of n_cols.
for ax in axes.flat[len(feature_cols):]:
    ax.set_visible(False)
 


In [None]:
# Line plot of the target against feature f1.
sns.lineplot(data=train, x='f1', y='loss', color='darkred')

In [None]:
# Line plot of the target against feature f86.
sns.lineplot(data=train, x='f86', y='loss', color='darkorange')

**+++training+++**

In [None]:
def scaling(X_train,X_test):
    """Standardise, then quantile-transform, train and test features together.

    The two frames are stacked row-wise (train rows first) and both
    transformers are fitted on the stacked data, so the test rows take part
    in fitting as well.

    Args:
        X_train: training feature table (anything `pd.concat` accepts).
        X_test: test feature table with the same columns.

    Returns:
        A `(train_part, test_part)` tuple: the transformed rows, split back at
        the original number of training rows, in the form returned by
        `QuantileTransformer.fit_transform`.
    """
    n_train_rows = len(X_train)
    stacked = pd.concat([X_train, X_test], axis=0, copy=False).reset_index(drop=True)

    # Step 1: zero mean / unit variance per column.
    stacked = StandardScaler().fit_transform(stacked)

    # Step 2: map each column onto a normal distribution via its quantiles.
    normaliser = QuantileTransformer(random_state=0, output_distribution='normal')
    stacked = normaliser.fit_transform(stacked)

    train_part = stacked[:n_train_rows, :]
    test_part = stacked[n_train_rows:, :]

    # Drop the local reference to the stacked array before returning.
    del stacked
    gc.collect()

    return train_part, test_part

In [None]:

# Separate the training table into the feature matrix and the target.
X = train.drop(columns=['loss', 'id'])
y = train['loss']
# `errors='ignore'` keeps this cell re-runnable: if `id` has already been
# removed from the `test` frame, dropping it again is a no-op, not a KeyError.
test = test.drop(columns='id', errors='ignore')



In [None]:
# Replace X and test with the transformed values returned by `scaling`.
# NOTE(review): both names are rebound here, so re-running this cell feeds the
# already-transformed values back into `scaling`; restart and run all instead.
X,test = scaling(X,test)

In [None]:

# Sanity check: display the shape of the transformed training features.
X.shape

In [None]:
import tensorflow.keras.backend as t
import tensorflow as tf
import math

In [None]:
def nn_model():
    """Build the (uncompiled) feed-forward network used for regression.

    Architecture: three ReLU Dense layers of 512, 256 and 128 units, each
    followed by Dropout (rates 0.3, 0.1 and 0.2 respectively), then a single
    linear output unit.

    Returns:
        A `Sequential` model; the caller is responsible for compiling it.
    """
    # (units, dropout rate) for each hidden stage, in order.
    hidden_stages = ((512, 0.3), (256, 0.1), (128, 0.2))

    model = Sequential()
    for units, drop_rate in hidden_stages:
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(drop_rate))
    model.add(Dense(1, activation='linear'))

    return model

    
# --- Callbacks ---------------------------------------------------------------
# Both callbacks watch the validation loss (MSE). Classification accuracy is
# not a meaningful signal for a regression model, so it is neither tracked as
# a metric nor used for early stopping.
# `min_delta` is the current name of the threshold formerly passed as `epsilon`.
learning_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                                  verbose=1, min_delta=1e-4, mode='min')
# restore_best_weights makes the fold's predictions come from its best epoch
# rather than from the last (already degrading) one.
earlystopping = EarlyStopping(monitor='val_loss', mode='min', patience=4,
                              restore_best_weights=True)

# --- Configuration -----------------------------------------------------------
N_FOLDS = 5
SEED = 42
EPOCH = 50
N_round = 1

# Positional view of the target so that fold indices (which are positions)
# never depend on the labels of a pandas index.
y_values = np.asarray(y)

# Test predictions are averaged over every fold of every round, so results
# from earlier rounds are kept when N_round > 1.
pred = np.zeros((test.shape[0], 1))

for i in range(N_round):

    # SEED + i gives each round its own seed; SEED * i would be 0 for the
    # first round and ignore SEED entirely.
    round_seed = SEED + i
    tf.random.set_seed(round_seed)

    # Out-of-fold predictions for this round (each row is predicted exactly once).
    oof = np.zeros((X.shape[0], 1))

    skf = StratifiedKFold(n_splits=N_FOLDS,
                          shuffle=True,
                          random_state=round_seed)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y_values)):
        print(f"FOLD {fold} of round {i}")

        X_train, y_train = X[train_idx], y_values[train_idx]
        X_valid, y_valid = X[valid_idx], y_values[valid_idx]

        model = nn_model()
        model.compile(loss='mse', optimizer='sgd')
        model.fit(X_train, y_train, batch_size=128, epochs=EPOCH,
                  validation_data=(X_valid, y_valid),
                  callbacks=[learning_loss, earlystopping], verbose=1)

        pred_round = model.predict(X_valid)
        oof[valid_idx] = pred_round

        score_NN_round = np.sqrt(mean_squared_error(y_valid, pred_round))
        print('RMSE: {}'.format(score_NN_round))

        pred += model.predict(test) / (N_FOLDS * N_round)

        # Release the graph/state of this fold's model before building the next.
        K.clear_session()

    score_round = np.sqrt(mean_squared_error(y_values, oof))
    print(f"\n=== FINAL SCORE round {i} REGRESSION MODEL  : {score_round}===\n")

In [None]:

# Fill the sample-submission template with the test predictions and save it.
sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')
# `pred` is built as an (n_test, 1) array; flatten it so a plain 1-D column
# is assigned.
sub['loss'] = np.ravel(pred)
# index=False: do not write the DataFrame index as an extra column.
sub.to_csv('sub.csv', index=False)