In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import sys
# !cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
# !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# sys.path = ["/opt/conda/envs/rapids/lib/python3.6/site-packages"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib/python3.6"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# Shell out to nvidia-smi to show which GPU(s), if any, this session can see.
!nvidia-smi

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the competition training set into `df` and preview the first rows.
df = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
df.head()

In [None]:
# Remove the `row_id` column (presumably just a row identifier) so it is not used as a feature.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns='row_id', errors='ignore')

In [None]:
# Summary statistics for the columns of the training frame.
df.describe()

In [None]:
# Report, for every column, how many entries are null/NaN.
missing_per_column = df.isnull().sum()
for name, n_missing in missing_per_column.items():
    print(name + ': ', n_missing)

In [None]:
# Copy the target column into a NumPy array and print its distinct values.
targets = np.array(df['target'])
print(np.unique(targets))

In [None]:
# Share of rows per class (normalize=True returns fractions instead of counts).
df["target"].value_counts(normalize=True)

In [None]:
# Count plot of rows per target class; the x tick labels are re-set with a 30 degree rotation.
plt.figure(figsize = (15,8))
ax = sns.countplot(x='target', data=df)
ax.set_xticklabels(ax.get_xticklabels(),rotation = 30)

In [None]:
# Plot the distribution (KDE) of the first 40 non-object columns of `df` in a 10x4 grid.
numerical = df.columns[df.dtypes != "object"].to_numpy()
rows, cols = 10, 4
fig = plt.figure(figsize=(20, 50))
for idx, num in enumerate(numerical[:rows * cols]):
    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.grid(alpha=0.7, axis="both")
    # Pass the subplot explicitly instead of relying on matplotlib's "current axes".
    sns.kdeplot(x=num, fill=True, color='#50B2C0', linewidth=0.6,
                data=df, label="Train", ax=ax)
    ax.set_xlabel(num)
    ax.legend()
fig.tight_layout()
# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and only emits a
# "non-GUI backend" warning under the notebook's inline backend.
plt.show()

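> **Most features are concentrated near zero with long tails ("head heavy"). Standardisation will put them on a common scale, although it does not by itself change the shape of the distributions.**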

In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(20, 15))
# `df` still contains the string-valued `target` column. Older pandas silently skipped
# non-numeric columns in .corr(); pandas >= 2.0 raises instead. Selecting the numeric
# columns first gives the same matrix on every pandas version.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, robust=True, center=0, square=True)
plt.title('Correlation')
plt.show()

> **The correlation matrix is too crowded to read, so let's reduce the dimensionality of the data to make more sense of it.**

**PCA Dimension Reduction**

In [None]:
# Separate the label column from the feature matrix, then preview the features.
X = df.drop(columns='target')
y = df['target']
X.head()

In [None]:
from sklearn.decomposition import IncrementalPCA

# Fit a 100-component PCA incrementally, one batch of rows at a time.
n_batches = 100
inc_pca = IncrementalPCA(n_components=100)
# Split row *positions* rather than the DataFrame itself: np.array_split on a DataFrame
# goes through DataFrame.swapaxes, which recent pandas versions deprecate. The batches
# are the same rows as before and stay DataFrames, so feature names are preserved.
for batch_rows in np.array_split(np.arange(len(X)), n_batches):
    print(".", end="")  # one dot per batch as a progress indicator
    inc_pca.partial_fit(X.iloc[batch_rows])

# Project every row onto the fitted components.
X_reduced = inc_pca.transform(X)

In [None]:
# Total of the per-component explained-variance ratios, i.e. the variance share kept by the PCA.
sum(inc_pca.explained_variance_ratio_)

In [None]:
# Display the PCA-transformed array.
X_reduced

In [None]:
# Name the PCA components feature_1 ... feature_N. N is taken from the transformed array
# instead of being hard-coded, so it follows any change to n_components.
columns = [f'feature_{i}' for i in range(1, X_reduced.shape[1] + 1)]
X_redDf = pd.DataFrame(X_reduced, columns=columns)
X_redDf.head()

In [None]:
# Correlation heatmap of the PCA components.
corr = X_redDf.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(data=corr, center=0, robust=True, square=True)
plt.title('Correlation Reduced Data Frame')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit a StandardScaler on the PCA components.
# NOTE(review): X_redDf holds all rows and the train/valid/test split only happens later
# in the notebook, so the held-out rows contribute to the scaling statistics — confirm
# this is acceptable.
scaler = StandardScaler()
scaler.fit(X_redDf)

In [None]:
# Apply the fitted scaler to the PCA components and display the result.
X_redScaled = scaler.transform(X_redDf)
X_redScaled

In [None]:
# Wrap the scaled values in a DataFrame that reuses the feature_N column names.
# Note that this rebinds the name X_redScaled to the new DataFrame.
X_redScaled = pd.DataFrame(X_redScaled, columns=columns)
X_redScaled.head()

> **Now the data is scaled, so every feature should be centred on zero with unit variance. (Standardisation only rescales each feature; it does not by itself remove heavy tails.)**

In [None]:
# Plot the distribution (KDE) of the first 40 scaled components in a 10x4 grid.
numerical = X_redScaled.columns[X_redScaled.dtypes != "object"].to_numpy()
rows, cols = 10, 4
fig = plt.figure(figsize=(20, 50))
for idx, num in enumerate(numerical[:rows * cols]):
    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.grid(alpha=0.7, axis="both")
    # Pass the subplot explicitly instead of relying on matplotlib's "current axes".
    sns.kdeplot(x=num, fill=True, color='#50B2C0', linewidth=0.6,
                data=X_redScaled, label="Train", ax=ax)
    ax.set_xlabel(num)
    ax.legend()
fig.tight_layout()
# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and only emits a
# "non-GUI backend" warning under the notebook's inline backend.
plt.show()

Looks great! You can compare this with the plots of the unscaled data above.

> **Now let's check the processed dataset for duplicate rows.**

In [None]:
# Number of rows in the scaled frame, for reference before the duplicate check.
len(X_redScaled)

In [None]:
# Flag every row that has an exact duplicate (keep=False marks all copies, not just the
# later ones) and show how many rows are flagged. The cell previously displayed
# len(X_redScaled) again, which says nothing about duplicates.
X_redScaledDuped = X_redScaled.duplicated(keep = False)
int(X_redScaledDuped.sum())

In [None]:
# Show the rows selected by the duplicate mask.
X_redScaled[X_redScaledDuped]

In [None]:
# Preparation for dropping duplicates: copy the scaled features and attach the target column.
# `y` must still be the target Series created when X and y were separated; re-run that
# cell first if `y` has been reassigned since.
full_data = X_redScaled.copy()
full_data['target'] = y

In [None]:
# Remove exact duplicate rows (features and target together) and report how many were dropped.
old = len(full_data)
print('The length of data before dropping duplicates:', old)
print('Dropping Duplicates.......')
full_data = full_data.drop_duplicates()
print('The length of data after dropping duplicates:', len(full_data))
print('Numbers of duplicates in the data: ', (old - len(full_data)))

**Duplicates dropped**

**Now Let's visualise the data to see the Outliers**

In [None]:
# Scatter plot of feature_1 against feature_2, coloured by target class.
# Stated rationale: the first two components are taken to preserve the most variance.
plt.figure(figsize = (20,15))
sns.scatterplot(x='feature_1',y='feature_2',hue='target',data=full_data)

Clearly there are a few outliers; we'll deal with them next.

**Fixing Outliers**

In [None]:
# Box plots of the first four components, grouped by target class, to inspect outliers.
rows, cols = 10, 2
fig = plt.figure(figsize=(20, 50))
for idx in range(4):
    # Deliberately not called `y`: that name holds the notebook-level target Series, and
    # overwriting it here would break any later re-run of the cell that builds full_data.
    feature = 'feature_' + str(idx + 1)
    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.grid(alpha=0.7, axis="both")
    sns.boxplot(x="target", y=feature, data=full_data, ax=ax)
    ax.set_xlabel(feature, fontsize=14)
    # No ax.legend(): the box plot has no labelled artists, so the call only produced a warning.
    ax.tick_params(axis='x', labelrotation=60, labelsize=14)
    ax.tick_params(axis='y', labelsize=14)
fig.tight_layout()
# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and only emits a
# "non-GUI backend" warning under the notebook's inline backend.
plt.show()
# plt.figure(figsize = (10,10))


The boxplots confirm the presence of outliers.

**Removing Outliers Using IsolationForest**

In [None]:
# Split into train / validation / test using train_test_split's default proportions, twice.
target = full_data['target']
# Build the feature frame without mutating full_data. The previous in-place drop removed
# `target` from full_data, so re-running this cell failed with KeyError.
features = full_data.drop(columns='target')
X_train_full, X_test, y_train_full, y_test = train_test_split(
    features, target, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

**We do not apply outlier removal to the validation and test sets, so that they give a more accurate estimate of the generalisation error.**

In [None]:
from sklearn.ensemble import IsolationForest
# We assume a 10% contamination based on boxplots.
# random_state is fixed so the same rows are flagged on every run; without it the set of
# removed rows (and everything trained afterwards) changes between runs.
iso = IsolationForest(contamination=0.1, random_state=42)
yhat = iso.fit_predict(X_train)

# fit_predict labels outliers as -1; keep everything else.
# NOTE: X_train / y_train are overwritten, so re-running this cell without re-running the
# split filters the already-filtered data again.
mask = yhat != -1
X_train, y_train = X_train[mask], y_train[mask]

In [None]:
# `mask` has one entry per training row, so these are row counts (the messages used to say "Columns").
n_kept = mask.sum()
print('Rows before outlier removal:', len(mask))
print('Rows after outlier removal:', n_kept)
print('Rows removed in outlier removal:', (len(mask) - n_kept))

In [None]:
#Preprocessing/Encoding labels
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the training labels only.
encoder = LabelEncoder()
encoder.fit(y_train)

In [None]:
# Replace the labels of each split with the encoder's output.
# NOTE(review): the y_* names are overwritten, so running this cell a second time feeds
# already-encoded values back into the encoder — presumably that fails; re-run the split first.
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)
y_valid = encoder.transform(y_valid)

# Data loading

In [None]:
def _format_csv_field(value):
    """Return the text written to the CSV for a single cell value."""
    # NumPy >= 2 renders scalars as e.g. "np.float64(1.5)" under repr(), which is not
    # valid CSV. Converting to the plain Python value first gives "1.5" on every version.
    if isinstance(value, np.generic):
        value = value.item()
    return repr(value)


def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Write the rows of `data` into `n_parts` CSV files and return their paths.

    Files are created under ``datasets/Tabular2022New`` (relative to the current working
    directory) and named ``my_<name_prefix>_<NN>.csv``. Rows are assigned to files in
    order, in contiguous and near-equal chunks.

    Args:
        data: row-indexable 2-D data (``data[i]`` must yield the fields of row ``i``).
        name_prefix: label embedded in each file name, e.g. "train".
        header: optional header line (without newline) written at the top of every file.
        n_parts: number of files to create.

    Returns:
        List of the file paths written, in part order.
    """
    output_dir = os.path.join("datasets", "Tabular2022New")
    os.makedirs(output_dir, exist_ok=True)
    path_format = os.path.join(output_dir, "my_{}_{:02d}.csv")

    filepaths = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(_format_csv_field(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

In [None]:
# Append the encoded label as the last column of each split.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
# X_train.columns is a pandas Index, and `Index + [...]` is element-wise arithmetic rather
# than list concatenation, so convert to a list before appending the label column name.
header_cols = list(X_train.columns) + ["target"]
header = ",".join(header_cols)

# Shard each split into several CSV files.
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [None]:
# Load Data Using Tensorflow
n_inputs = 100 # X_train.shape[-1]

@tf.function
def preprocess(line):
    """Parse one CSV text line into a (features, label) pair of tensors.

    The line must hold `n_inputs` feature fields followed by one label field. Feature
    fields default to 0.0; the label's default is an empty float32 tensor.
    """
    record_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    parsed = tf.io.decode_csv(line, record_defaults=record_defaults)
    features = tf.stack(parsed[:-1])
    label = tf.stack(parsed[-1:])  # sliced, so the label keeps a length-1 leading axis
    return features, label

In [None]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a tf.data pipeline that streams parsed batches from CSV shards.

    Steps, in order: list the files and repeat that list `repeat` times; interleave lines
    from `n_readers` files at a time, skipping the first (header) line of each file;
    shuffle with a buffer of `shuffle_buffer_size` lines; parse each line with
    `preprocess`; batch to `batch_size`; prefetch one batch.

    Args:
        filepaths: CSV file paths to read.
        repeat: forwarded to ``Dataset.repeat``.
        n_readers: ``cycle_length`` of the interleave.
        n_read_threads: ``num_parallel_calls`` of the interleave.
        shuffle_buffer_size: size of the shuffle buffer.
        n_parse_threads: ``num_parallel_calls`` of the parsing map.
        batch_size: number of parsed records per batch.

    Returns:
        The resulting ``tf.data.Dataset`` of ``preprocess`` outputs, batched.
    """
    def read_lines_without_header(filepath):
        return tf.data.TextLineDataset(filepath).skip(1)

    files = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    lines = files.interleave(
        read_lines_without_header,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    records = lines.shuffle(shuffle_buffer_size).map(
        preprocess, num_parallel_calls=n_parse_threads)
    return records.batch(batch_size).prefetch(1)

In [None]:
# Build the three input pipelines. The training set passes repeat=None through to
# Dataset.repeat; validation and test use the default repeat=1.
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [None]:
# Sanity check: print the first two batches produced by the validation pipeline.
for X_batch, y_batch in valid_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

# **Model Building and Training**

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

import keras
import tensorflow

import tensorflow.keras
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from keras.utils import np_utils
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.datasets import cifar10
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import numpy as np

from keras.layers.convolutional import Conv1D, MaxPooling1D


import keras.utils
from keras import utils as np_utils

from keras.utils import np_utils


from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Convolution1D, Flatten, LeakyReLU
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import SpatialDropout1D, MaxPooling1D, Bidirectional, GRU, concatenate


# import necessary tools and models 
import seaborn as sns
import matplotlib.pyplot as plt 

import sklearn.model_selection as cv
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import keras


In [None]:
# Reset Keras state and fix the seeds so weight initialisation is repeatable.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# One output unit per class. A single softmax unit always outputs 1.0, so the previous
# Dense(1, softmax) head could not learn anything.
n_classes = len(encoder.classes_)

model = keras.models.Sequential([
    keras.layers.Dense(500, input_dim=100, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(400, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(n_classes, activation='softmax'),
])

# The labels are integer class ids (LabelEncoder output), not one-hot vectors, so the
# sparse variant of the cross-entropy loss is the matching one. The model definition and
# the compile call are kept in one cell because the output layer and the loss must agree.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 10 epochs with validation after each epoch.
# steps_per_epoch is passed explicitly, presumably because train_set was built with
# repeat=None; batch_size here mirrors the default batch_size=32 of csv_reader_dataset.
batch_size = 32
model.fit(train_set, epochs=10,validation_data=valid_set,steps_per_epoch=len(X_train) // batch_size)

In [None]:
def getPredictionCSV(model, X_test):
    """Predict classes for `X_test` and return them in the sample-submission layout.

    Args:
        model: fitted model whose ``predict`` returns one row of scores per input row.
        X_test: features to predict on; must have as many rows as
            ``sample_submission.csv``.

    Returns:
        The sample-submission DataFrame with its ``target`` column replaced by the
        predicted labels, decoded back to the original names with the notebook-level
        ``encoder``.
    """
    scores = np.asarray(model.predict(X_test))
    if scores.ndim == 2 and scores.shape[1] > 1:
        # One score per class: the predicted class is the index of the largest score.
        # (Calling int() on such a row, as before, raises instead of yielding a class id.)
        class_ids = scores.argmax(axis=1)
    else:
        # Single-column output: keep the previous behaviour of truncating each value to int.
        class_ids = scores.reshape(-1).astype(int)
    finalPredicts = encoder.inverse_transform(class_ids)
    sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")
    sub['target'] = finalPredicts
    return sub

In [None]:
# Predict on the competition test set, not on `X_test`: X_test is a held-out slice of
# train.csv, so its row count does not match sample_submission.csv.
# NOTE(review): assumes test.csv sits next to train.csv / sample_submission.csv and has the
# same feature columns plus `row_id` — confirm against the competition data.
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')
test_features = test_df.drop(columns='row_id')
# Apply the same fitted transforms as the training data: PCA first, then scaling.
test_reduced = pd.DataFrame(inc_pca.transform(test_features), columns=columns)
test_scaled = scaler.transform(test_reduced)

sub = getPredictionCSV(model, test_scaled)
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()