In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train=pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/train.csv')
test=pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/test.csv')

In [None]:
# Display the last rows of the training frame.
train.tail()

In [None]:
# Display the last rows of the test frame.
test.tail()

In [None]:
# Report (rows, columns) for both frames.
print(f'Shape of the train dataframe is {train.shape}')
print(f'Shape of the test dataframe is {test.shape}')

## Check for null values in the train and test dataframes.

In [None]:
# Per-column count of missing values in train, as a one-column frame indexed by column name.
train_nulls=train.isna().sum().to_frame(name='Num_of_Nulls')
train_nulls

In [None]:
# Per-column count of missing values in test, as a one-column frame indexed by column name.
test_nulls=test.isna().sum().to_frame(name='Num_of_Nulls')
test_nulls

## Neither the train nor the test dataset needs any null-value handling.

In [None]:
# Collect the dtype of every column in each frame and print them.
train_dtypes=train.dtypes.tolist()
test_dtypes=test.dtypes.tolist()
print(f'Datatypes in train are {train_dtypes}')
print(f'Datatypes in test are {test_dtypes}')

## All the datatypes are float64 only.

In [None]:
# Keep the column names of each frame as plain Python lists and print them.
train_cols=train.columns.tolist()
test_cols=test.columns.tolist()
print(f'Train columns are {train_cols}')
print(f'Test columns are {test_cols}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Summary statistics for the columns of train.
train.describe()

In [None]:
# Correlation heatmap of the feature columns.
plt.figure(figsize=(10,10))
# Pick columns by name instead of by position (previously [1:15] and [15]) so the
# cell does not depend on the column order and still works when re-run after
# columns have been dropped from `train`.
target_cols='target'
imp_cols=[col for col in train.columns if col not in ('id',target_cols)]
corrl_matrix=train[imp_cols].corr()
sns.heatmap(corrl_matrix,cbar=True)
plt.title('Pairwise correlation of feature columns');

In [None]:
# Flag every feature whose absolute correlation with an EARLIER feature exceeds 0.8.
# Only the lower triangle (columns 0..i-1 of row i) is inspected, so each pair is
# considered once and the first member of a correlated pair is kept.
# `.any()` collapses the row, so a feature correlated with several earlier
# features is listed once instead of once per partner.
corrl_features=[
    corrl_matrix.columns[i]
    for i in range(len(corrl_matrix))
    if (corrl_matrix.iloc[i,:i].abs()>0.8).any()
]
print(corrl_features)

# Dropping these correlated features from the train and test datasets

In [None]:
# Remove the flagged columns from both frames; `columns=` already implies the column axis.
train=train.drop(columns=corrl_features)
test=test.drop(columns=corrl_features)
print(f'After dropping correlated features shape of train is {train.shape}')
print(f'After dropping correlated features shape of test is {test.shape}')

## Check for duplicated rows in Train and Test Datasets.

In [None]:
# Count rows that are exact duplicates of an earlier row in each frame.
print(f"The number of duplicate rows in train dataset are {train.duplicated().sum()}")
print(f"The number of duplicate rows in test dataset are {test.duplicated().sum()}")

# EDA on Train columns.

In [None]:
# Distribution of the target column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot — consider migrating.
sns.distplot(train['target']);
# Place x-axis ticks at the integers 0 through 9.
plt.xticks(range(0,10));

In [None]:
# Feature columns are the ones whose name begins with 'cont'.
is_feature=train.columns.str.startswith('cont')
imp_cols=train.columns[is_feature].tolist()
print(f"Length of important columns in train dataset are : {len(imp_cols)}")

In [None]:
# One distribution plot per feature on an 8x2 grid, saved as a single image.
plt.figure(figsize=(90,45))
for slot,feature in enumerate(imp_cols,start=1):
    panel=plt.subplot(8,2,slot)
    sns.distplot(train[feature],ax=panel)
    plt.title(f'{feature}',fontsize=10)
plt.savefig('./train.png')

## Since most of the fields have long tails, log transformations are applied to those fields.

In [None]:
import copy
# Work on an independent deep copy so the original `train` frame stays untouched.
train_copy=train.copy(deep=True)

In [None]:
# Columns to transform: every feature column plus the target.
train_copy_imp_cols=[*imp_cols,'target']
train_copy_imp_cols

In [None]:
# Apply log(1 + x) to the feature columns and the target in one vectorised call.
# The values are read from the untouched `train` frame, so re-running this cell
# yields the same result instead of applying the transform a second time.
train_copy[train_copy_imp_cols]=np.log1p(train[train_copy_imp_cols])

In [None]:
# Display the first rows of the log-transformed copy.
train_copy.head()

In [None]:
# Same 8x2 grid of distribution plots, this time for the log-transformed columns.
plt.figure(figsize=(90,45))
for slot,column in enumerate(train_copy_imp_cols,start=1):
    panel=plt.subplot(8,2,slot)
    sns.distplot(train_copy[column],ax=panel)
    plt.title(f'{column}',fontsize=10)
plt.savefig('./train_copy.png')

# Let's build a model using simple linear regression and check how it performs without any hyperparameter tuning.

* Let's fit it on the `train_copy` dataset.

In [None]:
# Everything except the identifier and the target is a model input.
non_features={'id','target'}
columns=[name for name in train_copy.columns if name not in non_features]
X=train_copy.loc[:,columns]
y=train_copy.loc[:,'target']

In [None]:
# Shapes of the design matrix and the target vector, one per line.
print(X.shape,y.shape,sep='\n')

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the log-transformed data (fit returns the estimator itself).
model=LinearRegression().fit(X,y)
# The score is evaluated on the same data the model was fitted on.
train_score=model.score(X,y)
print(f'Models score is {train_score}')

# Apply the same log transformation to the test dataset and predict on it.

In [None]:
# Display the first rows of the test frame.
test.head()

In [None]:
# Apply the same log(1 + x) transform used for training to the test features,
# as one vectorised call instead of a Python-level lambda per element.
# NOTE: this overwrites `test` in place, so the cell must be run exactly once per
# kernel session; running it again would transform the features twice.
test[columns]=np.log1p(test[columns])

In [None]:
# Select the test features by the same column list (and order) the model was
# trained on, rather than assuming `id` is the first column of `test`.
X_test=test[columns]

In [None]:
# Predict with the fitted model; the model was fitted on the log1p-transformed
# target, so these predictions are on that log1p scale.
predicted_values=model.predict(X_test)

In [None]:
# Build the submission frame.
# The model was fitted on log1p(target), so the predictions must be mapped back
# to the original target scale with expm1 before they are written out.
submission_df=pd.DataFrame({
    'id':test['id'].to_numpy().astype('int'),
    'target':np.expm1(np.ravel(predicted_values)),
})
submission_df.shape

In [None]:
# Check the column dtypes of the submission frame.
submission_df.dtypes

In [None]:
# Write the submission to the current directory without the index column.
submission_df.to_csv('./submission.csv',index=False)

In [None]:
from sklearn.preprocessing import Normalizer, StandardScaler
# Scale each FEATURE to zero mean / unit variance using statistics learned from
# the training data. Normalizer was used here before, but it rescales each ROW
# to unit norm and learns nothing in fit(), which is not feature scaling.
# The name `norm` is kept because later cells call norm.transform(...).
norm=StandardScaler()
# NOTE: X is replaced by the scaled NumPy array from here on.
X=norm.fit_transform(X)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K


In [None]:
# Fully connected regression network: 128 -> 64 -> 32 -> 16 -> 8 -> 1.
# The input width is taken from the training matrix instead of being hard-coded
# to 13, so the model still builds if the set of retained features changes.
n_features=X.shape[1]
input_layer = Input(shape=(n_features,),dtype="float64",name="input_layer")
dense_layer1= Dense(units=128,activation='relu',kernel_initializer='he_normal',name='dense_layer_1')(input_layer)
dense_layer2= Dense(units=64,activation='relu',kernel_initializer='he_normal',name='dense_layer_2')(dense_layer1)
dense_layer3= Dense(units=32,activation='relu',kernel_initializer='he_normal',name='dense_layer_3')(dense_layer2)
dense_layer4= Dense(units=16,activation='relu',kernel_initializer='he_normal',name='dense_layer_4')(dense_layer3)
dense_layer5= Dense(units=8,activation='relu',kernel_initializer='he_normal',name='dense_layer_5')(dense_layer4)
# Single linear unit: unbounded real-valued regression output.
output_layer= Dense(units=1,activation='linear',name='output_layer')(dense_layer5)
# NOTE: this rebinds `model`, which previously held the LinearRegression estimator.
model=Model(inputs=input_layer,outputs=output_layer)
model.summary()

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between y_pred and y_true.

    Computed with the Keras backend ops (`K`) so it can be used as a loss.
    """
    residual = y_pred - y_true
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)


In [None]:
# Optimise with SGD on the custom RMSE loss; Keras' built-in RMSE metric is reported alongside it.
model.compile(optimizer='sgd',loss=root_mean_squared_error,metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [None]:
# Train for 30 epochs on the full training data (no validation data is passed).
model.fit(X,y,epochs=30,verbose=1)

In [None]:
# Apply the already-fitted `norm` transformer to the test features (overwrites X_test).
X_test=norm.transform(X_test)

In [None]:
# Predict with the neural network; this replaces the earlier `predicted_values`.
predicted_values=model.predict(X_test)

In [None]:
# Put the ids and the predictions side by side as the two submission columns.
submission_df=pd.DataFrame(
    np.column_stack((test['id'],predicted_values)),
    columns=['id','target'],
)
# Stacking with float predictions turns the ids into floats; cast them back.
submission_df['id']=submission_df['id'].astype('int')
submission_df.shape

In [None]:
# Undo the log1p applied to the training target. expm1 is the exact inverse of
# log1p, and the value is recomputed from `predicted_values` so that re-running
# this cell does not exponentiate the column a second time.
submission_df['target']=np.expm1(np.ravel(predicted_values).astype('float64'))

In [None]:
# Write the submission without the index column; this is the same path as the
# earlier './submission.csv', so that file is overwritten.
submission_df.to_csv('submission.csv',index=False)