In [None]:
# importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch


In [None]:
# Load the competition training set from the Kaggle input directory.
data = pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv")
# read_csv already returns a DataFrame; `df` is the name used from here on.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the training frame.
df.head()

### Data Information

In [None]:
# Summary statistics for the training frame.
df.describe()


In [None]:
col = df.columns  # Index of every column name in the training frame

In [None]:
# Percentage of missing (NaN) values in each column, one line per column.

nan_pct = df.isna().mean() * 100
for name in col:
  print(name, "\t-\t", nan_pct[name])


> Since the data doesn't contain any null values, we can move on.

> Since the given dataset contains both categorical and numerical features, we have to separate them for further analysis.

In [None]:
# Partition the columns by dtype: everything that is not `object` is treated
# as numeric, and whatever remains is treated as categorical.
num_df = df.select_dtypes(exclude=['object'])
cat_df = df.drop(columns=num_df.columns)

> Now let's start the analysis with the numerical data.

In [None]:
# Preview the first rows of the numeric columns.
num_df.head()

In [None]:
# Summary statistics for the numeric columns.
num_df.describe()

In [None]:
# `id` is only a row identifier and plays no role in predicting the target.
# errors="ignore" keeps this cell re-runnable after `id` has already been dropped.
num_df = num_df.drop(columns=["id"], errors="ignore")

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap on an explicitly created Axes so the figure size is guaranteed to apply.
cormap = num_df.corr()
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cormap, annot = True, ax=ax)
ax.set_title("Correlation between numerical columns");  # trailing ';' hides the repr

> Now let's analyse the categorical part of dataset.



In [None]:
# Preview the first rows of the categorical columns.
cat_df.head()

In [None]:
# Summary (count / unique / top / freq) of the categorical columns.
cat_df.describe()

> Let's first encode the categorical data as numbers for further analysis.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column in `cat_df` with integer codes.
# The column list and the values to encode are taken from the raw frame `df`
# rather than from `cat_df` itself, so re-running this cell always yields the
# same codes and never touches a `target` column attached to `cat_df` later.
cat_col = df.select_dtypes(include=['object']).columns
for i in cat_col:
  enc = LabelEncoder()
  cat_df[i] = enc.fit_transform(df[i].astype('str'))

In [None]:
# Preview the categorical columns after label encoding.
cat_df.head()


In [None]:
cat_df['target'] = df['target']  # attach the target to get its correlation with the encoded categoricals

In [None]:
# Pairwise correlations between the columns of `cat_df`, drawn as an annotated
# heatmap on an explicitly created Axes so the figure size is guaranteed to apply.
cormat = cat_df.corr()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cormat, annot = True, ax=ax)
ax.set_title("Correlation between encoded categorical columns and target");  # trailing ';' hides the repr

In [None]:
# Put the encoded categoricals (without their copy of the target) side by side
# with the numeric columns to form the modelling frame.
final_df = pd.concat(
    [cat_df.drop(columns=['target']), num_df],
    axis="columns",
    sort=False,
)
final_df.head()

In [None]:
# Separate the dependent variable from the feature columns.
y = final_df['target']
X = final_df.drop(columns=['target'])

> Since the range of the data varies significantly between columns, we need to scale the independent variables, i.e. X. For this we will use _Min-Max Scaling_.


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max, then rescale the features; the result is wrapped
# back into a DataFrame so the column names survive.
# NOTE(review): the scaler is fitted on every row of X, before the train/test
# split performed further down.
scaler = MinMaxScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)
X.head()

In [None]:
# Sanity check: the feature matrix and the target should have the same number of rows.
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### ANN

> Here, to determine the number of hidden layers and the number of neurons in each layer, I'm using [Keras Tuner](https://www.tensorflow.org/tutorials/keras/keras_tuner). Keras Tuner can be very helpful for hyperparameter tuning of neural networks.

In [None]:
def build_model(hp):
    """Assemble and compile a tunable fully connected regression network.

    Hyperparameters requested from ``hp``:
      * ``num_layers``    - number of hidden Dense + Dropout pairs (2 to 20)
      * ``units_<i>``     - width of hidden layer ``i`` (32 to 512, step 32)
      * ``learning_rate`` - Adam learning rate (1e-2, 1e-3 or 1e-4)

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``keras.Sequential`` model with a single linear output,
        using mean absolute error as both loss and reported metric.
    """
    hidden_count = hp.Int('num_layers', 2, 20)

    # Fixed-width first layer, then the tunable hidden stack, then the output.
    stack = [layers.Dense(24, activation='relu')]
    for layer_idx in range(hidden_count):
        width = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        stack.append(layers.Dense(units=width, activation='relu'))
        stack.append(layers.Dropout(0.5))
    stack.append(layers.Dense(1, activation='linear'))

    model = keras.Sequential(stack)

    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(step_size),
        loss='mean_absolute_error',
        metrics=['mean_absolute_error'],
    )
    return model

> Using Random Search to iterate over the hyperparameters

In [None]:
tuner = RandomSearch(
    build_model,
    objective='mean_absolute_error',
    max_trials=10,
    executions_per_trial=1)

In [None]:
tuner.search(X_train, y_train, epochs=5)


In [None]:
# Take the single best model found by the tuner, as ranked by the tuner's objective.

reg = tuner.get_best_models(num_models=1)[0]

In [None]:
# Continue training the selected model: epoch numbering resumes at index 5 and
# stops at 20, with 10% of the training rows held out for validation.
reg.fit(X_train, y_train, epochs=20, validation_split=0.1, initial_epoch=5)

In [None]:
# Show the layer-by-layer architecture of the selected model.
reg.summary()

In [None]:
# Predict on the held-out rows and line the predictions up with the true values.

y_pred = reg.predict(X_test)
pred_df = y_test.to_frame('Actual').assign(Predicted=y_pred.flatten())
pred_df.head()

In [None]:
# Hold-out error metrics; the MSE is computed once and reused for the RMSE.

from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

> As we can see, the root mean squared error is 0.879, which is slightly less than 15% of the mean value.

*****

In [None]:
# Load the competition test set from the Kaggle input directory.
test_data = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv")
# read_csv already returns a DataFrame; `test_df` is the name used from here on.
test_df = pd.DataFrame(test_data)
test_df.head()

In [None]:
# Partition the test columns by dtype, the same way the training frame was
# split: non-object columns are numeric, the remainder are categorical.
num_test_df = test_df.select_dtypes(exclude=['object'])
cat_test_df = test_df.drop(columns=num_test_df.columns)

In [None]:
# Keep the ids for the submission file, then remove them from the features.
# `id` is read from `test_df` and dropped with errors="ignore" so this cell can
# be re-run after `id` has already been removed from `num_test_df`.
Id = test_df['id']
num_test_df = num_test_df.drop(columns=["id"], errors="ignore")

In [None]:
# Encode the test categoricals with the SAME label -> integer mapping as the
# training data. Each encoder is fitted on the raw training column in `df`;
# fitting on the test column instead would assign different codes whenever the
# two sets do not contain exactly the same categories.

for col_name in cat_test_df.columns:
    enc = LabelEncoder().fit(df[col_name].astype('str'))
    code_by_label = {label: code for code, label in enumerate(enc.classes_)}
    # Labels never seen in training get the sentinel code -1 instead of raising.
    cat_test_df[col_name] = (
        cat_test_df[col_name].astype('str').map(code_by_label).fillna(-1).astype(int)
    )

cat_test_df.head()

In [None]:
# Put the encoded categoricals side by side with the numeric test columns.
final_test_df = pd.concat(
    [cat_test_df, num_test_df],
    axis="columns",
    sort=False,
)
final_test_df.head()

In [None]:
# Scale the test features with the scaler that was fitted on the training features.
# NOTE(review): this rebinds `X`, replacing the scaled training matrix defined earlier.
X = pd.DataFrame(scaler.transform(final_test_df), columns=final_test_df.columns)
X.head()

In [None]:
# Predict the target for every row of the scaled test set.
Y_pred = reg.predict(X)

In [None]:
# Submission frame: one predicted target per test id.
# NOTE(review): this reuses the name `final_df`, which earlier held the combined training frame.
final_df = pd.DataFrame({'id': Id, 'target': Y_pred.flatten()})

In [None]:
# Preview the first rows of the submission frame.
final_df.head()

In [None]:
# Write the submission file to the working directory, without the index column.
final_df.to_csv('./submission.csv', index=False)