In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#=======================================================================================
# Importing the libraries:
#=======================================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats
import math 
import warnings
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

#========================
# Module-level preprocessing objects. `ordinal_encoder` is reused by later cells;
# `imputer` and `enc` are not referenced in the cells visible here.
ordinal_encoder = OrdinalEncoder()
imputer = SimpleImputer(strategy="median")
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old spelling. Try the new keyword first and fall back
# so the cell runs on both old and new installations.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
#===========================
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Display settings: show every column, and allow up to 8000 items / rows in reprs.
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
#=======================================================================================

### 1 - IMPORTING THE DATA ###

In [None]:
def read_data(data_dir="/kaggle/input/house-prices-advanced-regression-techniques"):
    """Load the training and test CSV files of the competition.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory that contains ``train.csv`` and ``test.csv``. The default is
        the Kaggle input directory this notebook was written for, so a plain
        ``read_data()`` call reads the same files as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` exactly as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when either file is missing.
    """
    train_data = pd.read_csv(os.path.join(data_dir, "train.csv"))
    print("Train data imported successfully!!")
    print("-"*50)
    test_data = pd.read_csv(os.path.join(data_dir, "test.csv"))
    print("Test data imported successfully!!")
    return train_data, test_data

In [None]:
# Read both competition files into memory; read_data() prints a confirmation per file.
(train_data, test_data) = read_data()

Now we need to drop the 'Id' column, which is useless for the analysis.

In [None]:
# Save the test-set 'Id' column before it is removed.
# The guard keeps a previously saved test_ID when this cell is re-run after
# 'Id' has already been dropped (the unguarded lookup would raise KeyError).
if "Id" in test_data.columns:
    test_ID = test_data["Id"].copy()

# Now drop the 'Id' column since it's unnecessary for the prediction process.
# Rebinding with errors="ignore" (instead of inplace=True) makes the cell idempotent.
train_data = train_data.drop(columns="Id", errors="ignore")
test_data = test_data.drop(columns="Id", errors="ignore")

Let's look at the correlation matrix and the attributes most correlated with the sale price.

In [None]:
# Correlate the numeric columns only. Since pandas 2.0, DataFrame.corr() no
# longer skips non-numeric columns by default and raises on the string
# (object) attributes of this frame; select_dtypes works on old and new pandas.
corrmat = train_data.select_dtypes(include="number").corr()
k = 11 #number of variables for heatmap (SalePrice itself plus the 10 attributes most correlated with it)
# Labels of the k columns with the largest correlation to SalePrice, strongest first.
correlated_cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
correlated_cols

In [None]:
# Draw the full correlation matrix on an explicit 15x15 inch figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrmat, cmap="Blues", ax=ax)

### 2 - MODIFY THE DATA AND FILL THE MISSING VALUES ###

In [None]:
def check_missed_values(all_data):
    """Report the percentage of missing values per column.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column whose missing percentage is not 0, with a single
        column ``'Missing Ratio'`` (percent of rows that are null), sorted
        from the highest percentage to the lowest.
    """
    null_counts = all_data.isnull().sum()
    null_percent = null_counts / len(all_data) * 100
    # Labels of the columns without any missing value; they are left out of the report.
    complete_columns = null_percent.loc[null_percent == 0].index
    ranked = null_percent.drop(complete_columns).sort_values(ascending=False)
    return pd.DataFrame({'Missing Ratio': ranked})

We concatenate the training and test data so that the same transformations are applied to both.

In [None]:
# Keep the target aside before it is removed from the combined frame.
sale_price = train_data["SalePrice"]
# Stack the training rows on top of the test rows with a fresh 0..n-1 index,
# then remove the target column so only features remain.
all_data = (
    pd.concat([train_data, test_data], ignore_index=True)
    .drop(columns=["SalePrice"])
)

We divide the columns into the object (string) attributes and the numerical ones.

In [None]:
# Column labels grouped by dtype: everything that is not "object" counts as numerical.
column_dtypes = all_data.dtypes
is_object = column_dtypes == "object"
numerical_data = column_dtypes[~is_object].index
object_data = column_dtypes[is_object].index

Let's try to fill the missing values.

In [None]:
# Percentage of missing values for each object column that has at least one gap.
object_frame = all_data.loc[:, object_data]
missing_data = check_missed_values(object_frame)
missing_data

In [None]:
# Object attributes of the training rows, with missing entries replaced by the
# literal category "None" and every value cast to str.
# The original assigned into a column selection of train_data (chained
# assignment, which triggers SettingWithCopyWarning). Chaining on the fresh
# selection avoids that and matches how the combined frame is filled later on.
# Filling every object column is equivalent here because the columns listed in
# missing_data are exactly the object columns that contain gaps.
data_object = train_data[object_data].fillna("None").astype('str')

In [None]:
# Replace every category by an ordinal code (fit_transform returns a plain array),
# restore the column labels, and put SalePrice next to the encoded attributes.
encoded_values = ordinal_encoder.fit_transform(data_object)
data_object = pd.DataFrame(encoded_values, columns=object_data)
data_correlation = pd.concat([data_object, sale_price], axis="columns")

In [None]:
# Pairwise Pearson correlation between the encoded object attributes and SalePrice.
corr_object = data_correlation.corr(method="pearson")

In [None]:
k = 10 #number of object attributes to keep
# Absolute correlation of each column with SalePrice, in ascending order.
correlated_object_cols2 = corr_object['SalePrice'].abs().sort_values()
# Exclude SalePrice itself (correlation 1 with itself) and undefined (NaN)
# correlations by label, then keep the k strongest attributes. The original
# positional slice [-11:-1] ignored k and relied on SalePrice being sorted
# last, which is not the case when NaN values are present (they sort last).
impt_object = correlated_object_cols2.drop('SalePrice').dropna().index[-k:]

Any object attribute seems relevant to the analysis.

### 3 - LET'S BUILD OUR MODEL ###

In [None]:
import tensorflow as tf

In [None]:
# Number of training rows. all_data holds the training rows first, followed by
# the test rows, so this is the row position where the test set starts.
n_train = len(train_data)

# Numerical attributes most correlated with SalePrice; entry 0 of
# correlated_cols is skipped because it is expected to be SalePrice itself.
impt_num_data = all_data[correlated_cols[1:]]
# Selected object attributes, with gaps filled by the category "None".
impt_object_data = all_data[impt_object].fillna("None").astype("str")

# Split both feature groups back into training and test rows.
# The original `.iloc[1460, :]` selected the single row 1460 (a Series)
# instead of the first 1460 rows; a slice up to n_train is what was intended.
data_train_final_num, data_test_final_num = impt_num_data.iloc[:n_train, :], impt_num_data.iloc[n_train:, :]
data_train_final_obj, data_test_final_obj = impt_object_data.iloc[:n_train, :], impt_object_data.iloc[n_train:, :]

Let's scale the values and fill the missing ones with the column median!

In [None]:
# Standardise each column, then replace the remaining NaN values by the column median.
pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy="median")),
    ])

# The training target has one entry per training row, so its length marks
# where the test rows start (replaces the hard-coded 1460).
n_train = len(sale_price)

# Numerical attributes: scale and impute.
impt_num_data = pipeline.fit_transform(impt_num_data)
# Object attributes: encode the categories as ordinal codes, then scale and
# impute them. fit_transform refits the same pipeline object on this data.
impt_object_data = ordinal_encoder.fit_transform(impt_object_data)
impt_object_data = pipeline.fit_transform(impt_object_data)

# Join both feature groups column-wise into a single tensor.
data_num, data_object = tf.convert_to_tensor(impt_num_data), tf.convert_to_tensor(impt_object_data)
data = tf.concat([data_num, data_object], 1)

# Training rows come first, test rows after them.
Xtrain, Xtest = data[:n_train, :], data[n_train:, :]
Y = tf.convert_to_tensor(sale_price)

Finally, we have split the data back into the training and test sets.

In [None]:
# Set TensorFlow's global random seed
tf.random.set_seed(42)

# Build the model: three hidden stages of 256, 128 and 64 ReLU units, each
# followed by Dropout(0.2) and BatchNormalization, then one linear output unit.
stack = []
for units in (256, 128, 64):
    stack.append(tf.keras.layers.Dense(units, activation='relu'))
    stack.append(tf.keras.layers.Dropout(0.2))
    stack.append(tf.keras.layers.BatchNormalization())
stack.append(tf.keras.layers.Dense(1, activation="linear"))
model = tf.keras.Sequential(stack)

## Compile the model: mean squared error loss, Adam with learning rate 0.1,
## and mean squared logarithmic error reported as an extra metric.
mse_loss = tf.keras.losses.MeanSquaredError()
adam = tf.keras.optimizers.Adam(
    learning_rate=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name='Adam',
)
msle_metric = tf.keras.metrics.MeanSquaredLogarithmicError()
model.compile(loss=mse_loss, optimizer=adam, metrics=[msle_metric])

## Fit the model for 50 epochs
model.fit(Xtrain, Y, epochs=50)





In [None]:
# Predict on the test rows, then average over axis 1 so each row yields one value.
raw_predictions = model.predict(Xtest)
Y_predi = np.array(tf.reduce_mean(raw_predictions, axis=1))
# Display the saved test ids for a quick visual check.
np.array(test_ID)

### 4 - EXPORT THE RESULT ###

In [None]:
# Assemble the submission table: one row per test house with its id and predicted price.
output = pd.DataFrame({"Id": np.array(test_ID), "SalePrice": Y_predi})
# Make sure the ids are stored as integers.
output["Id"] = output["Id"].astype("int")

In [None]:
# Write the submission file without the index column, then confirm on stdout.
submission_path = '/kaggle/working/submission.csv'
output.to_csv(submission_path, index=False)
print('Submission succesful!')