In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **HOUSE PRICE PREDICTION**<br>

This notebook goes through the entire dataset available to us by finding out correlations in data, cleaning it, performing EDA and using these finding a best fit Machine Learning Algorithm to develop a learning equation for the dataset.<br><br>

The research question we attempt to solve include :<br><br>

1. How is the data structured ?
2. Does the data have Nulls or Duplicates or Outliers?
3. How is the distribution of various columns in the data ?
4. Are columns related to each other ?
5. Can we reduce the features without affecting the accuracy ?
6. What is the best sale price of any given house ?


In [None]:
# Data ingestion: read the competition's train and test CSV files into DataFrames.

TRAIN_PATH = '../input/house-prices-advanced-regression-techniques/train.csv'
TEST_PATH = '../input/house-prices-advanced-regression-techniques/test.csv'

train_data, test_data = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# Report (rows, columns) of each frame.
print("Shape of Train Data is ", train_data.shape)
print("Shape of Test Data is ", test_data.shape)

In [None]:
# Preview the first five rows of the freshly loaded training data.
train_data.head(n=5)

In [None]:
# Preview the first five rows of the freshly loaded test data.
test_data.head(n=5)

# **DATA CLEANING**

In [None]:
# Count missing values per training column and show only the columns that have any.
train_null_counts = train_data.isna().sum()
train_null_counts[train_null_counts > 0]

In [None]:
# Impute missing values by dtype.
# Writing the string 'Unknown' into a numeric column turns the whole column into
# object dtype, so it would later be treated as categorical. Numeric gaps are
# therefore filled with the column median; only non-numeric gaps get 'Unknown'.
numeric_cols = train_data.select_dtypes(include='number').columns
fill_values = {col: 'Unknown' for col in train_data.columns.difference(numeric_cols)}
fill_values.update(train_data[numeric_cols].median().to_dict())
train_data.fillna(value=fill_values, inplace=True)

In [None]:
# Count missing values per test column and show only the columns that have any.
test_null_counts = test_data.isna().sum()
test_null_counts[test_null_counts > 0]

In [None]:
# Impute missing values by dtype.
# Writing the string 'Unknown' into a numeric column turns the whole column into
# object dtype, so it would later be treated as categorical. Numeric gaps are
# therefore filled with the column median; only non-numeric gaps get 'Unknown'.
# NOTE(review): medians are computed from the test frame itself; consider reusing
# the training medians so both frames are imputed with the same values.
numeric_cols = test_data.select_dtypes(include='number').columns
fill_values = {col: 'Unknown' for col in test_data.columns.difference(numeric_cols)}
fill_values.update(test_data[numeric_cols].median().to_dict())
test_data.fillna(value=fill_values, inplace=True)

# **DATA DESCRIPTION**

In [None]:
# Describe the column types of both datasets.
# Columns are split with the public `select_dtypes` API (numeric and boolean
# dtypes) rather than the private `_get_numeric_data`, and the remaining columns
# are listed in DataFrame column order so the printed output is the same on
# every run (a set difference has no stable order).
train_numeric_cols = list(train_data.select_dtypes(include=['number', 'bool']).columns)
test_numeric_cols = list(test_data.select_dtypes(include=['number', 'bool']).columns)

print('Numeric Type Columns - Train Data\n')
print(train_numeric_cols, '\n\n')

print('Categorical Type Columns - Train Data\n')
print([name for name in train_data.columns if name not in train_numeric_cols], '\n\n')

print('Numeric Type Columns - Test Data\n')
print(test_numeric_cols, '\n\n')

print('Categorical Type Columns - Test Data\n')
print([name for name in test_data.columns if name not in test_numeric_cols], '\n\n')


In [None]:
# Summary statistics of the training data, shown with one row per column.
print("Training Data Description\n")
train_data.describe().T

In [None]:
# Summary statistics of the test data, shown with one row per column.
print("Testing Data Description\n")
test_data.describe().T

In [None]:
# Column-name lists by dtype for each frame.
# Uses the public `select_dtypes` API (numeric and boolean dtypes) instead of the
# private `_get_numeric_data`, and keeps DataFrame column order so the lists are
# identical on every run (a set difference has no stable order).
train_data_numeric = list(train_data.select_dtypes(include=['number', 'bool']).columns)
train_data_category = [name for name in train_data.columns if name not in train_data_numeric]

test_data_numeric = list(test_data.select_dtypes(include=['number', 'bool']).columns)
test_data_category = [name for name in test_data.columns if name not in test_data_numeric]
                         


# **ONE HOT ENCODING OF CATEGORICAL FEATURES**

In [None]:
# Ordinal-encode the categorical columns.
# Each column's encoder is fitted on the training data only and then reused for
# the test data, so a given category receives the same code in both frames.
# (Refitting on the test frame would assign codes independently and the model
# would see inconsistent features.) Categories that the train-fitted encoder has
# not seen are encoded as -1.
encoders = {}
for col in train_data_category:
    oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_values = train_data[col].astype('str').to_numpy().reshape(-1, 1)
    train_data[col] = oe.fit_transform(train_values).ravel()
    encoders[col] = oe

for col in test_data_category:
    test_values = test_data[col].astype('str').to_numpy().reshape(-1, 1)
    if col in encoders:
        test_data[col] = encoders[col].transform(test_values).ravel()
    else:
        # No train-fitted encoder exists for this column (it was not in
        # train_data_category), so fall back to fitting on the test values alone.
        test_data[col] = OrdinalEncoder().fit_transform(test_values).ravel()


In [None]:
# Preview the training data after the encoding step.
train_data.head(n=5)

In [None]:
# Preview the test data after the encoding step.
test_data.head(n=5)

# **OUTLIER CAPPING OF NUMERIC DATA**

In [None]:
# Cap outliers of the training data at mean +/- 3 standard deviations.
# Only the columns listed in train_data_numeric (numeric before encoding) are
# capped: the codes of encoded categorical columns are labels, and clipping them
# would silently turn rare categories into different ones. 'Id' is an identifier
# and is neither capped nor plotted.
# NOTE(review): this also caps the target 'SalePrice', as the original code did.
capped_cols = [name for name in train_data_numeric if name != 'Id']
for col in capped_cols:
    col_mean, col_std = train_data[col].mean(), train_data[col].std()
    upper_limit = int(col_mean + 3*col_std)
    lower_limit = int(col_mean - 3*col_std)
    train_data[col] = train_data[col].clip(lower=lower_limit, upper=upper_limit)

# One horizontal boxplot per capped column to inspect the resulting distribution.
for col in capped_cols:
    fig, ax = plt.subplots(figsize=(10,1))
    sns.boxplot(x=train_data[col], orient='h', ax=ax)
    plt.show()
    plt.close(fig)  # release the figure; many figures are created in this loop

In [None]:
# Cap outliers of the test data at mean +/- 3 standard deviations.
# Only the columns listed in test_data_numeric (numeric before encoding) are
# capped: the codes of encoded categorical columns are labels, and clipping them
# would silently turn rare categories into different ones. 'Id' is an identifier
# and is neither capped nor plotted.
# NOTE(review): limits are computed from the test frame itself, so they differ
# from the limits applied to the training data.
capped_cols = [name for name in test_data_numeric if name != 'Id']
for col in capped_cols:
    col_mean, col_std = test_data[col].mean(), test_data[col].std()
    upper_limit = int(col_mean + 3*col_std)
    lower_limit = int(col_mean - 3*col_std)
    test_data[col] = test_data[col].clip(lower=lower_limit, upper=upper_limit)

# One horizontal boxplot per capped column to inspect the resulting distribution.
for col in capped_cols:
    fig, ax = plt.subplots(figsize=(10,1))
    sns.boxplot(x=test_data[col], orient='h', ax=ax)
    plt.show()
    plt.close(fig)  # release the figure; many figures are created in this loop

In [None]:
# Preview the training data after outlier capping.
train_data.head(n=5)

In [None]:
# Preview the test data after outlier capping.
test_data.head(n=5)

# **DATA CORRELATION**

In [None]:
# Pairwise Pearson correlation between all training columns.
train_data.corr(method='pearson')

In [None]:
# Pairwise Pearson correlation between all test columns.
test_data.corr(method='pearson')

# **REGRESSION USING NEURAL NETWORKS**

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features are every column except the target. Selecting by name avoids relying
# on 'SalePrice' being the last column or on the frame having exactly 1460 rows.
# NOTE(review): 'Id' stays in the feature set because the prediction cell passes
# the full test_data frame (which includes 'Id') to model.predict.
X = train_data.drop(columns='SalePrice')
y = train_data['SalePrice']

# MLPs are sensitive to feature scale, so the inputs are standardized before the
# network; a fixed random_state makes the weight initialization reproducible.
model = make_pipeline(
    StandardScaler(),
    MLPRegressor(activation='relu', solver='adam', alpha=0.0001, batch_size='auto',
                 learning_rate='constant', random_state=42),
)

model.fit(X, y)

In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Predictions for the competition test set (used to build the submission).
pred = model.predict(test_data)

# test_data is loaded from a separate file and is passed to predict without any
# target column, so comparing `pred` with the training labels `y` would pair up
# unrelated houses. Estimate the error on labeled rows instead: refit an unfitted
# copy of the model on 80% of the training data and score it on the held-out 20%.
X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
val_pred = clone(model).fit(X_fit, y_fit).predict(X_val)

print("Hold-out validation RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))

In [None]:
# Assemble the submission table: the test 'Id' column next to its predicted 'SalePrice'.
df = test_data[['Id']].assign(SalePrice=pred)
df

In [None]:
# Write the submission table to disk without the DataFrame index column.
df.to_csv(path_or_buf='submission.csv', index=False)