In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import neighbors
from xgboost import XGBRegressor


%matplotlib inline

In [None]:
# Load train.csv and test.csv of the House Prices competition from the Kaggle input directory
d_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
d_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Display the training frame as the cell output
d_train

In [None]:
# Print the (rows, columns) shape of the train and test frames

print(d_train.shape, d_test.shape)

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes

d_train.info()

In [None]:
# Count the null values per column and show the 20 columns with the most of them

d_train.isna().sum().sort_values(ascending=False).head(20)

In [None]:
# Total number of missing cells in the training frame.
# Despite its name, `df` holds a scalar count here, not a DataFrame.
df= d_train.isna().sum().sum()# total missing values
df

In [None]:
# Share of missing cells in the training frame, in percent:
# missing cells / total cells (rows * columns) * 100
df / d_train.size * 100

In [None]:
# Heatmap of the null mask of the training frame (one cell per value, coloured by missingness)

null_mask = d_train.isna()
plt.figure(figsize=(15, 8))
sns.heatmap(null_mask, cmap='Paired')

In [None]:
# Replace missing values in these columns with the literal string 'None'
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    d_train[col] = d_train[col].fillna('None')


In [None]:
# Impute missing "LotFrontage" values with the column mean

d_train['LotFrontage'] = d_train['LotFrontage'].fillna(d_train['LotFrontage'].mean())

In [None]:
# Missing values in these columns are replaced with 0

for col in ('GarageYrBlt', 'MasVnrArea'):
    d_train[col] = d_train[col].fillna(0)

In [None]:
# Fill the gaps in each of these columns with that column's most frequent value

mode_filled_cols = [
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
    'MasVnrType', 'Electrical',
]
for col in mode_filled_cols:
    most_frequent = d_train[col].value_counts().idxmax()
    d_train[col] = d_train[col].fillna(most_frequent)

In [None]:
# Re-check the per-column null counts after imputation (top 20 columns)

d_train.isna().sum().sort_values(ascending=False).head(20)

In [None]:
# List the column names of the training frame
d_train.columns

In [None]:
# Heatmap of the null mask again, this time after the imputation cells above have run

null_mask = d_train.isna()
plt.figure(figsize=(15, 8))
sns.heatmap(null_mask, cmap='Paired')

**So we have cleaned our dataset, with zero null values left**

In [None]:
# Shape of the training frame after imputation
d_train.shape

In [None]:
# How strongly each numeric feature correlates with the target variable.
# Only numeric columns are passed to corr(): the frame still holds string columns
# at this point, and pandas >= 2.0 raises on them instead of silently dropping them.

corr = d_train.select_dtypes(include='number').corr()
# Reassign rather than sort in place, so re-running the cell is idempotent
corr = corr.sort_values('SalePrice', ascending=False)
corr.SalePrice.head(10)

# Data Visualization

**We want to visualize the columns most correlated with SalePrice**

In [None]:
# Plot SalePrice against six features, one panel each, in the first six slots
# of a 5x5 subplot grid on a single 25x20 figure (the remaining slots stay empty).
plt.figure(figsize=(25,20))

# Panel 1: bar plot of SalePrice by OverallQual
plt.subplot(5,5,1)
plt.title('OverallQual')
sns.barplot(x='OverallQual', y='SalePrice', data=d_train)

# Panel 2: scatter of SalePrice against GrLivArea (plain matplotlib, columns looked up via `data`)
plt.subplot(5,5,2)
plt.title('GrLivArea')
plt.scatter(x='GrLivArea', y='SalePrice', data=d_train)

# Panel 3: bar plot of SalePrice by GarageCars
plt.subplot(5,5,3)
plt.title('GarageCars')
sns.barplot(x='GarageCars', y='SalePrice', data=d_train)

# Panel 4: scatter of SalePrice against GarageArea
plt.subplot(5,5,4)
plt.title('GarageArea')
sns.scatterplot(x='GarageArea', y='SalePrice', data=d_train)

# Panel 5: scatter of SalePrice against TotalBsmtSF
plt.subplot(5,5,5)
plt.title('TotalBsmtSF')
sns.scatterplot(x='TotalBsmtSF', y='SalePrice', data=d_train)

# Panel 6: scatter of SalePrice against 1stFlrSF
plt.subplot(5,5,6)
plt.title('1stFlrSF')
sns.scatterplot(x='1stFlrSF', y='SalePrice', data=d_train)

# Label Encoding 

In [None]:
# Replace every object-dtype column of the training frame with integer label codes.
# One LabelEncoder instance is re-fitted on each column in turn, so the
# per-column mappings are not kept once the loop has finished.
label = LabelEncoder()
for i in d_train.columns:
    if d_train[i].dtypes == object:
        d_train[i]= label.fit_transform(d_train[i])

# Selecting the Features

In [None]:
# Target vector, and feature matrix made of every column except the target.
# NOTE(review): X presumably still contains the 'Id' column - confirm it is meant to be a feature.
y = d_train['SalePrice']
X = d_train.drop(columns='SalePrice')

# Train Test Split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Creating Model with different functions

In [None]:
def decision_tree_model(X_train, y_train, X_eval=None, y_eval=None):
    """Fit a decision tree regressor, print hold-out metrics and plot predictions.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_eval, y_eval : optional
        Hold-out features and target used for scoring and plotting. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used.

    Returns
    -------
    None
        R^2, MAE, MSE and RMSE are printed and a true-vs-predicted scatter
        plot is shown.
    """
    # Fall back to the notebook's global split so two-argument calls keep working
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    tree = DecisionTreeRegressor(random_state=1)
    tree.fit(X_train, y_train)
    y_prediction_tree = tree.predict(X_eval)

    # metrics of decision tree regressor
    mae = mean_absolute_error(y_eval, y_prediction_tree)
    mse = metrics.mean_squared_error(y_eval, y_prediction_tree)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

    print('Decision Tree: ', r2_score(y_eval, y_prediction_tree))
    print('Mean Absolute Error:', mae)
    print('Mean Square Error:', mse)
    print('Root Mean Square Error:', rmse)

    # Visualizing
    plt.figure(figsize=(15,8))
    # A fixed colour is given via `c`, so no colormap applies; passing `cmap`
    # as well is ignored by matplotlib (recent versions also warn about it).
    plt.scatter(y_eval, y_prediction_tree, c='blue')
    plt.yscale('log')
    plt.xscale('log')

    # Diagonal reference line spanning the joint range of true and predicted values
    p1 = max(max(y_prediction_tree), max(y_eval))
    p2 = min(min(y_prediction_tree), min(y_eval))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.show()


In [None]:
# Fit the decision tree on the training split and report its metrics and plot
decision_tree_model(X_train, y_train)

In [None]:
def rand_forest(X_train, y_train, X_eval=None, y_eval=None):
    """Fit a random forest regressor, print hold-out metrics and plot predictions.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_eval, y_eval : optional
        Hold-out features and target used for scoring and plotting. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used.

    Returns
    -------
    None
        R^2, MAE, MSE and RMSE are printed and a true-vs-predicted scatter
        plot is shown.
    """
    # Fall back to the notebook's global split so two-argument calls keep working
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    y_prediction_forest = forest.predict(X_eval)

    # metrics of random forest
    mae = mean_absolute_error(y_eval, y_prediction_forest)
    mse = metrics.mean_squared_error(y_eval, y_prediction_forest)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

    # Printing
    print('Random Forest Regressor: ', r2_score(y_eval, y_prediction_forest))
    print('Mean Absolute Error:', mae)
    print('Mean Square Error:', mse)
    print('Root Mean Square Error:', rmse)

    # Visualizing
    plt.figure(figsize=(15,8))
    # A fixed colour is given via `c`, so no colormap applies; passing `cmap`
    # as well is ignored by matplotlib (recent versions also warn about it).
    plt.scatter(y_eval, y_prediction_forest, c='brown')
    plt.yscale('log')
    plt.xscale('log')

    # Diagonal reference line spanning the joint range of true and predicted values
    p1 = max(max(y_prediction_forest), max(y_eval))
    p2 = min(min(y_prediction_forest), min(y_eval))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.show()
    
    
    

In [None]:
# Fit the random forest on the training split and report its metrics and plot
rand_forest(X_train, y_train)

In [None]:
def knnreg(X_train, y_train, X_eval=None, y_eval=None):
    """Fit a k-nearest-neighbours regressor, print hold-out metrics and plot predictions.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_eval, y_eval : optional
        Hold-out features and target used for scoring and plotting. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used.

    Returns
    -------
    None
        R^2, MAE, MSE and RMSE are printed and a true-vs-predicted scatter
        plot is shown.
    """
    # Fall back to the notebook's global split so two-argument calls keep working
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    knn = neighbors.KNeighborsRegressor()
    knn.fit(X_train, y_train)
    y_prediction_knn = knn.predict(X_eval)

    # metrics of knn regressor
    mae = mean_absolute_error(y_eval, y_prediction_knn)
    mse = metrics.mean_squared_error(y_eval, y_prediction_knn)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

    # Printing
    print('Knn neighbors Regressor: ', r2_score(y_eval, y_prediction_knn))
    print('Mean Absolute Error:', mae)
    print('Mean Square Error:', mse)
    print('Root Mean Square Error:', rmse)

    # Visualizing
    plt.figure(figsize=(15,8))
    # A fixed colour is given via `c`, so no colormap applies; passing `cmap`
    # as well is ignored by matplotlib (recent versions also warn about it).
    plt.scatter(y_eval, y_prediction_knn, c='green')
    plt.yscale('log')
    plt.xscale('log')

    # Diagonal reference line spanning the joint range of true and predicted values
    p1 = max(max(y_prediction_knn), max(y_eval))
    p2 = min(min(y_prediction_knn), min(y_eval))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.show()

    
    

In [None]:
# Fit the k-nearest-neighbours regressor on the training split and report its metrics and plot
knnreg(X_train, y_train)

In [None]:
def xboost(X_train, y_train, X_eval=None, y_eval=None):
    """Fit an XGBoost regressor, print hold-out metrics and plot predictions.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_eval, y_eval : optional
        Hold-out features and target used for scoring and plotting. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used.

    Returns
    -------
    None
        R^2, MAE, MSE and RMSE are printed and a true-vs-predicted scatter
        plot is shown.
    """
    # Fall back to the notebook's global split so two-argument calls keep working
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    xgboost = XGBRegressor()
    xgboost.fit(X_train, y_train)
    y_pred_xgboost = xgboost.predict(X_eval)

    # metrics of xgboost regressor
    mae = mean_absolute_error(y_eval, y_pred_xgboost)
    mse = metrics.mean_squared_error(y_eval, y_pred_xgboost)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

    # Printing
    print('XGBoost Regressor: ', r2_score(y_eval, y_pred_xgboost))
    print('Mean Absolute Error:', mae)
    print('Mean Square Error:', mse)
    print('Root Mean Square Error:', rmse)

    # Visualizing
    plt.figure(figsize=(15,8))
    # A fixed colour is given via `c`, so no colormap applies; passing `cmap`
    # as well is ignored by matplotlib (recent versions also warn about it).
    plt.scatter(y_eval, y_pred_xgboost, c='crimson')
    plt.yscale('log')
    plt.xscale('log')

    # Diagonal reference line spanning the joint range of true and predicted values
    p1 = max(max(y_pred_xgboost), max(y_eval))
    p2 = min(min(y_pred_xgboost), min(y_eval))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.show()

    

In [None]:
# Fit the XGBoost regressor on the training split and report its metrics and plot
xboost(X_train, y_train)

In [None]:
def linear_reg(X_train, y_train, X_eval=None, y_eval=None):
    """Fit a linear regression, print hold-out metrics and plot predictions.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_eval, y_eval : optional
        Hold-out features and target used for scoring and plotting. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used.

    Returns
    -------
    None
        R^2, MAE, MSE and RMSE are printed and a true-vs-predicted scatter
        plot is shown.
    """
    # Fall back to the notebook's global split so two-argument calls keep working
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    lin = LinearRegression()
    lin.fit(X_train, y_train)
    y_pred_linear = lin.predict(X_eval)

    # metrics of linear regressor
    mae = mean_absolute_error(y_eval, y_pred_linear)
    mse = metrics.mean_squared_error(y_eval, y_pred_linear)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

    # Printing
    print('Linear Regressor: ', r2_score(y_eval, y_pred_linear))
    print('Mean Absolute Error:', mae)
    print('Mean Square Error:', mse)
    print('Root Mean Square Error:', rmse)

    # Visualizing
    plt.figure(figsize=(15,8))
    # No numeric `c` is supplied, so a colormap would have nothing to map;
    # the points are drawn in matplotlib's default colour.
    plt.scatter(y_eval, y_pred_linear)
    plt.yscale('log')
    plt.xscale('log')

    # Diagonal reference line spanning the joint range of true and predicted values
    p1 = max(max(y_pred_linear), max(y_eval))
    p2 = min(min(y_pred_linear), min(y_eval))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.show()

    

In [None]:
# Fit the linear regression on the training split and report its metrics and plot
linear_reg(X_train, y_train)

**Now it is time to apply our model to d_test**

In [None]:
# Shape of the test frame before any preprocessing
d_test.shape

In [None]:
# I'm going to fill the missing values in d_test using the same approach as for d_train

In [None]:
# Null counts per column of the test frame, largest first
d_test.isna().sum().sort_values(ascending=False)

In [None]:
# Replace missing values in these columns with the literal string 'None'
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    d_test[col] = d_test[col].fillna('None')

In [None]:
# Impute "LotFrontage" with the training-set mean, consistent with the other
# test-set imputations in this notebook, which all use d_train statistics.
# (d_train's own gaps were filled with this same mean, so the value is unchanged.)

d_test['LotFrontage'] = d_test['LotFrontage'].fillna(d_train['LotFrontage'].mean())

In [None]:
# Missing values in these columns are replaced with 0

for col in ('GarageYrBlt', 'MasVnrArea'):
    d_test[col] = d_test[col].fillna(0)

In [None]:
# Filling the missing data with the most frequent value of each column in the
# training data.
#
# The modes are taken from the raw training file rather than from d_train:
# by this point d_train's object columns have been label-encoded, so
# d_train[col].value_counts().idxmax() would return an integer code and inject
# it into d_test's string columns as a spurious extra category.

train_raw = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

train_mode_cols = [
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
    'MasVnrType', 'Electrical', 'MSZoning', 'BsmtFullBath', 'BsmtHalfBath',
    'Functional', 'Utilities', 'Exterior2nd', 'SaleType', 'Exterior1st',
    'KitchenQual',
]
for col in train_mode_cols:
    # value_counts() ignores NaN, so this is the most frequent observed value
    d_test[col] = d_test[col].fillna(train_raw[col].value_counts().idxmax())

In [None]:
# Fill the gaps in these test columns with the mean of the same column in d_train
for col in ['BsmtFinSF2', 'GarageArea', 'BsmtFinSF1', 'GarageCars', 'TotalBsmtSF', 'BsmtUnfSF']:
    d_test[col] = d_test[col].fillna(d_train[col].mean())


In [None]:
# Re-check the per-column null counts of the test frame after imputation (top 20 columns)
d_test.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Replace every object-dtype column of the test frame with integer label codes.
# Values are cast to str before encoding.
# NOTE(review): the encoder is re-fitted on d_test here, independently of the
# fit on d_train earlier in the notebook, so the same category can receive
# different codes in the two frames whenever their category sets differ -
# confirm, and consider reusing encoders fitted on the training data.
label = LabelEncoder()
for x in d_test.columns:
    if d_test[x].dtypes == object:
        d_test[x]= label.fit_transform(d_test[x].astype(str))

**XGBoost and Random Forest Regressor are the best predictors, with the lowest errors.
I'm going to use the XGBoost model for the submission.**

In [None]:
# Fit a fresh XGBoost regressor and predict SalePrice for every row of the test frame.
# NOTE(review): the model is fitted on X_train only (the 70% split), not on all
# labelled rows - consider fitting on X, y for the submission.
xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred_xgboost = xgboost.predict(d_test)
y_pred_xgboost.shape

In [None]:
# Display the predicted values
y_pred_xgboost

In [None]:
# Keep the test Ids for the submission file.
# d_test itself is left untouched: rebuilding it with columns=['ID'] (the column
# is spelled 'Id') would yield a single all-NaN column and make this cell fail
# with a KeyError when it is run a second time.
id_test = d_test['Id']

In [None]:
# Wrap the predictions in a one-column DataFrame named "SalePrice"
prediction = pd.DataFrame(y_pred_xgboost, columns=["SalePrice"])


In [None]:
# Put the Ids and the predictions side by side (column-wise concat, aligned on the index)
output = pd.concat([id_test, prediction],axis=1)

In [None]:
# Display the submission frame
output

# Submission

In [None]:
# Write the submission file to the working directory, without the index column
output.to_csv('submission.csv', index=False)