In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the required libraries:

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,accuracy_score
import math
import seaborn as sns

# Importing Data:

In [None]:
# Read the train and test CSV files into two DataFrames, then preview the training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)
train.head()

# Data Analysis:

In [None]:
print(train.shape)
print(test.shape)

In [None]:
# Stack the train and test rows into one frame (row-wise concat, original
# index labels are kept, so labels repeat between the two parts).
comp_data = pd.concat((train, test))
# Take a real copy: a plain assignment would only create a second name for the
# same object, so any in-place change would show up under both names.
comp_data_1 = comp_data.copy()
comp_data.shape

In [None]:
train.info()

In [None]:
plt.figure(figsize=(16,9))
sns.heatmap(train.isnull())

In [None]:
# Share of missing entries per column; keep only the columns that have any
# missing values and order them from least to most incomplete.
null_share = train.isnull().sum() / len(train)
missing = null_share[null_share > 0].sort_values()
missing

# Training Data Missing Values Handling

In [None]:
## Start filling the missing values seen in the heatmap above:
## LotFrontage gaps are replaced by the column mean.
lot_frontage_mean = train['LotFrontage'].mean()
train['LotFrontage'] = train['LotFrontage'].fillna(lot_frontage_mean)

In [None]:
## The Alley feature has the most missing values, so drop the column:
train = train.drop(columns=['Alley'])

In [None]:
## Basement features: replace missing entries with each column's most frequent value.
for col in ['BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType2']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
# Fill FireplaceQu and GarageType with their most frequent value.
for col in ['FireplaceQu', 'GarageType']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
## Garage features in one go: GarageYrBlt is removed, the remaining
## garage columns are filled with their most frequent value.
train = train.drop(columns=['GarageYrBlt'])
for col in ['GarageFinish', 'GarageQual', 'GarageCond']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
## Drop the remaining columns that are mostly missing:
train = train.drop(columns=['PoolQC', 'Fence', 'MiscFeature'])

In [None]:
train.shape

In [None]:
# Remove the Id column from the training frame.
train = train.drop(columns=['Id'])

In [None]:
train.isnull().sum()

In [None]:
## A few gaps are left: fill the masonry veneer columns with their most frequent value.
for col in ['MasVnrArea', 'MasVnrType']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
plt.figure(figsize=(16,9))
sns.heatmap(train.isnull())

In [None]:
# Drop every row that still has a missing value in any column.
train = train.dropna()

In [None]:
sns.heatmap(train.isnull())

## Handle Categorical Values:

In [None]:
# Hard-coded names of the categorical columns to one-hot encode
# (presumably taken once from train.select_dtypes(exclude=[np.number]).columns).
cat_val = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition',
]
len(cat_val)

In [None]:
## One-hot encode several categorical columns at once:
def onehot_encod(multcols, data=None):
    """Replace each column named in ``multcols`` with one-hot indicator columns.

    Parameters
    ----------
    multcols : iterable of str
        Names of the categorical columns to encode.
    data : pandas.DataFrame, optional
        Frame to encode. When omitted, the notebook-level ``train_2`` frame is
        used, so the existing call ``onehot_encod(cat_val)`` keeps working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as the input: the columns that were not
        encoded come first, followed by the indicator columns of each encoded
        column. The first level of every encoded column is dropped
        (``drop_first=True``). The input frame is not modified.

    Raises
    ------
    KeyError
        If a name in ``multcols`` is not a column of the frame.
    """
    frame = train_2 if data is None else data
    columns = list(multcols)
    if not columns:
        # Nothing to encode: hand back an independent copy of the input.
        return frame.copy()
    # The indicator columns have to be attached side by side (column-wise);
    # stacking them as extra rows produces a mostly-NaN frame. get_dummies
    # with ``columns=`` does this and prefixes every indicator with its source
    # column name, so levels shared by several columns (e.g. 'Gd', 'TA') do
    # not end up as duplicate column names.
    return pd.get_dummies(frame, columns=columns, drop_first=True)

In [None]:
# Keep an independent snapshot of the cleaned training frame.
copy_train = train.copy(deep=True)

## Test data Cleaning:

In [None]:
test.isnull().sum()

In [None]:
plt.figure(figsize=(16,9))
sns.heatmap(test.isnull())

In [None]:
# First cleaning pass on the test frame.
# LotFrontage gaps are replaced by the column mean.
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].mean())

# These columns are filled with their most frequent value.
mode_filled = [
    'MSZoning',
    'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType2',
    'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'MasVnrArea', 'MasVnrType',
]
for col in mode_filled:
    test[col] = test[col].fillna(test[col].mode()[0])

# These columns are removed from the test frame altogether.
test = test.drop(columns=['Alley', 'GarageYrBlt', 'PoolQC', 'Fence', 'MiscFeature', 'Id'])

In [None]:
plt.figure(figsize=(16,9))
sns.heatmap(test.isnull())

In [None]:
## Lets check for some null values if any:
test.loc[:,test.isnull().any()].head()

In [None]:
# Second cleaning pass on the test frame: fill the columns that still have gaps.
# Filled with the most frequent value of the column:
fill_with_mode = [
    'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinType1',
    'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'SaleType',
]
# Filled with the mean of the column:
fill_with_mean = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'GarageCars', 'GarageArea',
]

for col in fill_with_mode:
    test[col] = test[col].fillna(test[col].mode()[0])
for col in fill_with_mean:
    test[col] = test[col].fillna(test[col].mean())

In [None]:
test.shape

## Encoding:

In [None]:
# Stack the cleaned train rows on top of the cleaned test rows.
train_2 = pd.concat([train, test])

In [None]:
train_2['SalePrice']

In [None]:
train_2.shape

In [None]:
# Now lets send it to OnehotEncoding:
train_2 = onehot_encod(cat_val)

In [None]:
train_2.shape

In [None]:
## Remove rows that are exact duplicates of an earlier row, then check the size:
train_2 = train_2.drop_duplicates()
train_2.shape

In [None]:
# Split the encoded frame back into its train and test parts by position.
# The row counts come from the frames that were concatenated into train_2
# instead of being hard-coded (previously 1422 and 2881), so the split keeps
# working if the cleaning steps change how many rows survive.
n_train, n_test = len(train), len(test)
# .copy() yields independent frames, so later edits do not act on slices of train_2.
train = train_2.iloc[:n_train, :].copy()
test = train_2.iloc[n_train:n_train + n_test, :].copy()
print(train.shape)
test.shape

In [None]:
# The test part has no target: remove the SalePrice column.
# Rebinding (rather than an in-place drop) avoids mutating a slice of train_2,
# which is what triggers pandas' SettingWithCopyWarning.
test = test.drop(columns=['SalePrice'])

# Applying Algorithm and Predictions:

In [None]:
# Separate the target (SalePrice) from the feature columns.
y_train = train['SalePrice']
X_train = train.drop(columns=['SalePrice'])

In [None]:
test.tail()

## XGBoost:

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Early stopping monitors a validation set; without an eval_set the
# early_stopping_rounds setting either raises at fit time or is ignored,
# depending on the XGBoost version. Hold out 10% of the training rows for it
# (fixed seed so the split is reproducible).
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)
regressor = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.01, early_stopping_rounds=10)
# verbose=False keeps the per-round evaluation log out of the cell output.
regressor.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
y_pred = regressor.predict(test)

In [None]:
# Build the submission: the Id column of the sample submission next to the
# predictions, written to a CSV file without the index.
sub_file = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
pred = pd.DataFrame(y_pred)
xg_pred = pd.concat([sub_file['Id'], pred], axis=1)
xg_pred.columns = ['Id', 'SalePrice']
xg_pred.to_csv('./xgbnono_O_2.csv', index=False)

# --------------------------------------------------------------------------