In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Import visualization libraries 
import matplotlib.pyplot as plt # Matlab-style plotting
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import classification_report, confusion_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## First, we read the data into pandas DataFrames

In [None]:
# Load the training and test sets from the competition input directory
train, test = map(pd.read_csv, (
    '../input/house-prices-advanced-regression-techniques/train.csv',
    '../input/house-prices-advanced-regression-techniques/test.csv',
))

# Now let's start understanding our data

In [None]:
# (rows, columns) of each dataset, separated by a divider line
print(train.shape, '*' * 10, test.shape, sep='\n')

In [None]:
# let's try to understand our data
train.head()

In [None]:
# Column dtypes and non-null counts for both datasets.
# The non-null counts show that several columns contain missing values we need to deal with.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
train.info()
print('*' * 30)
test.info()

# Dealing with missing values

### First, identify the missing data

In [None]:
# Finding null values 
# As our data is large so we better visualize them
train.isnull().sum()

In [None]:
# Count and percentage of missing values per training feature, most-missing first
null_mask = train.isnull()
missing_data = pd.DataFrame({
    'Total': null_mask.sum(),
    'Percent': null_mask.mean() * 100,
}).sort_values('Total', ascending=False)

# With this many features a bar chart is easier to read than the raw table
f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
# Rotate the labels after plotting and with a numeric angle: the string '90'
# is not a valid rotation in newer matplotlib versions.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
missing_data.head()

In [None]:
# Count and percentage of missing values per test feature, most-missing first
null_mask = test.isnull()
missing_data = pd.DataFrame({
    'Total': null_mask.sum(),
    'Percent': null_mask.mean() * 100,
}).sort_values('Total', ascending=False)

f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
# Rotate the labels after plotting and with a numeric angle: the string '90'
# is not a valid rotation in newer matplotlib versions.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
missing_data.head()

In [None]:
# A common approach is to drop every column whose share of missing values exceeds 60%
high_missing_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
for frame in (train, test):
    frame.drop(high_missing_cols, axis=1, inplace=True)

In [None]:
# Now let's deal with the rest of the missing data
train.isnull().sum()

In [None]:
# We need to know the values we have to determine which approach to use when filling the data 
train.LotFrontage.nunique()

In [None]:
# Many LotFrontage values are missing, so filling them all with the mean would
# distort the distribution. Instead, fill each gap with a random integer drawn
# from [mean - std, mean + std).
LotFrontage_avg = train['LotFrontage'].mean()
LotFrontage_std = train['LotFrontage'].std()
LotFrontage_missing = train['LotFrontage'].isnull()
LotFrontage_null_count = int(LotFrontage_missing.sum())

# Seed the generator so that re-running the notebook gives the same imputed values
np.random.seed(42)
# randint works on integers, so truncate the float bounds explicitly
LotFrontage_null_random_list = np.random.randint(
    int(LotFrontage_avg - LotFrontage_std),
    int(LotFrontage_avg + LotFrontage_std),
    size=LotFrontage_null_count,
)
# Use a single .loc assignment: chained indexing (df[col][mask] = ...) may write
# to a temporary copy and is not supported under pandas copy-on-write.
train.loc[LotFrontage_missing, 'LotFrontage'] = LotFrontage_null_random_list
train['LotFrontage'] = train['LotFrontage'].astype(int)

In [None]:
# Same imputation for the test dataset: fill each missing LotFrontage with a
# random integer drawn from [mean - std, mean + std) of the test column.
LotFrontage_avg = test['LotFrontage'].mean()
LotFrontage_std = test['LotFrontage'].std()
LotFrontage_missing = test['LotFrontage'].isnull()
LotFrontage_null_count = int(LotFrontage_missing.sum())

# Seed the generator so that re-running the notebook gives the same imputed values
np.random.seed(42)
# randint works on integers, so truncate the float bounds explicitly
LotFrontage_null_random_list = np.random.randint(
    int(LotFrontage_avg - LotFrontage_std),
    int(LotFrontage_avg + LotFrontage_std),
    size=LotFrontage_null_count,
)
# Use a single .loc assignment: chained indexing (df[col][mask] = ...) may write
# to a temporary copy and is not supported under pandas copy-on-write.
test.loc[LotFrontage_missing, 'LotFrontage'] = LotFrontage_null_random_list
test['LotFrontage'] = test['LotFrontage'].astype(int)

In [None]:
# Missing-value count and percentage per training feature, largest first
null_counts = train.isnull().sum()
null_share = (null_counts / train.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

In [None]:
# Missing-value count and percentage per test feature, largest first
null_counts = test.isnull().sum()
null_share = (null_counts / test.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

In [None]:
# Before dealing with GarageFinish we need to remember it's data type 
# This will help us determine which way we fill the missing data
train.GarageFinish.dtype

In [None]:
# What are the values of GarageFinish
train.GarageFinish.value_counts(dropna=False)

In [None]:
# Check whether there is a relation between GarageCars and GarageFinish:
# show GarageCars for the rows where GarageFinish is missing
# (we found that GarageCars == 0 whenever GarageFinish is null).
finish_missing = train['GarageFinish'].isnull()
train.loc[finish_missing, 'GarageCars'].head(20)

In [None]:
test['GarageCars'][test['GarageFinish'].isnull() == True].head()

In [None]:
# Fill missing GarageCars with 0 in both datasets.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
train['GarageCars'] = train['GarageCars'].fillna(0)
test['GarageCars'] = test['GarageCars'].fillna(0)

### So we found that we can convert the missing values into a category of their own
#### We'll call it Nfn, which stands for "No Finish"

In [None]:
# Turn missing GarageFinish into its own category, 'Nfn'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['GarageFinish'] = frame['GarageFinish'].fillna('Nfn')

In [None]:
# Now we find all values in GarageType
train.GarageType.value_counts(dropna=False)

In [None]:
test.GarageType.value_counts(dropna=False)

In [None]:
# Same idea as for GarageFinish: missing GarageType becomes the category 'Nogarage'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['GarageType'] = frame['GarageType'].fillna('Nogarage')

In [None]:
train.GarageCond.value_counts(dropna=False)

In [None]:
test.GarageCond.value_counts(dropna=False)

In [None]:
# Missing GarageCond becomes the category 'NG'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['GarageCond'] = frame['GarageCond'].fillna('NG')

In [None]:
# Remaining training features and their missing-value count / percentage
null_counts = train.isnull().sum()
null_share = (null_counts / train.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

In [None]:
# Remaining test features and their missing-value count / percentage
null_counts = test.isnull().sum()
null_share = (null_counts / test.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

In [None]:
train.GarageQual.value_counts(dropna=False)

In [None]:
test.GarageQual.value_counts(dropna=False)

In [None]:
# Fill missing GarageQual in both train and test with the category 'NG'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['GarageQual'] = frame['GarageQual'].fillna('NG')

In [None]:
train.GarageYrBlt.nunique(dropna=False)

In [None]:
# Side-by-side GarageYrBlt histograms; the test panel shows that the test data contains outliers
fig, (ax_train, ax_test) = plt.subplots(nrows=1, ncols=2, figsize=(15, 10))
train.GarageYrBlt.hist(ax=ax_train)
test.GarageYrBlt.hist(ax=ax_test)
ax_train.set_title('Train GarageYrBlt', fontsize=15)
ax_test.set_title('Test GarageYrBlt', fontsize=15)

In [None]:
# Side-by-side YearBuilt histograms for the train and test sets
fig, (ax_train, ax_test) = plt.subplots(nrows=1, ncols=2, figsize=(15, 10))
train.YearBuilt.hist(ax=ax_train)
ax_train.set_title('Train YearBuilt', fontsize=15)
test.YearBuilt.hist(ax=ax_test)
ax_test.set_title('Test YearBuilt', fontsize=15)
plt.tight_layout()

In [None]:
# As we can see that they are similar 
train[['GarageYrBlt', 'YearBuilt']]

In [None]:
# Also in the test dataset
test[['GarageYrBlt', 'YearBuilt']]

In [None]:
# So we can fill the missing values with it's corresponding YearBuilt data
train['YearBuilt'][train.GarageYrBlt.isnull() == True]

In [None]:
# Fill each missing GarageYrBlt with the YearBuilt value of the same row.
# fillna aligns the YearBuilt Series on the row index, so only the missing
# entries are replaced. The result is assigned back to the column because
# fillna(inplace=True) on a column selection is deprecated and leaves the frame
# unchanged under pandas copy-on-write.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['YearBuilt'])
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test['YearBuilt'])

In [None]:
# Ten training features with the most remaining missing values
null_counts = train.isnull().sum()
null_share = (null_counts / train.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(10)

In [None]:
# Ten test features with the most remaining missing values
null_counts = test.isnull().sum()
null_share = (null_counts / test.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(10)

In [None]:
train.BsmtFinType2.nunique(dropna=False)

In [None]:
train.BsmtFinType2.value_counts(dropna=False)

In [None]:
test.BsmtFinType2.value_counts(dropna=False)

In [None]:
train[['BsmtFinType2', 'BsmtFinSF2']]

In [None]:
# BsmtFinType2 value counts (including NaN) for train, then test
print(
    train.BsmtFinType2.value_counts(dropna=False),
    '*' * 40,
    test.BsmtFinType2.value_counts(dropna=False),
    sep='\n',
)

In [None]:
# Fill missing BsmtFinType2 with the most frequent value (mode) of the same dataset.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['BsmtFinType2'] = frame['BsmtFinType2'].fillna(frame['BsmtFinType2'].mode()[0])

In [None]:
train['BsmtFinSF1'][train.BsmtFinType1.isnull() == True].head(8)

In [None]:
# BsmtFinType1 value counts (including NaN) for train, then test
print(
    train.BsmtFinType1.value_counts(dropna=False),
    '*' * 40,
    test.BsmtFinType1.value_counts(dropna=False),
    sep='\n',
)

In [None]:
train.BsmtFinType1.mode()

In [None]:
# As with BsmtFinType2, fill missing BsmtFinType1 with the mode of the same dataset.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['BsmtFinType1'] = frame['BsmtFinType1'].fillna(frame['BsmtFinType1'].mode()[0])

In [None]:
# The basement features depend on each other, so the remaining categorical
# ones are filled the same way: each with the mode of its own dataset.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    for col in ('BsmtExposure', 'BsmtQual', 'BsmtCond'):
        frame[col] = frame[col].fillna(frame[col].mode()[0])

In [None]:
# What is still missing in the training data (top five features)
null_counts = train.isnull().sum()
null_share = (null_counts / train.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()

In [None]:
# What is still missing in the test data (top five features)
null_counts = test.isnull().sum()
null_share = (null_counts / test.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()

In [None]:
train.FireplaceQu.nunique(dropna=False)

In [None]:
# FireplaceQu value counts (including NaN) for train, then test
print(
    train.FireplaceQu.value_counts(dropna=False),
    '*' * 40,
    test.FireplaceQu.value_counts(dropna=False),
    sep='\n',
)

In [None]:
# As we can see here the FireplaceQu values are missing when Fireplaces value is equal to 0
train[['Fireplaces', 'FireplaceQu']].head(20)

In [None]:
# Fill missing FireplaceQu with the 'NG' category.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is deprecated and leaves the frame unchanged under pandas copy-on-write.
for frame in (train, test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('NG')

In [None]:
# What are the values of MasVnrType
train.MasVnrType.value_counts(dropna=False)

In [None]:
# What are the values of MasVnrArea
train.MasVnrArea.value_counts(dropna=False).head(20)

In [None]:
# We need to find the relation between MasVnrType & MasVnrArea
# As there are null values in both of them we can't get any information like the others before 
train['MasVnrType'][train['MasVnrArea'].isnull() == True]

In [None]:
# Drop the training rows that still contain missing values.
train.dropna(inplace=True)

# Do NOT drop rows from the test set: every test Id needs a prediction in the
# submission. Fill whatever is still missing with statistics taken from the
# training data instead (median for numeric columns, mode for the others).
for col in test.columns[test.isnull().any()]:
    if pd.api.types.is_numeric_dtype(test[col]):
        fill_value = train[col].median()
    else:
        fill_value = train[col].mode()[0]
    test[col] = test[col].fillna(fill_value)

In [None]:
# Final check of the training data in case anything was missed
null_counts = train.isnull().sum()
null_share = (null_counts / train.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()

In [None]:
# The training dataset is now clean; run the same final check on the test
# dataset in case anything was missed there.
null_counts = test.isnull().sum()
null_share = (null_counts / test.isnull().count()) * 100
missing_data = pd.concat(
    [series.sort_values(ascending=False) for series in (null_counts, null_share)],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()

In [None]:
(train.shape, test.shape)

# Time for some EDA (Exploratory Data Analysis)

A Pearson correlation heatmap over every feature is hard to read because the dataset has so many columns.

In [None]:
# Correlation heatmap of the numeric features.
# Restrict to numeric columns first: DataFrame.corr() raises on the remaining
# text columns in pandas >= 2.0 instead of silently skipping them.
numeric_corr = train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(17, 10))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)

In [None]:
# The 11 largest correlations with SalePrice (SalePrice itself comes first).
# Restrict to numeric columns first: DataFrame.corr() raises on the remaining
# text columns in pandas >= 2.0 instead of silently skipping them.
train.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False).head(11)

In [None]:
train.OverallQual.value_counts(dropna=False)

In [None]:
# Number of houses at each OverallQual rating
fig, ax = plt.subplots(figsize=(17, 10))
sns.countplot(x='OverallQual', data=train, ax=ax)

In [None]:
# SalePrice bar plot for each OverallQual rating
fig, ax = plt.subplots(figsize=(17, 10))
sns.barplot(x='OverallQual', y='SalePrice', data=train, ax=ax)

In [None]:
train.GrLivArea.nunique(dropna=False)

In [None]:
# Scatter plot of SalePrice against GrLivArea
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(x=train.GrLivArea, y=train.SalePrice, edgecolors="black")

In [None]:
# Linear fit of SalePrice against GrLivArea.
# seaborn renamed the `size` keyword to `height`; the old name is rejected by
# current releases.
sns.lmplot(x='GrLivArea', y='SalePrice', data=train, height=10)

In [None]:
train.GarageCars.nunique(dropna=False)

In [None]:
train.GarageCars.value_counts(dropna=False)

In [None]:
# Number of houses for each GarageCars value
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x='GarageCars', data=train, ax=ax)

In [None]:
# SalePrice distribution for each GarageCars value
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(x='GarageCars', y='SalePrice', data=train, ax=ax)

In [None]:
# SalePrice bar plot for each GarageCars value
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x='GarageCars', y='SalePrice', data=train, ax=ax)

In [None]:
train.GarageArea.nunique()

In [None]:
# Categorical features that will be one-hot encoded
columns = [
    'MSZoning',
    'Street',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Heating',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'SaleType',
    'SaleCondition',
]

In [None]:
final_df = pd.concat([train, test], axis=0)

In [None]:
final_df.shape

In [None]:
def One_hot_encoding(columns, df=None):
    """One-hot encode the given categorical columns and return the new frame.

    Parameters
    ----------
    columns : iterable of str
        Names of the categorical columns to encode.
    df : pandas.DataFrame, optional
        Frame to encode. Defaults to the notebook-level ``final_df`` so the
        existing ``One_hot_encoding(columns)`` call keeps working.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every listed column is replaced by its dummy
        columns (first level dropped). The untouched columns come first,
        followed by the dummy columns in the order of ``columns``.

    Notes
    -----
    Dummy columns are named ``<feature>_<level>``. Naming them by the bare
    level makes different features that share a level produce identical
    column names, which cannot be told apart afterwards. The input frame is
    not modified.
    """
    frame = final_df if df is None else df
    # Passing the frame together with `columns=` lets pandas prefix every dummy
    # column with its source feature and keep the remaining columns in place.
    return pd.get_dummies(frame, columns=list(columns), drop_first=True)

In [None]:
final_df = One_hot_encoding(columns)

In [None]:
final_df.shape

In [None]:
# Keep only the first occurrence of each column name.
# NOTE(review): when dummy columns are named only by their category level,
# features that share a level name collide here and all but the first are
# discarded — confirm that this is intended.
final_df =final_df.loc[:,~final_df.columns.duplicated()]

In [None]:
final_df.shape

In [None]:
# Split the combined frame back into its train and test parts by position.
# `train` was concatenated first, so the first len(train) rows are the training
# rows; deriving the boundary from the data avoids a hard-coded row count that
# breaks as soon as the cleaning steps keep a different number of rows.
n_train_rows = len(train)
# .copy() gives independent frames, so later edits do not act on a slice of final_df
df_Train = final_df.iloc[:n_train_rows, :].copy()
df_Test = final_df.iloc[n_train_rows:, :].copy()

In [None]:
df_Train.shape

In [None]:
df_Test.shape

In [None]:
# The test rows have no target, so remove the SalePrice column that the concat introduced.
# Rebind the name instead of dropping in place: df_Test was taken from final_df
# with iloc, and in-place edits on such a slice trigger SettingWithCopyWarning.
df_Test = df_Test.drop(columns=['SalePrice'])

In [None]:
# Separate the target from the features of the training frame
target_col = 'SalePrice'
y_train = df_Train[target_col]
X_train = df_Train.drop([target_col], axis=1)

In [None]:
X_train.shape

## Using GradientBoosting with RandomizedSearch 

In [None]:
# Hyper-parameter candidates for the gradient-boosting regressor
num_estimators = [500, 1000]
learn_rates = [0.02, 0.05]
max_depths = [1, 2]
min_samples_leaf = [5, 10]
min_samples_split = [5, 10]

param_grid = {'n_estimators': num_estimators,
              'learning_rate': learn_rates,
              'max_depth': max_depths,
              'min_samples_leaf': min_samples_leaf,
              'min_samples_split': min_samples_split}

# The grid holds only 2**5 = 32 combinations. Asking for more iterations than
# that makes scikit-learn warn and fall back to trying each combination once,
# so cap n_iter at the size of the grid.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

random_search = RandomizedSearchCV(
    # random_state on the estimator makes the fitted trees reproducible between runs
    GradientBoostingRegressor(loss='huber', random_state=1),
    param_grid,
    random_state=1,
    n_iter=min(100, n_candidates),
    cv=5,
    verbose=0,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

In [None]:
# Best params
random_search.best_params_

In [None]:
# The search cell above already fitted every candidate and, because refit=True
# is the RandomizedSearchCV default, retrained the best one on the full
# training data. Calling fit() again would only repeat the whole search, so
# just inspect the refit model.
random_search.best_estimator_

In [None]:
# Score of the refit best model on the training data itself, printed as a percentage.
# NOTE(review): for a regressor searched without an explicit `scoring`, score()
# is presumably the estimator's default R^2 rather than a classification
# accuracy, and a score on the training data is optimistic — confirm, and
# compare with random_search.best_score_ (the cross-validated score).
gboost_score=random_search.score(X_train,y_train)
print(f'{round(gboost_score * 100, 2)}%')

In [None]:
# Predictions
pred = random_search.predict(df_Test)

In [None]:
pred

In [None]:
# Build the submission file.
# Predictions are matched to the submission Ids through the Id column of the
# rows that were actually predicted, not by row position: if any test rows were
# removed during cleaning, pairing by position would attach predictions to the
# wrong Ids and leave the last Ids empty.
sample = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
pred_by_id = pd.Series(pred, index=df_Test['Id'].to_numpy())

data = sample[['Id']].copy()
data['SalePrice'] = data['Id'].map(pred_by_id)
# Ids without a prediction (rows dropped earlier) fall back to the median
# prediction so that the file still contains a value for every required Id.
data['SalePrice'] = data['SalePrice'].fillna(pred_by_id.median())
data.to_csv('sample_submission1.csv', index=False)

In [None]:
data.tail()

In [None]:
data.head()

In [None]:
data.shape