In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Importing The Training Dataset
df = pd.read_csv('../input/house-price-prediction-dataset/train.csv')
# Displaying all the columns.
# None removes the column cap entirely: a fixed limit of 81 would start hiding
# columns as soon as later cells append the '<feature>nan' indicator columns.
pd.set_option('display.max_columns', None)

In [None]:
# Preview the first rows of the freshly loaded training data
df.head()

In [None]:
# (number of rows, number of columns) of the training data
df.shape

In [None]:
# Getting The Feature Names (the column index of the training frame)
df.columns

# Data Preprocessing And EDA

In the data preprocessing section we deal with problems the dataset may contain, such as missing values and outliers.
This dataset does not have any duplicate values. Data preprocessing cleans the dataset so that it is free of such errors
before EDA.

### Missing values

In [None]:
# Lift the row-display limit so the per-column report below is not truncated
pd.set_option('display.max_rows', None)
# Number of missing values in each column
print(df.isna().sum())

The dataset contains many null values, so we have to examine the relationship between the null values and SalePrice.
The diagrams used below show the relationship between null values and SalePrice.

### Numerical Variable

In [None]:
# Every column whose dtype is not object is treated as a numerical variable
numerical_features = [
    column
    for column in df.columns
    if df[column].dtypes != object
]
print('Number Of Numerical Variables : ', len(numerical_features))
df[numerical_features].head()

### Date-Time Variables / Temporal Variables

In [None]:
# Temporal columns are the numerical ones whose name contains 'Yr' or 'Year'
year_features = [
    column
    for column in numerical_features
    if any(tag in column for tag in ('Yr', 'Year'))
]
year_features

We can get the age of a house at the time of sale as the difference between YrSold and YearBuilt.

In [None]:
# Median sale price per year of sale, drawn as a line chart
plt.style.use('dark_background')
median_price_by_year = df.groupby('YrSold')['SalePrice'].median()
ax = median_price_by_year.plot()
ax.set_xlabel('Year Sold')
ax.set_ylabel('Median House Price')
ax.set_title('Year Sold VS House Price')
plt.tight_layout()
plt.show()

In [None]:
# Now we'll see the difference between Year Sold And All the year variables.
# NOTE(review): each year column is overwritten in df with (YrSold - year), so
# this cell is not idempotent (a second run applies the subtraction again) and
# every later cell sees the differences rather than the original calendar years.
for feature in year_features :
    # YrSold is the reference year, so it is not compared with itself
    if feature != 'YrSold':
        # Years elapsed between this event and the sale, stored back into df
        df[feature] = df['YrSold'] - df[feature]
        # One scatter plot per year feature: elapsed years against sale price
        plt.scatter(df[feature], df['SalePrice'])
        plt.xlabel(feature)
        plt.ylabel('Sale Price')
        plt.tight_layout()
        plt.show()

In [None]:
# Discrete Variable
# A numerical feature counts as discrete when it has fewer than 25 distinct values.
# Year features and the 'Id' identifier column are excluded; the exclusion list
# must hold the plain string 'Id' (a nested [['ID']] never matches a column name).
discrete_features = [feature for feature in numerical_features if len(df[feature].unique())<25 and feature not in year_features + ['Id']]
print('Number Of Discrete Features : ', len(discrete_features))
df[discrete_features].head()

In [None]:
# One bar chart per discrete feature: median sale price for each distinct value
for feature in discrete_features:
    median_price = df.groupby(feature)['SalePrice'].median()
    ax = median_price.plot.bar()
    ax.set_xlabel(feature)
    ax.set_ylabel('Sale Price')
    ax.set_title(feature)
    plt.tight_layout()
    plt.show()

We can see that there is a relationship between the discrete variables and SalePrice.

In [None]:
# Continuous Variable
# Numerical features that are neither discrete, nor year features, nor the 'Id' column
continuous_features = [feature for feature in numerical_features if feature not in discrete_features +year_features +['Id']]
# The label now names the quantity actually being counted (continuous, not discrete)
print('Number Of Continuous Features : ', len(continuous_features))
df[continuous_features].head()

In [None]:
# Distribution of every continuous feature as a 25-bin histogram
for feature in continuous_features:
    ax = plt.gca()
    ax.hist(df[feature], bins=25)
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_title(feature)
    plt.tight_layout()
    plt.show()

### Outliers

In [None]:
# Box plots of the log-transformed continuous features, to expose outliers.
# NOTE(review): the log values are written back into df, so later cells work on
# the transformed columns and re-running this cell applies the log again.
for feature in continuous_features:
    # Features that contain a zero are left untouched and get no plot
    if 0 in df[feature].unique():
        continue
    plt.style.use('default')
    df[feature] = np.log(df[feature])
    df.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.tight_layout()
    plt.show()

In [None]:
# Categorical variables are the columns stored with the object dtype
categorical_features = [
    column
    for column in df.columns
    if df[column].dtypes == object
]
# Report how many distinct labels each categorical column holds
for feature in categorical_features:
    n_categories = len(df[feature].unique())
    print('In {} number of categories are {}'.format(feature, n_categories))
df[categorical_features].head()

In [None]:
# One bar chart per categorical feature: median sale price for each label
for feature in categorical_features:
    plt.style.use('dark_background')
    median_price = df.groupby(feature)['SalePrice'].median()
    ax = median_price.plot.bar()
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')
    ax.set_title(feature)
    plt.tight_layout()
    plt.show()

# Feature Engineering

In [None]:
# Replacing null values of categorical feature with a new label.
# A column qualifies as soon as it has at least one missing value; the previous
# "> 1" test skipped columns with exactly one missing entry.
cat_na_features = [feature for feature in df.columns if df[feature].isna().any() and df[feature].dtypes == 'O']

# Missing entries become their own explicit 'Missing' category
df[cat_na_features] = df[cat_na_features].fillna('Missing')

In [None]:
# Preview the frame after the categorical missing values were relabelled
df.head()

In [None]:
# Dealing with missing values in numerical variables.
# A column qualifies as soon as it has at least one missing value; the previous
# "> 1" test skipped columns with exactly one missing entry.
num_na_features = [feature for feature in df.columns if df[feature].isna().any() and df[feature].dtypes != object]

for feature in num_na_features:

    # Replacing it by median values because there are outliers
    median_value = df[feature].median()

    # Creating a new feature for capturing nan values (1 = value was missing)
    df[feature + 'nan'] = np.where(df[feature].isna(), 1, 0)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[feature]: that chained in-place call works on an intermediate object
    # and does not update df under pandas Copy-on-Write.
    df[feature] = df[feature].fillna(median_value)

In [None]:
# Inspect the first 50 rows after the numerical missing values were filled
df.head(50)

# Handling Rare Categorical Feature

Categories that appear in 1% or less of the observations are replaced with a single `Rare_var` label.

In [None]:
# Display the list of categorical column names used by the rare-label step below
categorical_features

In [None]:
# Collapse infrequent labels: a label is kept only when it covers more than 1%
# of the rows; every other value in the column becomes 'Rare_var'.
for feature in categorical_features:
    label_share = df.groupby(feature)['SalePrice'].count() / len(df)
    frequent_labels = label_share[label_share > 0.01].index
    is_frequent = df[feature].isin(frequent_labels)
    df[feature] = np.where(is_frequent, df[feature], 'Rare_var')

In [None]:
# Inspect the first 100 rows after the rare labels were collapsed
df.head(100)

In [None]:
# Encoding
# Each listed categorical column is replaced by integer codes from LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Hard-coded names of the columns to encode
enc_features = ['MSZoning','Street','Alley','LotShape','LandContour','Utilities',
                'LotConfig','LandSlope','Neighborhood',
                'Condition1','Condition2','BldgType','HouseStyle','RoofStyle',
                'RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual',
                'ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure',
                'BsmtFinType1','BsmtFinType2','Heating','HeatingQC',
                'CentralAir','Electrical','KitchenQual','Functional',
                'FireplaceQu','GarageType','GarageFinish','GarageQual',
                'GarageCond','PavedDrive','PoolQC','Fence','MiscFeature',
                'SaleType','SaleCondition']
# The single encoder is re-fitted on every column, so after the loop `le` only
# remembers the classes of the last column in the list.
for feature in enc_features :
    df[feature] = le.fit_transform(df[feature])

df.head(10)

# Feature Scaling

In [None]:
# Every column except the identifier and the target is scaled
feature_scale = [feature for feature in df.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn each column's minimum and maximum from the whole frame.
# NOTE(review): the fit runs before the train/test split further down, so the
# held-out rows also influence the scaling parameters — confirm this is intended.
scaler.fit(df[feature_scale])

In [None]:
# Write the scaled values back into df. scaler.transform returns a new array and
# leaves its input untouched, so without this assignment nothing is ever scaled.
# Run this cell once per scaler fit: running it again would rescale values that
# are already scaled.
df[feature_scale] = pd.DataFrame(
    scaler.transform(df[feature_scale]),
    columns=feature_scale,
    index=df.index,
)
df[feature_scale].head()

# Feature Selection

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows; random_state fixes the shuffle so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(df, df['SalePrice'], test_size = 0.1, random_state = 0)
# DataFrame.drop returns a new frame, so the result has to be assigned back.
# Otherwise 'Id' and the target 'SalePrice' stay in the feature matrices and the
# target leaks into the inputs.
X_train = X_train.drop(['Id','SalePrice'], axis =1)
X_test = X_test.drop(['Id','SalePrice'], axis =1)
X_test