In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both competition files live in the same read-only input directory.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Data fields

Here's a brief version of what you'll find in the data description file.

* **SalePrice** - the property's sale price in dollars. This is the target variable that you're trying to predict.
* **MSSubClass**: The building class
* **MSZoning**: The general zoning classification
* **LotFrontage**: Linear feet of street connected to property
* **LotArea**: Lot size in square feet
* **Street**: Type of road access
* **Alley**: Type of alley access
* **LotShape**: General shape of property
* **LandContour**: Flatness of the property
* **Utilities**: Type of utilities available
* **LotConfig**: Lot configuration
* **LandSlope**: Slope of property
* **Neighborhood**: Physical locations within Ames city limits
* **Condition1**: Proximity to main road or railroad
* **Condition2**: Proximity to main road or railroad (if a second is present)
* **BldgType**: Type of dwelling
* **HouseStyle**: Style of dwelling
* **OverallQual**: Overall material and finish quality
* **OverallCond**: Overall condition rating
* **YearBuilt**: Original construction date
* **YearRemodAdd**: Remodel date
* **RoofStyle**: Type of roof
* **RoofMatl**: Roof material
* **Exterior1st**: Exterior covering on house
* Exterior2nd: Exterior covering on house (if more than one material)
* MasVnrType: Masonry veneer type
* MasVnrArea: Masonry veneer area in square feet
* ExterQual: Exterior material quality
* ExterCond: Present condition of the material on the exterior
* Foundation: Type of foundation
* BsmtQual: Height of the basement
* BsmtCond: General condition of the basement
* BsmtExposure: Walkout or garden level basement walls
* BsmtFinType1: Quality of basement finished area
* BsmtFinSF1: Type 1 finished square feet
* BsmtFinType2: Quality of second finished area (if present)
* BsmtFinSF2: Type 2 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* Heating: Type of heating
* HeatingQC: Heating quality and condition
* CentralAir: Central air conditioning
* Electrical: Electrical system
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* Bedroom: Number of bedrooms above basement level
* Kitchen: Number of kitchens
* KitchenQual: Kitchen quality
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Functional: Home functionality rating
* Fireplaces: Number of fireplaces
* FireplaceQu: Fireplace quality
* GarageType: Garage location
* GarageYrBlt: Year garage was built
* GarageFinish: Interior finish of the garage
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* GarageQual: Garage quality
* GarageCond: Garage condition
* PavedDrive: Paved driveway
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* PoolQC: Pool quality
* Fence: Fence quality
* MiscFeature: Miscellaneous feature not covered in other categories
* MiscVal: $Value of miscellaneous feature
* MoSold: Month Sold
* YrSold: Year Sold
* SaleType: Type of sale
* SaleCondition: Condition of sale

In [None]:
# Preview the first rows of the training frame as loaded from train.csv.
train.head()

In [None]:
# Keep an independent copy of the frame as loaded; `train` itself is modified
# in place by later cells (target transform, category relabelling, encoding, imputation).
train_df = train.copy()

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Number of missing values in each column.
train.isnull().sum()

### Checking for categorical features

In [None]:
# Collect the object-dtype columns that have at most 50 distinct values
# (missing values count as one value) and print what those values are.

categorical_col = []
for column in train.columns:
    series = train[column]
    if series.dtype != object or series.nunique(dropna=False) > 50:
        continue
    categorical_col.append(column)
    distinct_values = series.unique()
    print(f"{column} : {distinct_values}")
    print("====================================")

### Checking for numerical features

In [None]:
# Collect the non-object columns that have at most 50 distinct values
# (missing values count as one value) and print what those values are.
numerical_col = []
for column in train.columns:
    series = train[column]
    if series.dtype == object or series.nunique(dropna=False) > 50:
        continue
    numerical_col.append(column)
    distinct_values = series.unique()
    print(f"{column} : {distinct_values}")
    print("====================================")

In [None]:
# Visualizing the distribution of the data for every numeric feature
# (the trailing ';' suppresses the textual repr of the returned axes array).
train.hist(edgecolor='black', linewidth=1.2, figsize=(20, 20));

### Visualizing the missing values with the help of heatmap.

In [None]:
# Draw the null mask of the whole frame as a heatmap: one cell per (row, column),
# coloured by whether the value is missing.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(train.isnull(), cmap='viridis', ax=ax)

In [None]:
# Extract every column that has at least one missing value.
# The comparison is `> 0` (not `> 1`) so that a column with exactly one
# missing entry is reported as well.
null_counts = train.isnull().sum()
missing_values = null_counts[null_counts > 0].index.tolist()
print("The features having the missing values are",missing_values,end='')

In [None]:
# isnull().mean() is a fraction in [0, 1]; scale it by 100 so the printed
# number really is the percentage that the '%' label claims.
for feature in missing_values:
    pct_missing = np.round(train[feature].isnull().mean() * 100, 2)
    print(feature, 'has', pct_missing, '% of missing values')

* We can see that our dataset contains a lot of missing values, so we need to handle them accordingly.

### Visualizing the Dependent feature

In [None]:
# Distribution of the target column as it currently stands in `train`.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# are the replacements) — confirm the installed version still provides it.
sns.distplot(train['SalePrice'])

* We can see that our dependent feature is slightly right-skewed. This can affect the accuracy of our model, so we need to transform it to be closer to a normal distribution.

In [None]:
# Log-transform the target to reduce its right skew.
# The raw prices are read from train_df (the copy taken right after loading), not
# from train itself, so re-running this cell does not apply the log a second time.
# log1p(x) computes log(x + 1).
train['SalePrice'] = np.log1p(train_df['SalePrice'])
sns.distplot(train['SalePrice'])

* We have log-transformed our dependent feature so that its distribution is approximately Gaussian, which helps the model fit properly.

### Converting all the categorical columns into numerical

In [None]:
# Show the names of the categorical columns collected earlier
# (end='' suppresses the trailing newline).
print(categorical_col,end='')

* Now, a category is kept as-is only if its share of the rows is greater than 0.01 (i.e. 1%); otherwise I relabel it as a rare variable ('Rare_var').

In [None]:
# For every categorical column, keep the categories whose share of all rows is
# above 0.01 and relabel everything else as 'Rare_var'. Missing values are not
# members of the kept set, so they are relabelled as 'Rare_var' too.
n_rows = len(train)
for feature in categorical_col:
    share = train.groupby(feature)['SalePrice'].count() / n_rows
    frequent = share[share > 0.01].index
    is_frequent = train[feature].isin(frequent)
    train[feature] = np.where(is_frequent, train[feature], 'Rare_var')


In [None]:
# Preview the frame after infrequent categories were relabelled as 'Rare_var'.
train.head()

#### Label Encoding the categorical features

* **Label Encoding** : Label encoder basically converts categorical values into numerical values

In [None]:
# Replace each categorical column with integer codes produced by LabelEncoder.

from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refit on each column in turn, so after the loop
# it only remembers the classes of the last column.
sc=LabelEncoder()

encoded = {feature: sc.fit_transform(train[feature]) for feature in categorical_col}
for feature, codes in encoded.items():
    train[feature] = codes

In [None]:
# Preview the frame after the categorical columns were label-encoded.
train.head()

In [None]:
# Re-check the previously flagged columns. isnull().mean() is a fraction in
# [0, 1]; scale it by 100 so the printed number matches the '%' label.
for feature in missing_values:
    pct_missing = np.round(train[feature].isnull().mean() * 100, 2)
    print(feature, 'has', pct_missing, '% of missing values')

### Filling the missing values

In [None]:
# Fill the gaps in these three numeric columns with the column's own mean.
for column in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    column_mean = train[column].mean()
    train[column] = train[column].fillna(column_mean)

In [None]:
# Preview the frame after the numeric gaps were mean-filled.
train.head()

### Feature Selection

* Excluding the dependent feature, our dataset has 80 independent features. If we use all 80 columns as independent features, the accuracy of our model can decrease: as the number of features increases, accuracy tends to drop. This is called the **Curse of Dimensionality**.

* There are several ways to address this problem, such as PCA or dropping the uninformative columns.

* In our case we will use an estimator from sklearn called **Extra Trees Regressor**. It tells us which features are important for model building and prediction, i.e. the features that help us increase the accuracy of the model.

* Feature importance gives you a score for each feature of your data, the higher the score the more important or relevant is the feature towards your output variable

* Feature importance is a built-in attribute of tree-based regressors; we will use Extra Trees Regressor to extract the top 20 features of the dataset.


In [None]:
# Separate the target (y) from the predictors (x).
# NOTE(review): every other column stays in x, including any row-identifier
# column such as Id if the frame has one — confirm that is intended.

y = train['SalePrice']
x = train.drop(columns=['SalePrice'])

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees are randomised; a fixed random_state makes the fitted model, and
# therefore the feature-importance ranking below, reproducible across runs.
model = ExtraTreesRegressor(random_state=42)
model.fit(x,y)

In [None]:
# Raw importance scores from the fitted extra-trees model.
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the 20 highest feature-importance scores,
# labelled with the matching column names of x.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
top_20 = feat_importances.nlargest(20)
top_20.plot.barh()
plt.show()

In [None]:
# Re-plot the distribution of the target column as it stands in `train` at this point.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# are the replacements) — confirm the installed version still provides it.
sns.distplot(train['SalePrice'])

### Building the model

In [None]:
# Splitting the data into training and hold-out sets

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# 80/20 split. A fixed random_state keeps the split, and every metric computed
# from it below, identical from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

### Applying Linear Regression Algorithm

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training split (fit returns the estimator itself),
# then predict the hold-out split.
lr = LinearRegression().fit(x_train, y_train)
lr_pred = lr.predict(x_test)

In [None]:
# R² of the linear model on the hold-out split, printed as a percentage.
r2 = r2_score(y_true=y_test, y_pred=lr_pred)
print('R-Square Score: ', 100 * r2)

In [None]:
# Absolute error of every hold-out prediction
lr_errors = abs(lr_pred - y_test)
# Mean absolute error: average the errors, not the predictions themselves.
# The target was log-transformed earlier, so the error is in log-price units.
print('Mean Absolute Error:', round(np.mean(lr_errors), 2), '(log SalePrice units).')

In [None]:
# Mean absolute percentage error (MAPE) of the linear model, from the
# per-row absolute errors computed in the previous cell.
mape = 100 * (lr_errors / y_test)
# "Accuracy" here is simply 100 - MAPE.
# NOTE(review): the target is on a log scale, so this percentage is relative to
# log-prices, not to dollar prices — interpret with care.
lr_accuracy = 100 - np.mean(mape)
print('Accuracy for Linear Regression is :', round(lr_accuracy, 2), '%.')

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

# Mean squared and mean absolute error of the linear model on the hold-out split.
for label, score_fn in (('mse:', mean_squared_error), ('mae:', mean_absolute_error)):
    print(label, score_fn(y_test, lr_pred))

In [None]:
# Distribution of the linear model's residuals (actual minus predicted) on the hold-out split.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# are the replacements) — confirm the installed version still provides it.
sns.distplot(y_test-lr_pred)

In [None]:
# Overlay the actual and the linear-regression-predicted targets of the hold-out
# split, both plotted against the third column of the feature matrix.

plt.figure(figsize=(12,7))

feature_values = x_test.iloc[:,2]
plt.scatter(y_test, feature_values, color="blue", label="Actual")
plt.scatter(lr_pred, feature_values, color="yellow", label="Predicted (Linear Regression)")
plt.title("True vs Predicted Sale Price",size=20,pad=15)
plt.xlabel('Sale Price',size = 15)
# Name the y-axis after the plotted feature so the figure can be read on its own.
plt.ylabel(x_test.columns[2], size=15)
plt.legend();

### Applying Decision tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# The split criterion is left at its default (squared error): the explicit 'mse'
# spelling is rejected by newer scikit-learn releases, while the default works on
# old and new versions alike. random_state makes the fitted tree reproducible.
dtree = DecisionTreeRegressor(random_state=42)
dtree.fit(x_train, y_train)

In [None]:
# Decision-tree predictions for the hold-out features.
dtree_pred = dtree.predict(x_test)

In [None]:
# R² of the decision tree on the hold-out split, printed as a percentage.
r2 = r2_score(y_test,dtree_pred)
print('R-Square Score: ',r2*100)

# Absolute error of every hold-out prediction
dtree_errors = abs(dtree_pred - y_test)
# Mean absolute error: average the errors, not the predictions themselves.
# The target was log-transformed earlier, so the error is in log-price units.
print('Mean Absolute Error:', round(np.mean(dtree_errors), 2), '(log SalePrice units).')

# Mean absolute percentage error (MAPE)
mape = 100 * (dtree_errors / y_test)
# "Accuracy" here is simply 100 - MAPE (relative to the log-scale target).
dtree_accuracy = 100 - np.mean(mape)
print('Accuracy for Decision tree regressor is :', round(dtree_accuracy, 2), '%.')

In [None]:
# Overlay the actual and the decision-tree-predicted targets of the hold-out
# split, both plotted against the third column of the feature matrix.

plt.figure(figsize=(12,7))

feature_values = x_test.iloc[:,2]
# Each series carries a label so that plt.legend() has something to show.
plt.scatter(y_test, feature_values, color="blue", label="Actual")
plt.scatter(dtree_pred, feature_values, color="yellow", label="Predicted (Decision Tree)")
plt.title("True vs Predicted Sale Price",size=20,pad=15)
plt.xlabel('Sale Price',size = 15)
# Name the y-axis after the plotted feature so the figure can be read on its own.
plt.ylabel(x_test.columns[2], size=15)
plt.legend();

### Applying Random Forest Regressor Algorithm

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are randomised; a fixed random_state makes the fitted model and
# its hold-out predictions reproducible across runs.
random_forest_regressor = RandomForestRegressor(random_state=42)
random_forest_regressor.fit(x_train, y_train)
rf_pred = random_forest_regressor.predict(x_test)

In [None]:
# R² of the random forest on the hold-out split, printed as a percentage.
r2 = r2_score(y_test,rf_pred)
print('R-Square Score: ',r2*100)

# Absolute error of every hold-out prediction
rf_errors = abs(rf_pred - y_test)
# Mean absolute error: average the errors, not the predictions themselves.
# The target was log-transformed earlier, so the error is in log-price units.
print('Mean Absolute Error:', round(np.mean(rf_errors), 2), '(log SalePrice units).')

# Mean absolute percentage error (MAPE)
mape = 100 * (rf_errors / y_test)
# "Accuracy" here is simply 100 - MAPE (relative to the log-scale target).
rf_accuracy = 100 - np.mean(mape)
print('Accuracy for random forest regressor is :', round(rf_accuracy, 2), '%.')


In [None]:
# Overlay the actual and the random-forest-predicted targets of the hold-out
# split, both plotted against the third column of the feature matrix.

plt.figure(figsize=(12,7))

feature_values = x_test.iloc[:,2]
plt.scatter(y_test, feature_values, color="blue", label="Actual")
plt.scatter(rf_pred, feature_values, color="yellow", label="Predicted (Random Forest)")
plt.title("True vs Predicted Sale Price",size=20,pad=15)
plt.xlabel('Sale Price',size = 15)
# Name the y-axis after the plotted feature so the figure can be read on its own.
plt.ylabel(x_test.columns[2], size=15)
plt.legend();

In [None]:
# Blend the three models with a weighted average. np.average divides by the sum
# of the weights, so the relative weights 0.45 / 0.55 / 0.65 are kept while the
# blend stays on the same scale as the individual predictions (a plain weighted
# sum with these weights would be 1.65 times too large).
# The result is on the same log scale as the target the models were trained on.
pred_y = np.average([lr_pred, dtree_pred, rf_pred], axis=0, weights=[0.45, 0.55, 0.65])

In [None]:
# Display the blended hold-out predictions.
pred_y