# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
# Both competition files live in the same input folder. Build the two paths
# from a single base directory so they cannot drift apart (one used to be
# relative and the other an absolute /kaggle path).
DATA_DIR = "../input/house-prices-advanced-regression-techniques"
df_train = pd.read_csv(DATA_DIR + "/train.csv")
df_test = pd.read_csv(DATA_DIR + "/test.csv")

In [None]:
# Report the number of cells (.size) and the (rows, columns) shape of the
# training frame first, then of the test frame.
for size_template, shape_label, frame in (
    ('The size of trainsets:    {:,}', 'The shape of trainsets: ', df_train),
    ('The size of testsets:    {:,}', 'The shape of testsets: ', df_test),
):
    print(size_template.format(frame.size))
    print(shape_label, frame.shape)

# Data Discovery

In [None]:
# Preview the first five rows of the training data.
df_train.head(n=5)

* SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
* MSSubClass: The building class
* MSZoning: The general zoning classification
* LotFrontage: Linear feet of street connected to property
* LotArea: Lot size in square feet
* Street: Type of road access
* Alley: Type of alley access
* LotShape: General shape of property
* LandContour: Flatness of the property
* Utilities: Type of utilities available
* LotConfig: Lot configuration
* LandSlope: Slope of property
* Neighborhood: Physical locations within Ames city limits
* Condition1: Proximity to main road or railroad
* Condition2: Proximity to main road or railroad (if a second is present)
* BldgType: Type of dwelling
* HouseStyle: Style of dwelling
* OverallQual: Overall material and finish quality
* OverallCond: Overall condition rating
* YearBuilt: Original construction date
* YearRemodAdd: Remodel date
* RoofStyle: Type of roof
* RoofMatl: Roof material
* Exterior1st: Exterior covering on house
* Exterior2nd: Exterior covering on house (if more than one material)
* MasVnrType: Masonry veneer type
* MasVnrArea: Masonry veneer area in square feet
* ExterQual: Exterior material quality
* ExterCond: Present condition of the material on the exterior
* Foundation: Type of foundation
* BsmtQual: Height of the basement
* BsmtCond: General condition of the basement
* BsmtExposure: Walkout or garden level basement walls
* BsmtFinType1: Quality of basement finished area
* BsmtFinSF1: Type 1 finished square feet
* BsmtFinType2: Quality of second finished area (if present)
* BsmtFinSF2: Type 2 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* Heating: Type of heating
* HeatingQC: Heating quality and condition
* CentralAir: Central air conditioning
* Electrical: Electrical system
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* BedroomAbvGr: Number of bedrooms above basement level
* KitchenAbvGr: Number of kitchens
* KitchenQual: Kitchen quality
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Functional: Home functionality rating
* Fireplaces: Number of fireplaces
* FireplaceQu: Fireplace quality
* GarageType: Garage location
* GarageYrBlt: Year garage was built
* GarageFinish: Interior finish of the garage
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* GarageQual: Garage quality
* GarageCond: Garage condition
* PavedDrive: Paved driveway
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* PoolQC: Pool quality
* Fence: Fence quality
* MiscFeature: Miscellaneous feature not covered in other categories
* MiscVal: $Value of miscellaneous feature
* MoSold: Month Sold
* YrSold: Year Sold
* SaleType: Type of sale
* SaleCondition: Condition of sale

### Checking the missing values

In [None]:
# Per-column count and percentage of missing values in the training set,
# both ordered from most to least missing.
null_counts = df_train.isnull().sum()
row_counts = df_train.isnull().count()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_counts * 100).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total Missed', 'Percent of missing in %'],
)
missing_data.head(30)

In [None]:
# Per-column count and percentage of missing values in the test set, both
# ordered from most to least missing. The null mask is computed once and
# reused; the stray leading `df_test.isnull().sum()` whose result was
# discarded has been removed.
null_mask_t = df_test.isnull()
total_test = null_mask_t.sum().sort_values(ascending=False)
percent_t = ((null_mask_t.sum() / null_mask_t.count()) * 100).sort_values(ascending=False)
missing_data_t = pd.concat([total_test, percent_t], axis=1, keys=['Total Missed', 'Percent of missing in %'])
missing_data_t.head(30)

In [None]:
# Summary statistics of the target variable.
df_train["SalePrice"].describe()

In [None]:
# Distribution of the target variable as a 50-bin histogram.
df_train["SalePrice"].plot.hist(bins=50, color="skyblue", ec="skyblue")

Most sale prices fall between 100k and 200k.

### Check the correlation with our target

In [None]:
# Absolute correlation of every numeric column with SalePrice.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# string columns silently in DataFrame.corr() and raises on them instead.
corr = df_train.select_dtypes(include=[np.number]).corr()[['SalePrice']].abs()
corr.style.background_gradient(cmap='coolwarm')

We will keep only the features whose absolute correlation with SalePrice is higher than 0.5 (the cell below uses 0.51 as the cutoff).

In [None]:
# Flag the columns whose absolute correlation with SalePrice exceeds 0.51
# and display only the flagged rows.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# string columns silently in DataFrame.corr() and raises on them instead.
corr1 = df_train.select_dtypes(include=[np.number]).corr()[['SalePrice']].abs()
corr1 = corr1 > 0.51
# corr1["SalePrice"] is already a boolean mask, so no `== True` is needed.
corr1.loc[corr1["SalePrice"]]

In [None]:
# Shape of the table holding only the rows flagged in corr1.
print("The size of corelation data :", corr1[corr1["SalePrice"]].shape)

### Handling missing values

In [None]:
# Impute missing numeric values in the test set with each column's mean.
# numeric_only=True keeps the mean restricted to numeric columns; without it
# pandas >= 2.0 raises on the string columns. Non-numeric columns keep
# their NaNs because fillna only receives means for numeric columns.
df_test = df_test.fillna(df_test.mean(numeric_only=True))

# Exploratory Data Analysis

### The relationship between SalePrice and other variables

A pairplot lets us see the relationships between the target variable and the features whose (linear) correlation with it is higher than 0.5.
We use 8 features to train our model (SalePrice, the target, is the ninth plotted column):

In [None]:
# Target first, followed by the selected features, for a pairwise scatter grid.
columns = [
    'SalePrice',
    '1stFlrSF',
    'TotalBsmtSF',
    'GarageCars',
    'GarageArea',
    'OverallQual',
    'GrLivArea',
    'YearBuilt',
    'FullBath',
]
sns.pairplot(data=df_train[columns], height=1.5)

In [None]:
# Number of sales recorded per year.
df_train["YrSold"].value_counts()

In [None]:
# Histogram of the sale year.
df_train["YrSold"].plot(kind="hist")

In [None]:
# Distribution of SalePrice for each OverallQual rating.
ax = sns.boxplot(
    data=df_train,
    x="OverallQual",
    y="SalePrice",
    palette="Set2",
)

In [None]:
# Features used for modelling; draw one scatter plot of each against SalePrice.
df_features = [
    '1stFlrSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea',
    'OverallQual', 'GrLivArea', 'YearBuilt', 'FullBath',
]
for feature in df_features:
    df_train.plot.scatter(x=feature, y='SalePrice')

# Modeling

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import xgboost

### Random Forest Regressor

In [None]:
# Target values for training.
Y_train = df_train.SalePrice

# Create training predictors data
X_train = df_train[df_features]

# A fixed random_state makes the forest (and every prediction derived from it)
# reproducible across Restart & Run All; without it each run differs.
my_model = RandomForestRegressor(random_state=42)
my_model.fit(X_train, Y_train)

In [None]:
# Select the same feature columns from the test set and predict with the
# random forest.
x_test = df_test.loc[:, df_features]
predicted_prices = my_model.predict(X=x_test)
print(predicted_prices)

### Linear Regression

In [None]:
# Ordinary least-squares model fitted on the same predictors and target.
regr = linear_model.LinearRegression()
regr.fit(X=X_train, y=Y_train)

In [None]:
# Linear-regression predictions for the test features.
predicted_prices_regr = regr.predict(X=x_test)
print(predicted_prices_regr)

### XGBoost Regression

In [None]:
# Fit an XGBoost regressor on the same predictors and target.
# (The variable keeps its original name `classifier`, but it holds a regressor.)
classifier = xgboost.XGBRegressor()
classifier.fit(X_train, Y_train)
# Predict with the XGBoost model itself. The original called my_model (the
# random forest) here, so the "xgb" predictions were really forest predictions.
predicted_prices_xgb = classifier.predict(x_test)

In [None]:
# Build the submission table from the test Ids and the linear-regression
# predictions, then write it without the index column.
submission_columns = {
    'Id': df_test['Id'],
    'SalePrice': predicted_prices_regr,
}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)