# Predict the sale prices of houses according to the dataset

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Shared matplotlib defaults applied to every figure drawn in this notebook.
plt.rcParams.update({
    'font.size': 14,
    'legend.fontsize': 11,
    # The font name must match an installed family exactly; the previous value
    # 'Time New Roman' (missing "s") could never be resolved.
    # NOTE(review): 'font.serif' is only consulted when the serif family is
    # selected (e.g. via 'font.family') -- confirm that is the intent.
    'font.serif': 'Times New Roman',
    'lines.linewidth': 2,
})

In [None]:
# Load the housing CSV into a DataFrame, then print its column/dtype summary
housing_df = pd.read_csv(
    filepath_or_buffer='../input/ames-housing-dataset/Housing_Data.csv',
)
housing_df.info()

In [None]:
# Preview the first five rows
housing_df.head(n=5)

In [None]:
# Remove the PID column: the DataFrame index already identifies each row,
# so the extra unique identifier carries no information for the model.
# errors='ignore' keeps this cell re-runnable: once PID is gone, a second run
# is a no-op instead of raising KeyError.
housing_df = housing_df.drop(columns='PID', errors='ignore')

**Dealing with outliers**
* SalePrice is the target (y); all the other columns are the input features (x).
* First, check the correlation between each input and the sale price.
* Second, plot the features most correlated with the output to find the outliers.
* Also check the distribution plot of the output.


In [None]:
# Distribution of the target (SalePrice) with a KDE overlay.
# sns.displot is a figure-level function that always opens its own figure, so
# pairing it with plt.figure(dpi=100) left an empty figure behind and the dpi
# setting never reached the plot. The axes-level histplot draws onto the
# figure created here instead.
fig, ax = plt.subplots(dpi=100)
sns.histplot(housing_df['SalePrice'], kde=True, color='g', ax=ax)

In [None]:
# Correlation of every numeric input with the target, strongest first.
# Only numeric columns are handed to corr(): pandas >= 2.0 no longer drops
# text columns silently and raises when it meets them.
housing_df.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False)

In [None]:
# SalePrice against Overall Qual (the feature most correlated with SalePrice)
plt.figure()
qual_ax = sns.scatterplot(x='Overall Qual', y='SalePrice', data=housing_df, color='g')
# red reference line at SalePrice = 200000 across the quality levels 1..10
qual_ax.plot(np.arange(1, 11), np.full(10, 200000.0), color='r')

It can be seen that the sale price increases as the overall quality rises.
However, for overall quality equal to 9 and 10 there are three data points with a high overall quality but a low sale price;
they lie approximately under the line y=200000.
They can be considered outliers. Let's check them in another way, too.

In [None]:
# SalePrice against Gr Liv Area, the second most correlated feature
plt.figure()
area_ax = sns.scatterplot(x='Gr Liv Area', y='SalePrice', data=housing_df, color='g')
# reference lines: red at SalePrice = 200000, yellow at Gr Liv Area = 4000
area_ax.axhline(200000, color='r')
area_ax.axvline(4000, color='y')

The three dots below the red line and to the right of the yellow line are outliers.
Let's check whether the outliers shown in these two plots are the same; if so, let's remove them, as they would cause problems for our model:

In [None]:
# Rows with Overall Qual >= 9 that sold for less than 200000
housing_df.loc[
    (housing_df['Overall Qual'] >= 9) & (housing_df['SalePrice'] < 200000),
    ['SalePrice', 'Overall Qual', 'Gr Liv Area'],
]

In [None]:
# Rows with Gr Liv Area > 4000 that sold for less than 200000
housing_df.loc[
    housing_df['Gr Liv Area'].gt(4000) & housing_df['SalePrice'].lt(200000),
    ['SalePrice', 'Overall Qual', 'Gr Liv Area'],
]

The two results have 3 rows in common; these rows are the outliers and should be deleted.

In [None]:
# Drop the rows flagged as outliers: Gr Liv Area > 4000 with SalePrice < 200000
outlier_index = housing_df.index[
    housing_df['Gr Liv Area'].gt(4000) & housing_df['SalePrice'].lt(200000)
]
housing_df.drop(index=outlier_index, inplace=True)

In [None]:
# Redraw both scatter plots to confirm that the outliers are gone
plt.figure()
area_ax = sns.scatterplot(x='Gr Liv Area', y='SalePrice', data=housing_df, color='g')
area_ax.axhline(200000, color='r')
area_ax.axvline(4000, color='y')

plt.figure()
qual_ax = sns.scatterplot(x='Overall Qual', y='SalePrice', data=housing_df, color='g')
qual_ax.plot(np.arange(1, 11), np.full(10, 200000.0), color='r')


**The outliers are now removed**

**Dealing with missing data**

In [None]:
# Print the feature description file, which explains each column and what
# its missing values mean
with open('../input/ames-housing-dataset/Ames_Housing_Feature_Description.txt') as description_file:
    print(description_file.read())

In [None]:
# Number of missing values per column, largest first
housing_df.isna().sum().sort_values(ascending=False)

In [None]:
# Percentage of missing values per column; only the columns that actually
# contain missing data are kept, sorted ascending, in the series nan_percent
nan_percent = (
    housing_df.isna().sum().mul(100).div(len(housing_df))
    .loc[lambda pct: pct > 0]
    .sort_values()
)

In [None]:
# Display the per-column missing-data percentages
nan_percent

In [None]:
# Bar plot of the missing-data percentage of each feature that has missing data
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# Pool QC, Alley, Misc Feature and Fence are more than 80% NaN. Given the
# description file and that much missing data, these four columns are removed:
# a column is kept only if it has at least `threshold` non-missing values.
threshold = len(housing_df) - (80 * len(housing_df)) / 100
housing_df.dropna(axis='columns', thresh=threshold, inplace=True)

# Recompute and re-plot the missing-data percentages
nan_percent = (
    housing_df.isna().sum().mul(100).div(len(housing_df))
    .loc[lambda pct: pct > 0]
    .sort_values()
)
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# Print the column/dtype/non-null summary of the current DataFrame
housing_df.info()

In [None]:
# Inspect one of the text basement columns
housing_df.loc[:, 'Bsmt Qual']

In [None]:
# Inspect one of the numeric basement columns
housing_df.loc[:, 'BsmtFin SF 1']

In [None]:
# "Bsmt" stands for basement. In every Bsmt column NaN means there is no
# basement, so it is replaced by 'None' in text columns and by 0 in numeric ones.
numerical_basement = [
    'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
    'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
]
string_basements = [
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
    'BsmtFin Type 1', 'BsmtFin Type 2',
]
housing_df = housing_df.fillna({
    **dict.fromkeys(numerical_basement, 0),
    **dict.fromkeys(string_basements, 'None'),
})

In [None]:

# Recompute the missing-data percentages and plot what is still incomplete
missing_share = housing_df.isna().sum() * 100 / len(housing_df)
nan_percent = missing_share[missing_share > 0].sort_values()
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# If the building has no masonry veneer, 'Mas Vnr Type' and 'Mas Vnr Area' are
# NaN; they are treated like the Bsmt columns ('None' for text, 0 for numbers).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on housing_df[col] is chained assignment, which under pandas Copy-on-Write
# only changes a temporary object and leaves housing_df untouched.
housing_df['Mas Vnr Type'] = housing_df['Mas Vnr Type'].fillna('None')
housing_df['Mas Vnr Area'] = housing_df['Mas Vnr Area'].fillna(0)

# Recompute and re-plot the missing-data percentages
nan_percent = (housing_df.isnull().sum()) * 100 / len(housing_df)
nan_percent = nan_percent[nan_percent > 0].sort_values()
plt.figure(figsize=(15, 5))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

In [None]:
# When there is no fireplace the value is missing, so it is replaced by 'None'.
# Assign the result back instead of using a chained inplace fillna, which does
# not modify housing_df under pandas Copy-on-Write.
housing_df['Fireplace Qu'] = housing_df['Fireplace Qu'].fillna('None')

In [None]:

# Recompute the missing-data percentages and plot what is still incomplete
missing_share = housing_df.isna().sum() * 100 / len(housing_df)
nan_percent = missing_share[missing_share > 0].sort_values()
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# Show the rows in which the Electrical column is missing
housing_df.loc[housing_df['Electrical'].isna()]

In [None]:
# Only one row lacks an Electrical value, so that row is simply removed
housing_df.dropna(axis='index', subset=['Electrical'], inplace=True)

# Recompute and re-plot the missing-data percentages
missing_share = housing_df.isna().sum() * 100 / len(housing_df)
nan_percent = missing_share[missing_share > 0].sort_values()
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# Per the dataset documentation, NaN in the garage columns seems to indicate
# that there is no garage. Decision: fill text columns with 'None', numeric with 0.
string_garage = ['Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond']
num_garage = ['Garage Cars', 'Garage Area', 'Garage Yr Blt']
housing_df = housing_df.fillna({
    **dict.fromkeys(string_garage, 'None'),
    **dict.fromkeys(num_garage, 0),
})

# Recompute and re-plot the missing-data percentages
missing_share = housing_df.isna().sum() * 100 / len(housing_df)
nan_percent = missing_share[missing_share > 0].sort_values()
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# Lot Frontage: linear feet of street connected to the property
housing_df.loc[:, 'Lot Frontage']


In [None]:
# Assumption: Lot Frontage is related to the neighborhood the house is in.
# List the distinct neighborhoods.
pd.unique(housing_df['Neighborhood'])

In [None]:
# Box plot of Lot Frontage per Neighborhood to check how the two are related
plt.subplots(figsize=(15, 6), dpi=100)
sns.boxplot(x='Neighborhood', y='Lot Frontage', data=housing_df)
plt.xticks(rotation=90)

In [None]:
# Lot Frontage values are fairly close to each other within most neighborhoods,
# so missing values will be replaced by the mean of the same neighborhood.
# The column is selected BEFORE aggregating: averaging the whole frame first
# raises on the text columns in pandas >= 2.0 and computes many unused means.
housing_df.groupby('Neighborhood')['Lot Frontage'].mean()

In [None]:
# Fill each missing Lot Frontage with the mean Lot Frontage of its neighborhood
frontage_by_neighborhood = housing_df.groupby('Neighborhood')['Lot Frontage']
housing_df['Lot Frontage'] = frontage_by_neighborhood.transform(
    lambda frontage: frontage.fillna(frontage.mean())
)

# Recompute the missing-data percentages
missing_share = housing_df.isna().sum() * 100 / len(housing_df)
nan_percent = missing_share[missing_share > 0].sort_values()

In [None]:
# Display the per-column missing-data percentages that remain
nan_percent

In [None]:

# Bar plot of the remaining missing-data percentages
plt.subplots(figsize=(15, 5))
sns.barplot(y=nan_percent, x=nan_percent.index)
plt.xticks(rotation=90)

In [None]:
# Any Lot Frontage value still missing after the per-neighborhood fill is set to 0.
# Assign the result back instead of using a chained inplace fillna, which does
# not modify housing_df under pandas Copy-on-Write.
housing_df['Lot Frontage'] = housing_df['Lot Frontage'].fillna(0)

# Recompute the missing-data percentages and display what is left
nan_percent = (housing_df.isnull().sum()) * 100 / len(housing_df)
nan_percent = nan_percent[nan_percent > 0].sort_values()
nan_percent

**now there is no missing data**

**We need to be careful when encoding categorical features as numbers. The numerical relationship must make sense for the model: categorical values should not be encoded as sequential numbers.**

In [None]:
# Print the frame summary, then show the MS SubClass column
housing_df.info()
housing_df.loc[:, 'MS SubClass']

In [None]:
# MS SubClass identifies the type of dwelling involved in the sale, so it is
# categorical and is converted to strings
housing_df['MS SubClass'] = housing_df['MS SubClass'].map(str)
housing_df.info()

In [None]:
# Show the MS SubClass column after the conversion
housing_df.loc[:, 'MS SubClass']

**Creating dummy variables**

In [None]:
# Show the columns whose dtype is not object
housing_df.select_dtypes(exclude='object')

In [None]:
# Split the frame into its non-object columns and its object columns
df_num, df_obj = (
    housing_df.select_dtypes(exclude='object'),
    housing_df.select_dtypes(include='object'),
)

In [None]:
# Print the column/dtype/non-null summary of the object-dtype columns
df_obj.info()

In [None]:
# Preview the first five rows of the object-dtype columns
df_obj.head(n=5)

In [None]:
# Replace the object columns by dummy variables, dropping the first level of
# each, then print the summary of the result
df_obj = pd.get_dummies(data=df_obj, drop_first=True)
df_obj.info()

In [None]:
# (rows, columns) of the dummy-variable frame
df_obj.shape

In [None]:
# Join the dummy columns and the non-object columns side by side
housing_df_final = pd.concat(objs=[df_obj, df_num], axis='columns')

In [None]:
# Preview the first five rows of the final frame
housing_df_final.head(n=5)

In [None]:
# The seven largest correlations with SalePrice, in descending order
housing_df_final.corr().loc[:, 'SalePrice'].sort_values(ascending=False).iloc[:7]

In [None]:
# One scatter plot per feature listed below, with SalePrice on the x axis
fig, axes = plt.subplots(6, figsize=(10, 25))
top_features = [
    'Overall Qual', 'Gr Liv Area', 'Total Bsmt SF',
    'Garage Cars', '1st Flr SF', 'Garage Area',
]
for panel, feature in zip(axes, top_features):
    sns.scatterplot(data=housing_df_final, x='SalePrice', y=feature, ax=panel)
plt.tight_layout()

**Linear regression**

In [None]:
# Necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Separate the features (X) from the target (y) and split them 70% / 30%
# into train and test sets.
X = housing_df_final.drop('SalePrice', axis=1)
y = housing_df_final['SalePrice']
# A fixed random_state makes the split reproducible; without it every run
# produced a different split, so the metrics and residual plots below changed
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build the linear regression model and fit it on the training set
MyModel = LinearRegression()
MyModel.fit(X_train, y_train)

In [None]:
# Fitted coefficient of each feature, labelled by feature name
pd.DataFrame({'coefficient': MyModel.coef_}, index=X.columns)

In [None]:
# Evaluate the model on the test set: MAE and RMSE, each also expressed as a
# percentage of the mean target value
from sklearn import metrics

y_pred = MyModel.predict(X_test)
MAE = metrics.mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

pd.DataFrame(
    {
        'Error': [MAE, RMSE],
        'percentage of error to average of target (%)': [
            (MAE * 100) / y_test.mean(),
            (RMSE * 100) / y_test.mean(),
        ],
    },
    index=['Mean absolute error', 'Root mean square error'],
)

**Residuals**

In [None]:
# Residuals on the test set (actual minus predicted) and their distribution
test_residuals = y_test.sub(y_pred)
sns.displot(data=test_residuals)

The residuals appear to be approximately normally distributed, with a mean close to 0.

In [None]:
# Scatter plot of the actual test targets (x) against the model's predictions (y)
sns.scatterplot(x=y_test,  y=y_pred)

In [None]:
# Residuals against the actual test targets, with a red reference line at 0
plt.figure()
residual_ax = sns.scatterplot(x=y_test, y=test_residuals)
residual_ax.axhline(0, color='r')

The residuals are randomly scattered around 0.