# **House Prices EDA & Regression**

## **Exploratory Data Analysis (EDA)**

#### Loading and Displaying data

In [None]:
import pandas as pd

# Lift the column limit so that displayed dataframes show every column
pd.set_option("display.max_columns", None)

In [None]:
# Load the training CSV into the dataframe used by every later cell,
# then preview its first rows
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df.head()

In [None]:
# Preview the last rows of the dataframe
df.tail()

#### Names of columns

In [None]:
# Print the index of column labels
print(df.columns)

#### Shape of the train dataset

In [None]:
# df.shape is a (row count, column count) pair; unpack both at once
rows, cols = df.shape

print(f"Before cleaning, there are {rows} rows and {cols} columns in this dataframe.")

#### Any duplicates?

In [None]:
# duplicated() flags rows that repeat an earlier row; summing the flags counts them
dupRows = df.duplicated().sum()
print(f"There are {dupRows} duplicated rows in the dataframe.")

#### Any null values? If yes, how many in which column?

In [None]:
# Count the missing values in each column
df.isnull().sum()

There are multiple null values in the column **LotFrontage**.

#### Number of unique values per column

In [None]:
# Count the distinct values in each column
df.nunique()

#### Data types of the columns

In [None]:
# Print a summary of the dataframe (columns, non-null counts, dtypes)
df.info()

#### Number of columns per data type

In [None]:
# Count how many columns share each data type
df.dtypes.value_counts()

Most columns are of type **object**, followed by **integer** and **float**.

#### Statistics for each column

In [None]:
# Descriptive statistics of the dataframe's columns
df.describe()

#### Boxplots for interesting columns

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Columns that get a boxplot each
boxplot_columns = [
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "FullBath",
    "Fireplaces",
    "GarageCars",
    "SalePrice",
]

# A separate figure per column, titled with the column name
for column_name in boxplot_columns:
    _, ax = plt.subplots(figsize=(10, 8))
    ax.set_title(column_name)
    ax.boxplot(df[column_name])

#### How much memory does each column need (in bytes)?

In [None]:
# Memory footprint of each column (plus the index), in bytes
df.memory_usage()

#### How much do the columns correlate with each other?

The **corr()** function computes the pairwise correlation of the columns in the dataframe. Missing values are excluded automatically. Only numeric columns can be correlated: older pandas versions silently ignored non-numeric columns, whereas pandas 2.0 and later require them to be excluded explicitly. This function comes in handy during feature selection, where we look at the correlation between the features and the target variable, or between the features themselves.

In [None]:
# Correlate numeric columns only. Since pandas 2.0, corr() no longer skips
# non-numeric columns on its own and raises on the text-valued features,
# so they are filtered out explicitly (this works on older versions too).
df.select_dtypes(include = "number").corr()

#### Visualizing the correlation table above

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlate numeric columns only: pandas >= 2.0 raises in corr() when the
# frame still contains text columns, so they are filtered out first.
correlations = df.select_dtypes(include = "number").corr()

# Large canvas so that the annotated cell values stay readable
plt.figure(figsize = (26, 26))
plt.title("Heatmap displaying the correlations between all columns", fontsize = 20)
sns.heatmap(correlations, annot = True, cmap = "mako")

#### How strongly do the columns correlate with our target column **SalePrice**?

In [None]:
# Correlation of every numeric column with the target column "SalePrice".
# Text columns are filtered out first because pandas >= 2.0 raises in corr()
# when they are present. The variable name is kept as-is because the
# plotting cell that follows reads it.
survivedCorr = df.select_dtypes(include = "number").corr()["SalePrice"]
# Wrap the resulting Series in a one-column dataframe for display
survivedCorr = pd.DataFrame(survivedCorr)
survivedCorr

In [None]:
# IPython magic (not plain Python): render matplotlib figures inside the notebook
%matplotlib inline

# Plot one marker per column (no connecting line) showing its correlation
# with "SalePrice"; the very wide figure leaves room for the column labels
plt.figure(figsize = (50, 20))
plt.title("Correlations between input columns and target column 'SalePrice'", fontsize = 20)
plt.xlabel("Columns", fontsize = 16)
plt.ylabel("Correlation factor", fontsize = 16)
plt.plot(survivedCorr, color = "purple", linestyle = "", marker = "o")
plt.show()

We can see that (not really surprisingly) parameters like the construction year or the overall living space influence the price strongly, while other factors don't.

#### How many houses fall into which price range?

In [None]:
# Count how often each distinct sale price occurs
prices = df.value_counts(["SalePrice"])
prices

In [None]:
# Bin the sale price into five ordinal classes. pd.cut uses right-closed
# intervals, so a boundary value (e.g. exactly 100000) falls into the lower
# class, just like the "<=" comparisons it replaces.
price_bins = [float("-inf"), 100000, 300000, 600000, 1000000, float("inf")]
# 1 = Cheap, 2 = Middle Class, 3 = High Income, 4 = Very High Income, 5 = Luxury
price_labels = [1, 2, 3, 4, 5]

# Cast back to plain integers so the column is numeric rather than categorical.
# A missing SalePrice makes the cast fail right here instead of producing a
# list that is shorter than the dataframe.
priceRange = pd.cut(df["SalePrice"], bins = price_bins, labels = price_labels).astype("int64")

df["priceRange"] = priceRange

df.head()

#### How old are the houses?

In [None]:
# Count how many houses were built in each year
ages = df.value_counts(["YearBuilt"])
ages

In [None]:
# Bin the construction year into five ordinal eras. pd.cut uses right-closed
# intervals, so a boundary year (e.g. 1900) falls into the older era, just
# like the "<=" comparisons it replaces.
year_bins = [float("-inf"), 1900, 1930, 1960, 1990, float("inf")]
# 1 = 19th Century, 2 = Early 20th Century, 3 = Mid 20th Century,
# 4 = Late 20th Century, 5 = Modern
era_labels = [1, 2, 3, 4, 5]

# Cast back to plain integers so the column is numeric rather than categorical.
# A missing YearBuilt makes the cast fail right here instead of producing a
# list that is shorter than the dataframe.
age = pd.cut(df["YearBuilt"], bins = year_bins, labels = era_labels).astype("int64")

df["age"] = age

df.head()

#### Equipment

In [None]:
# Count how often each value of "Utilities" occurs
equipment = df.value_counts(["Utilities"])
equipment

In [None]:
# Binary code for the utilities: 1 for "AllPub", 2 for anything else
equipment = [1 if utility == "AllPub" else 2 for utility in df["Utilities"]]

df["equipment"] = equipment

df.head()

#### Quality of the Exterior Material

In [None]:
# Count how often each value of "ExterQual" occurs
qualex = df.value_counts(["ExterQual"])
qualex

In [None]:
# Ordinal code for each exterior-quality grade; any other value maps to 0
exterior_quality_codes = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

qualex = [exterior_quality_codes.get(grade, 0) for grade in df["ExterQual"]]

df["qualex"] = qualex

df.head()

#### Exterior Condition

In [None]:
# Count how often each value of "ExterCond" occurs
condex = df.value_counts(["ExterCond"])
condex

In [None]:
# Ordinal code for each exterior-condition grade; any other value maps to 0
exterior_condition_codes = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

condex = [exterior_condition_codes.get(grade, 0) for grade in df["ExterCond"]]

df["condex"] = condex

df.head()

#### How many stories?

In [None]:
# Count how often each value of "HouseStyle" occurs
stories = df.value_counts(["HouseStyle"])
stories

In [None]:
# Numeric code for each house style; any other value maps to 0.
# The split-level key is "SLvl" (lowercase letter L). The previous spelling
# "SLv1" (digit one) never matched the data, so split-level houses were
# silently coded as 0.
house_style_codes = {
    "1Story": 1,  # One story
    "1.5Fin": 2,  # One and one-half story: 2nd level finished
    "1.5Unf": 3,  # One and one-half story: 2nd level unfinished
    "2Story": 4,  # Two Story
    "2.5Fin": 5,  # Two and one-half story: 2nd level finished
    "2.5Unf": 6,  # Two and one-half story: 2nd level unfinished
    "SFoyer": 7,  # Split Foyer
    "SLvl": 8,    # Split Level
}

size = [house_style_codes.get(style, 0) for style in df["HouseStyle"]]

df["size"] = size

df.head()

#### NAs & Encoding

In [None]:
# Replace every missing value in the whole frame with 0 (text columns included)
df = df.fillna(0)
# Turn the flag strings "Y", "N" and "P" into numbers wherever a cell equals them.
# NOTE(review): presumably these are yes / no / partial flags — confirm which
# columns are affected, since the replacement is not limited to specific columns.
df = df.replace("Y", 1)
df = df.replace("N", 0)
df = df.replace("P", 0.5)

df.head()

#### Relation between House Price and Age of House

In [None]:
# Contingency table: number of houses per (priceRange, age) combination
pd.crosstab(df["priceRange"], df["age"])

In [None]:
# Cross-tabulate price class against age class and draw the counts as a heatmap
ct = pd.crosstab(df["priceRange"], df["age"])

plt.figure(figsize=(18, 18))
plt.title("Crosstab showing how the Construction Year influences the Price", fontsize=20)
sns.heatmap(
    ct,
    annot=True,
    fmt="g",
    cmap="YlGnBu",
    cbar=True,
)

#### Relation between House Price and Size of House

In [None]:
# Contingency table: number of houses per (priceRange, size) combination
pd.crosstab(df["priceRange"], df["size"])

In [None]:
# Cross-tabulate price class against house-style code and draw the counts as a heatmap
ct = pd.crosstab(df["priceRange"], df["size"])

plt.figure(figsize=(18, 18))
plt.title("Crosstab showing how the House Size influences the Price", fontsize=20)
sns.heatmap(
    ct,
    annot=True,
    fmt="g",
    cmap="BuPu",
    cbar=True,
)

#### Are older Houses bigger/smaller than newer ones?

In [None]:
# Contingency table: number of houses per (age, size) combination
pd.crosstab(df["age"], df["size"])

In [None]:
# Cross-tabulate age class against house-style code and draw the counts as a heatmap
ct = pd.crosstab(df["age"], df["size"])

plt.figure(figsize=(18, 18))
plt.title("Crosstab showing how the House Age influences its Size", fontsize=20)
sns.heatmap(
    ct,
    annot=True,
    fmt="g",
    cmap="Greens",
    cbar=True,
)

### Interactive Data Science Charts

In [None]:
# Notebook command (not Python syntax): install pandas-profiling with its notebook extras.
# NOTE(review): the package appears to have been renamed to ydata-profiling — confirm
# before running this on a current environment.
pip install pandas-profiling[notebook]

In [None]:
import pandas_profiling
from pandas_profiling import ProfileReport

# Build a profiling report of the dataframe in explorative mode
profile = ProfileReport(df, title = "Pandas Profiling Report", explorative = True)

# Render the report as interactive notebook widgets
profile.to_widgets()

## **Machine Learning**

#### ***Multiple Linear Regression***

#### One-Hot-Encoding

In [None]:
# One-hot encode the dataframe: get_dummies expands the categorical (text)
# columns into indicator columns
df = pd.get_dummies(df)
df.head()

#### Defining X and y and Splitting data

In [None]:
# Feature matrix: everything except the target and "priceRange". The latter
# was binned directly from SalePrice during the EDA, so keeping it would leak
# the target into the model and inflate the score.
# errors = "ignore" keeps the cell runnable when the priceRange cell was skipped.
X = df.drop(columns = ["SalePrice", "priceRange"], errors = "ignore").values
# Target vector
y = df["SalePrice"].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15 % of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.15)

#### Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings
model = LinearRegression()

# Fit on the training part of the split
model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out rows; for LinearRegression, score() returns R^2
model.score(X_test, y_test)