# House Prices Exploratory Data Analysis
The dataset is available here: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the training and test splits from the working directory (train first).
train_data, test_data = map(pd.read_csv, ("./train.csv", "./test.csv"))

FileNotFoundError: [Errno 2] File b'./train.csv' does not exist: b'./train.csv'

In [None]:
# Display the (rows, columns) shape of each frame as loaded.
train_data.shape, test_data.shape

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Remove the "Id" column from both frames, modifying them in place.
train_data.drop(columns="Id", inplace=True)
test_data.drop(columns="Id", inplace=True)

In [None]:
# Display both shapes again, now that "Id" has been dropped.
train_data.shape, test_data.shape

In [None]:
# Names of the object-dtype columns in the training frame.
categorical = train_data.select_dtypes(include=["object"]).columns

In [None]:
# Names of the float64/int64 columns in the training frame.
numerical = train_data.select_dtypes(include=["float64", "int64"]).columns

## Data Analysis
Exploratory data analysis summarizes the main characteristics of a dataset in order to understand what the data can tell us beyond formal modeling or hypothesis testing.

In [None]:
# Correlation matrix for all numeric features.
# Select the numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops string columns silently and raises on them instead.
corr = train_data.select_dtypes(include="number").corr()

sns.heatmap(corr, square=True)
plt.show()

In [None]:
# Labels of the 10 entries in corr's "SalePrice" column with the largest values.
indx = corr["SalePrice"].nlargest(10).index

In [None]:
# Display the selected labels.
indx

In [None]:
# Pairwise correlations among the columns listed in indx,
# taken from the numerical subset of the training frame.
corr1 = train_data.loc[:, numerical].loc[:, indx].corr()

In [None]:
# Heatmap of corr1 with square cells; annot=True writes each value into its cell.
sns.heatmap(corr1, annot=True, square=True)
plt.show()

In [None]:
# Features with the highest correlation to SalePrice, in descending order.
# The target is removed by label rather than by position, so re-running this
# cell is a no-op instead of silently discarding the top-ranked feature.
indx = indx.drop("SalePrice", errors="ignore")

In [None]:
# Display the remaining labels.
indx

## Relationship with categorical variables

In [None]:
# Pair SalePrice with the feature that correlates with it most strongly.
feature = "OverallQual"
df = train_data[["SalePrice", feature]].copy()

In [None]:
# Preview the two-column frame.
df.head()

In [None]:
# Categorical (ordinal) feature.
# As expected, the median SalePrice rises steadily as OverallQual increases.
plt.figure(figsize=(8, 5))
sns.boxplot(x=df["OverallQual"], y=df["SalePrice"])
plt.show()

In [None]:
# SalePrice distribution for each construction year.
data = train_data[["SalePrice", "YearBuilt"]].copy()
f, ax = plt.subplots(figsize=(20, 8))
fig = sns.boxplot(data=data, x="YearBuilt", y="SalePrice")  # note: this is an Axes, despite the name
fig.set_ylim(0, 800000)
plt.xticks(rotation=90);

# Newer houses tend to sell for more than older ones, although the relationship is not very strong.

## Relationship with numerical variables

In [None]:
# Numerical (continuous) feature.
# Summary statistics for the second most correlated feature, GrLivArea (above-ground living area).
train_data["GrLivArea"].describe()

In [None]:
# SalePrice shows a roughly linear trend against above-ground living area.
sns.jointplot(data=train_data, x="GrLivArea", y="SalePrice", kind="reg")
plt.show()

In [None]:
# Outliers picked from the plot above: very large living area but a low sale price.
ix = train_data.index[(train_data["GrLivArea"] > 4300) & (train_data["SalePrice"] < 220000)]

In [None]:
# Display the row labels flagged as outliers.
ix

In [None]:
# Drop the flagged rows from the training frame.
train_data = train_data.drop(index=ix)

In [None]:
# Renumber the rows from 0 after the drop; drop=True discards the old index.
train_data = train_data.reset_index(drop=True)

In [None]:
# After cleaning, the remaining points follow the trend line.
sns.jointplot(data=train_data, x="GrLivArea", y="SalePrice", kind="reg")
plt.show()

In [None]:
# Numerical (discrete) feature.
# Summary statistics for the third most correlated feature, GarageCars
# (number of cars that fit in the house's garage).
train_data["GarageCars"].describe()

In [None]:
# Distinct GarageCars values present in the training frame.
train_data["GarageCars"].unique()

In [None]:
# Box plot of SalePrice for each GarageCars value.
# A few points do not follow the overall trend.
sns.boxplot(data=train_data, x="GarageCars", y="SalePrice")
plt.show()

In [None]:
# Outliers: more than three garage spaces but a sale price below 300000.
ix = train_data.index[(train_data["GarageCars"] > 3) & (train_data["SalePrice"] < 300000)]

In [None]:
# Display the row labels flagged as outliers.
ix

In [None]:
# Remove the flagged rows, then renumber the remaining rows from 0.
train_data = train_data.drop(index=ix)
train_data = train_data.reset_index(drop=True)

In [None]:
# Preview the training frame after outlier removal.
train_data.head()

In [None]:
# The same box plot after outlier removal.
sns.boxplot(data=train_data, x="GarageCars", y="SalePrice")
plt.show()

In [None]:
# Null mask for the first rows (True where a value is missing).
train_data.head().isnull()

In [None]:
# Heatmap of the null mask, with every column name used as an x tick label.
plt.figure(figsize=(20, 10))
sns.heatmap(train_data.isnull(), xticklabels=train_data.columns)
plt.show()

In [None]:
train_data.info()
# Alley is one of the features with the most null values. However, data_description.txt shows that
# for Alley, as for many other features, NA means the house has no such feature (e.g. no alley).

In [None]:
# Column dtypes and non-null counts for the test frame.
test_data.info()