In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
import numpy as np
import scipy.stats as spy

# 1. Exploratory Data Analysis

In [None]:

# Load the training split of the house-prices data into a DataFrame.
train=pd.read_csv('../input/house-prices-data/train.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Data type of every column.
train.dtypes

In [None]:
# Number of missing values per column.
train.isnull().sum()

In [None]:
# (rows, columns) of the training frame.
print(train.shape)

In [None]:
# Call info(): the bare attribute only displays the bound method's repr,
# not the per-column dtype / non-null summary.
train.info()

## Let's start with EDA

* In total there are 81 columns and 1460 rows.
* SalePrice is the Y (target) variable.
* Id is a randomly generated number, so we can ignore that column.
* We have to work with the remaining 79 variables.

In [None]:
# Distribution of the target variable "SalePrice".
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the same kind of picture (histogram plus density curve) on an explicit Axes.
f, ax = plt.subplots(figsize=(18, 8))
sns.histplot(train['SalePrice'], kde=True, stat="density", color='r', ax=ax)
ax.set_title("Sales Price histogram")
ax.set_xlabel("Sales Price")

# Horizontal box plot of the same column to make the outliers visible.
fig_box, ax_box = plt.subplots(figsize=(16, 5))
train.SalePrice.plot(kind="box", vert=False, color='g', ax=ax_box)
ax_box.set_title("Sales Price value distribution")
ax_box.set_xlabel("Sales Price")

plt.show()

The distribution looks skewed and we can see a lot of outliers. These outliers may decrease the accuracy of the model, so we need to remove them; we will identify them using the IQR method.

To do that, we need to find Q1, Q3 and the IQR.

In [None]:
#lets write the function to identify the outliers

def outlier(datacolumn, k=3):
    """Compute IQR-based outlier fences for a numeric column.

    Parameters
    ----------
    datacolumn : array-like of numbers
        Values to analyse (e.g. a pandas Series or a list).
    k : float, default 3
        Multiplier applied to the IQR to place the fences. The default keeps
        the original behaviour of this notebook (3 * IQR).

    Returns
    -------
    tuple
        ``(lower_range, upper_range, Q1, Q3, IQR)`` where the fences are
        ``Q1 - k * IQR`` and ``Q3 + k * IQR``.
    """
    # np.percentile does not require sorted input, so no explicit sort is done
    # (the previous sorted() call discarded its result anyway).
    Q1, Q3 = np.percentile(datacolumn, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (k * IQR)
    upper_range = Q3 + (k * IQR)
    return lower_range, upper_range, Q1, Q3, IQR

In [None]:
# Unpack the fences and quartile statistics that outlier() returns for SalePrice.
lowerbound,upperbound, q1,q3,iqr=outlier(train.SalePrice)
print(lowerbound,upperbound,q1,q3,iqr)

In [None]:
# Rows whose SalePrice lies below lowerbound or above upperbound.
train[(train.SalePrice < lowerbound) | (train.SalePrice > upperbound)]

In [None]:
#train.drop(train[(train.SalePrice > upperbound) |(train.SalePrice < lowerbound) ].index, inplace=True)

In [None]:
# Before dropping any outliers, quantify the shape of the SalePrice distribution.
price_skew = round(train.SalePrice.skew(), 2)
price_kurt = round(train.SalePrice.kurt(), 2)

print(f'Skewness of sale price is {price_skew}')
print(f'Kurtosis of sale price is {price_kurt}')

### Observations:

* The distribution is positively skewed
* Kurtosis is more than 3, hence the distribution is leptokurtic (not platykurtic): it is more sharply peaked and has heavier tails than a normal curve. pandas' `kurt()` reports excess kurtosis, so a value near zero would indicate normally distributed data.

In order to get the normal distribution, we will use Log Transformation method to achieve the normal curve.

In [None]:
# Natural log of SalePrice, used to reduce the skew of the target.
logsaleprice = np.log(train.SalePrice.values)

# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the histogram plus density curve on an explicit Axes.
fig_log, ax_log = plt.subplots(figsize=(20, 10))
sns.histplot(logsaleprice, kde=True, stat="density", color='r', ax=ax_log)
ax_log.set_title("Log Sales Price value distribution")
# The axis shows log-prices, so label it as such.
ax_log.set_xlabel("Log Sales Price")

plt.show()



In [None]:
# One-column DataFrame (column label 0) holding the log-transformed prices.
logsalepricedf = pd.DataFrame(logsaleprice)

# DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
# plt.figure(figsize=...) beforehand would be left empty. Create the figure
# once and draw into it.
fig_logbox, ax_logbox = plt.subplots(figsize=(20, 10))
logsalepricedf.plot(kind="box", vert=False, color='g', ax=ax_logbox)
ax_logbox.set_title("Log Sales Price value distribution")
ax_logbox.set_xlabel("Log Sales Price")

plt.show()

In [None]:
# Skewness and kurtosis of the log-transformed sale price.
# logsalepricedf is a one-column DataFrame, so .skew()/.kurt() on the frame
# return a Series; select the single column so a scalar is formatted instead
# of a Series repr.
log_price = logsalepricedf.iloc[:, 0]
print(f'Skewness of log transformed sale price is {round(log_price.skew(),2)}')
print(f'Kurtosis of log transformed sale price is {round(log_price.kurt(),2)}')

As you can see, the new skewness and kurtosis are close to zero, which means the log-transformed sale price approximately follows a normal distribution.

In [None]:
# Summary statistics of the SalePrice column.
train['SalePrice'].describe()

### Observations:

1. There are total 1460 sales
2. Average value is 180921
3. Cheapest house sold: $34900

4. The most expensive house sold: $755000

In [None]:
# Now we will see how the other variables are related to price.
# Preview the data without the Id column, since it is of no use.
# NOTE: drop() is neither assigned nor in-place here, so `train` still
# contains Id after this cell; the correlation cell drops it again itself.

train.drop('Id', axis=1)

In [None]:
# Pairwise correlations between the numeric columns, with Id excluded.
numeric_cols = train.drop(columns='Id').select_dtypes(include='number')
trcorr = numeric_cols.corr()

# Correlation of every numeric column with SalePrice, weakest first. The last
# entry of the sorted series is left out (presumably SalePrice itself, whose
# self-correlation sorts last).
saleprice_corr = trcorr["SalePrice"].sort_values(ascending=True)

plt.figure(figsize=(18,8))
saleprice_corr.iloc[:-1].plot(kind="barh")
plt.title("Correlation of Numerical variables to SalePrice")
plt.xlabel("Correlation to SalePrice")
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric variables.
plt.figure(figsize=(18,10))
sns.heatmap(trcorr,annot=False, cmap="Blues")
plt.title("Correlation of Numerical Variables with Price")
# Render the figure explicitly, as the other plotting cells do, instead of
# leaving the Text object returned by plt.title() as the cell output.
plt.show()

In [None]:
# Analyse individual variables against SalePrice, starting with "OverallQual"
# (described above as the most correlated variable).
qual_counts = train.groupby("OverallQual")['SalePrice'].count()

# Number of sales per quality rating.
plt.figure(figsize=(16,6))
qual_counts.plot(kind='bar')
plt.title('Distribution sales price on over all quality')
plt.show()

# Spread of SalePrice within each quality rating.
plt.figure(figsize=(16,6))
sns.boxplot(data=train, x='OverallQual', y='SalePrice')
plt.title('OverallQual vs SalePrice')
plt.show()

### Observations:
1. Price increases with increase in overall quality index.
2. You can see there are more sales with ratings 5 to 7 in the overall quality index, and the number of sales decreases as price increases: fewer expensive houses are sold.
3. There are very few sales with ratings 1-3 and 8-10, which means we may have some outliers here.

Let's plot the other correlated variables.

In [None]:
# GrLivArea against SalePrice, with points coloured by OverallQual.

plt.figure(figsize=(16,6))
sns.scatterplot(
    data=train,
    x='GrLivArea',
    y='SalePrice',
    hue='OverallQual',
    legend='full',
)
plt.title('GrLivArea vs SalePrice')
plt.show()

Observations:
As you can see, sale price increases with living area. There are also some outliers at the high-price, large-living-area end, which we will try to remove.

Since GrLivArea is a continuous variable, let's convert it to a categorical variable by binning:
use the `pd.cut()` function and then combine SalePrice and GrLivArea.


In [None]:
# Cut GrLivArea into 10 equal-width bins labelled 0..9 and pair each bin
# label with the corresponding SalePrice.
area_binned = pd.cut(train.GrLivArea, bins=10, labels=np.arange(0, 10))
train_bin = pd.DataFrame(area_binned)
train_concat = pd.concat([train_bin, train.SalePrice], axis=1)

# Number of observations that fall into each bin.
bin_counts = train_concat.groupby('GrLivArea').SalePrice.count()

plt.figure(figsize=(16,6))
bin_counts.plot(kind='bar')
plt.title("Count of observations in living area (binned values)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()



Without a log transform the GrLivArea distribution is skewed, so we log-transform it to bring it closer to a normal distribution.

In [None]:
# Same binning as for raw GrLivArea, but applied to log(GrLivArea):
# 10 equal-width bins labelled 0..9, paired with SalePrice.
log_area_binned = pd.cut(np.log(train.GrLivArea), bins=10, labels=np.arange(0, 10))
train_bin = pd.DataFrame(log_area_binned)
train_concat = pd.concat([train_bin, train.SalePrice], axis=1)

# Number of observations per bin.
log_bin_counts = train_concat.groupby('GrLivArea').SalePrice.count()

plt.figure(figsize=(16,6))
log_bin_counts.plot(kind='bar')
plt.title("Count of observations in living area (binned values)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

# Spread of SalePrice within each bin.
plt.figure(figsize=(16,6))
sns.boxplot(data=train_concat, x='GrLivArea', y='SalePrice')
plt.title('GrLivArea vs SalePrice')
plt.show()


Now you can see that the values are approximately normally distributed, and the outliers are easier to identify.


In [None]:
# List the names of the numeric columns, to choose which other features to plot.
list(train.select_dtypes(include='number'))

In [None]:
# One scatter plot per selected area feature against SalePrice,
# with points coloured by OverallQual.
features = ['TotalBsmtSF', 'LotArea', 'GarageArea', '1stFlrSF']
for feature in features:
    plt.figure(figsize=(16,6))
    sns.scatterplot(
        data=train,
        x=feature,
        y='SalePrice',
        hue='OverallQual',
        legend='full',
    )
    plt.title(f'{feature} vs SalePrice')
    plt.show()

In [None]:
# Next, look at the time related features: list all column names to find them.

list(train.columns)

In [None]:
# 'YrSold', 'MoSold' and 'YearBuilt' are time related features; this cell
# looks at the two sale-date ones.
import calendar

month_names = calendar.month_name[1:13]
month_numbers = list(range(1, 13))

# Number of sales per year.
plt.figure(figsize=(16,6))
train.groupby("YrSold")['SalePrice'].count().plot(kind='bar')
plt.title('Sales over the years')
plt.show()

# Spread of SalePrice per year.
plt.figure(figsize=(16,6))
sns.boxplot(x='YrSold', y='SalePrice', data=train)
plt.title("Distribution of Sales over years")
plt.show()

# Number of sales per month. Reindex to all 12 months so the bar positions
# 0..11 always line up with the month-name tick labels, even if some month
# had no sales.
monthly_sales = (
    train.groupby("MoSold")['SalePrice']
    .count()
    .reindex(month_numbers, fill_value=0)
)

plt.figure(figsize=(16,6))
monthly_sales.plot(kind='bar')
plt.title('Sales in different Months')
plt.xticks(ticks=np.arange(0, 12), labels=month_names, rotation=45)
# The bars are counts of sales, not prices.
plt.ylabel('Count')
plt.show()

# Spread of SalePrice per month; `order` pins the 12 month positions for the
# same label-alignment reason as above.
plt.figure(figsize=(16,6))
sns.boxplot(x='MoSold', y='SalePrice', data=train, order=month_numbers)
plt.title("Sales in different Months")
plt.xticks(ticks=np.arange(0, 12), labels=month_names, rotation=45)
plt.ylabel('SalePrice')
plt.show()

Observations:
1. Fewer sales in 2010, possibly due to the economic slowdown in 2009.
2. 2006-2008 have steady sales without much variance.
3. Most sales are in May, June and July, maybe due to the summer holidays in the US.

In [None]:
# Houses per construction year, then the SalePrice spread per construction year.
built_counts = train.groupby("YearBuilt")['SalePrice'].count()

plt.figure(figsize=(16,6))
built_counts.plot(kind='bar')
plt.title('Observation counts of built year of the houses')
plt.show()

plt.figure(figsize=(16,6))
sns.boxplot(data=train, x='YearBuilt', y='SalePrice')
plt.title("Built Year vs SalePrice")
plt.show()

In [None]:
# The per-year plots are too cluttered to analyse, so bin YearBuilt by decade.
#
# Bins are left-closed ([1870, 1880), [1880, 1890), ...) so that each label is
# the first year of the decade it names. With pd.cut's default right-closed
# bins a house built in 1990 would be filed under the "1980" label.
# The edges are derived from the data so that no build year falls outside them.
first_decade = int(train.YearBuilt.min()) // 10 * 10
last_decade = int(train.YearBuilt.max()) // 10 * 10
decades = np.arange(first_decade, last_decade + 11, 10)
yrbuilt_bin = pd.cut(train.YearBuilt, bins=decades, labels=decades[:-1], right=False)
yrb_comb = pd.concat([yrbuilt_bin, train.SalePrice], axis=1)

# Number of houses per decade (observed=False keeps empty decades on the axis).
plt.figure(figsize=(16,6))
yrb_comb.groupby("YearBuilt", observed=False).SalePrice.count().plot(kind='bar')
plt.title('Observation counts of built year of the houses')
plt.xlabel('Built Year')
plt.ylabel('Count')
plt.show()

# Spread of SalePrice per decade.
plt.figure(figsize=(16,6))
sns.boxplot(x='YearBuilt', y='SalePrice', data=yrb_comb)
plt.title("Built Year vs SalePrice")
plt.show()


### Observations:
1. Most of the houses were built in 1950 or later and in year 2000 highest number of houses were built.
2. Houses built in 1990 or later yield higher mean of the sale price
3. Houses built in 1890 or earlier have unusual variance and is likely to be outliers


In [None]:
# Age of each property at the time of sale, stored as a new 'age' column.
train['age'] = train['YrSold'] - train['YearBuilt']

oldest_age = train['age'].max()
newest_age = train['age'].min()
print(f"Oldest property sold: {oldest_age} Yrs")
print(f"Most new property sold: {newest_age} Yrs")

In [None]:
# Bin property age into left-closed 10-year buckets: [0, 10), [10, 20), ...
# pd.cut's default right-closed bins (0, 10] would turn age 0 (sold in the
# year it was built) into NaN, and a fixed top edge of 130 would drop any
# older property, so use right=False and derive the top edge from the data.
# NOTE(review): negative ages, if any exist, still fall outside the bins.
max_age = int(train.age.max())
decades = np.arange(0, (max_age // 10 + 1) * 10 + 1, 10)
age_bin = pd.cut(train.age, bins=decades, labels=decades[:-1], right=False)
age_comb = pd.concat([age_bin, train.SalePrice], axis=1)

# Number of sales per age bucket (observed=False keeps empty buckets on the axis).
plt.figure(figsize=(16,6))
age_comb.groupby("age", observed=False).SalePrice.count().plot(kind='bar')
plt.title('Observation counts of property age')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

# Spread of SalePrice per age bucket.
plt.figure(figsize=(16,6))
sns.boxplot(x='age', y='SalePrice', data=age_comb)
plt.title("Age vs SalePrice")
plt.show()

### Observations:
1. More newly built properties were sold compared to old ones.
2. The mean sale price of properties up to 10 years old doesn't show much variance.
3. Properties more than 100 years old show huge variance, which means these may be outliers; we will remove them in later stages.

In [None]:
#we will visualise some of the categorical variables 
#BldgType: type of the building
#Neighborhood
#Utilities
#SaleType
#MSSubClass: the building class
#SaleCondition: Condition of the sale

In [None]:
# For each selected categorical feature: a bar chart of how many observations
# each category has, followed by a box plot of SalePrice per category.
feat = ['BldgType', 'Neighborhood', 'Utilities', 'SaleType', 'MSSubClass', 'SaleCondition']

for column in feat:
    category_counts = train.groupby(column).SalePrice.count()

    plt.figure(figsize=(16,6))
    category_counts.plot(kind='bar')
    plt.title(f'Observation counts of the {column}')
    plt.xlabel(f'{column}')
    plt.ylabel('Count')
    plt.show()

    plt.figure(figsize=(16,6))
    sns.boxplot(data=train, x=column, y='SalePrice')
    plt.title(f"{column} vs SalePrice")
    plt.show()




# 2. Data Cleaning and formatting

In this Step, we will
1. Fix the missing values
2. Set correct data types
3. Remove the outliers

##### We will start by plotting the numerical data according to current pandas data types

In [None]:
# Reload the train and test data from disk.
df_train = pd.read_csv('../input/house-prices-data/train.csv')
df_test = pd.read_csv('../input/house-prices-data/test.csv')

# Stack both data sets into one frame with a fresh 0..n-1 index, so missing
# values can be found across train and test together.
df_comb = pd.concat([df_train, df_test], ignore_index=True)

# Per column: does it contain any missing value?
df_comb.isnull().any()

In [None]:
# One histogram per numeric feature (Id and SalePrice excluded).
num = df_comb.drop(['Id','SalePrice'], axis=1).select_dtypes('number')

# Id is already dropped above, so iterate over every remaining column;
# slicing with [1:] would silently skip the first numeric feature.
for column in num.columns:
    num[column].plot(kind="hist", bins=100, rwidth=.9, title=column)
    plt.tight_layout()
    plt.show()

### Observations:
1. There are a lot of categorical values that are defined as numerical.
2. There are many missing values

Lets see the total missing values

In [None]:
# Number of missing values per column of the combined frame.
df_comb.isna().sum()

In [None]:
# Labels of the columns that contain at least one missing value.
missing = df_comb.columns[df_comb.isna().any()]
print(missing)

In [None]:
# Fix missing values in the features of the combined frame.

# --- Categorical features filled with the literal string "None" ------------
# Per the notes in this notebook, NA in these columns means the feature is
# absent: no alley, no masonry veneer, no basement, no fireplace, no garage,
# no second exterior, and likely no pool / fence / misc feature.
bsmt = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
garage = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
absent_cols = [
    'Alley',
    'MasVnrType',
    *bsmt,
    'FireplaceQu',
    *garage,
    'Exterior2nd',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
for col in absent_cols:
    df_comb[col] = df_comb[col].fillna("None")

# --- Numeric features: missing -> 0, then cast to int ----------------------
# Lot frontage, masonry veneer area, basement sizes / bath counts and the
# garage year / area / car capacity.
bsmtA = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
grg = ['GarageYrBlt', 'GarageArea', 'GarageCars']
zero_cols = ['LotFrontage', 'MasVnrArea', *bsmtA, *grg]
for col in zero_cols:
    df_comb[col] = df_comb[col].fillna(0).astype(int)

# --- Categorical features filled with a fixed category ---------------------
# Each value is the one this notebook's notes give as the most frequent
# (or typical) category for that column.
fixed_fill = {
    'Electrical': "SBrkr",
    'MSZoning': "RL",
    'Exterior1st': "VinylSd",
    'KitchenQual': 'TA',
    'Functional': 'Typ',
    'SaleType': 'WD',
    'Utilities': 'AllPub',
}
for col, value in fixed_fill.items():
    df_comb[col] = df_comb[col].fillna(value)

### Now we will encode the categorical values

In [None]:
# Columns of the combined frame that are stored with the object dtype.
object_var=df_comb.select_dtypes(include=['object'])
object_var.columns


In [None]:
# Try scikit-learn's LabelEncoder on one column (Alley) to inspect the codes.
from sklearn import preprocessing

# Instantiate the encoder before fitting. Calling fit_transform on the class
# itself (with the class passed as `self`) keeps the fitted state on the class
# instead of on an encoder object.
le = preprocessing.LabelEncoder()
encode = le.fit_transform(df_comb.Alley)

# Label -> code table taken straight from the fitted encoder: the code of a
# label is its position in le.classes_. This avoids relying on two
# value_counts() orderings happening to line up.
pd.DataFrame(
    {'Alley': le.classes_},
    index=pd.RangeIndex(len(le.classes_), name='code'),
)

We cannot use the label encoder here: it assigns integer codes that imply an order, and this data has no such order, so we will encode it manually.

