In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV; the path is relative to the notebook's working directory.
train_df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv'
)

In [None]:
# Work on a deep copy so the frame read from disk (train_df) is never mutated.
df = train_df.copy(deep=True)

In [None]:
# Let pandas render up to 500 columns, then preview the first rows.
pd.options.display.max_columns = 500
df.head()

In [None]:
# Per-column dtype and non-null count overview of the working frame.
df.info()

### So many columns!! OK, take a deep breath..

# EDA (Understanding data)

Let's first visualize the missing values.

In [None]:
# Missing-value map: rows are records, columns are features, and the cells
# where df.isna() is True stand out from the rest.
missing_mask = df.isna()
plt.figure(figsize=(20, 15))
sns.heatmap(missing_mask, cbar=False, annot=False)
plt.show()

In [None]:
# Percentage of missing values per column, rounded to two decimals.
print("% missing values\n\n\n")
for column in df.columns:
    n_missing = df[column].isna().sum()
    n_rows = len(df[column])
    pct_missing = round(n_missing / n_rows * 100, 2)
    print(f'{column} : {pct_missing} %')

In [None]:
# Number of distinct values in each column (a missing value counts as one value).
for column in df.columns:
    n_distinct = df[column].nunique(dropna=False)
    print(f'{column} :  {n_distinct}')

In [None]:
# Pairwise correlation between the numeric predictors (target excluded).
# The frame still contains object columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so keep only numeric columns first.
numeric_features = df.select_dtypes(include='number').drop(columns='SalePrice')
plt.figure(figsize=(30,30))
sns.heatmap(numeric_features.corr(),annot=True,cbar=False)
plt.show()

In [None]:
# Separating categorical and numerical variables

In [None]:
# Columns stored with dtype 'object' are treated as categorical.
categorical_var = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [None]:
# Show the column names collected as categorical.
print(categorical_var)

In [None]:
# Every column that is not in categorical_var is treated as numerical.
# The result is a pandas Index of column labels, not a list.
numerical_var=df.columns.drop(categorical_var)

In [None]:
# Display the labels treated as numerical.
numerical_var

# EDA

In [None]:
# One histogram per numerical column, each shown as its own figure.
for feature in numerical_var:
    values = df[feature]
    sns.histplot(data=values)
    plt.show()

In [None]:
# Frequency of each category, one figure per categorical column.
# The Series is passed as `x=` explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, so a bare positional Series is no longer
# interpreted as the x variable.
# The tick rotation is a number; a string angle such as '90' is not accepted by
# every matplotlib version.
for column in categorical_var:
    plt.figure(figsize=(8,5))
    sns.countplot(x=df[column])
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Outlier check: one horizontal box plot per numerical column.
# `x=` is given by keyword because seaborn >= 0.12 reads the first positional
# argument as `data`, which changes how a bare Series is drawn.
for column in numerical_var:
    sns.boxplot(x=df[column])
    plt.show()

# Correlation of numerical features with target variable

In [None]:
# Correlation of every numerical column with SalePrice, kept as a one-column
# frame named 'corr' and indexed by feature name.
corr_df = df[numerical_var].corrwith(df['SalePrice']).to_frame(name='corr')

In [None]:
# Display the feature-vs-target correlation table.
corr_df

In [None]:
# Annotated heat map of each numerical feature's correlation with the target.
plt.figure(figsize=(8,20))
sns.heatmap(corr_df,annot=True)
# Close the cell with plt.show() like the other plotting cells, so the Axes
# repr is not echoed as the cell's output.
plt.show()

# Data cleaning

### Dropping the numerical features whose correlation with the target variable is close to zero (|corr| < 0.15)

In [None]:
# Features whose correlation with the target lies strictly inside (-0.15, 0.15).
weak_mask = corr_df['corr'].abs() < 0.15
to_drop = corr_df.loc[weak_mask].index
to_drop # columns having correlation near to zero

In [None]:
# Remove the weakly correlated features from the working frame, in place.
df.drop(columns=to_drop, inplace=True)

### Handling missing values

As we saw during EDA, the following columns have missing values:
Alley : 93.77 %,
LotFrontage : 17.74 %,
MasVnrType : 0.55 %,
MasVnrArea : 0.55 %,
BsmtQual : 2.53 %,
BsmtCond : 2.53 %,
BsmtExposure : 2.6 %,
BsmtFinType1 : 2.53 %,
BsmtFinType2 : 2.6 %,
Electrical : 0.07 %,
FireplaceQu : 47.26 %,
GarageType : 5.55 %,
GarageYrBlt : 5.55 %,
GarageFinish : 5.55 %,
GarageQual : 5.55 %,
GarageCond : 5.55 %,
PoolQC : 99.52 %,
Fence : 80.75 %,
MiscFeature : 96.3 %

We are going to drop the Alley, FireplaceQu, PoolQC, Fence and MiscFeature columns because they have too many missing values. LotFrontage (17.74 % missing) is dropped in the same step.

In [None]:
# Drop the columns with many missing values named above; note that
# LotFrontage is removed here as well.
columns_to_remove = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature','LotFrontage']
df.drop(columns=columns_to_remove, inplace=True)

In [None]:
# Redraw the missing-value map now that the sparse columns are gone.
still_missing = df.isna()
plt.figure(figsize=(20, 15))
sns.heatmap(still_missing, cbar=False, annot=False)
plt.show()

### Since we have few rows, instead of deleting the rows with missing values we impute them (most frequent value for categorical columns, median for numerical columns)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values column by column: the most frequent value for object
# (categorical) columns, the median for every other column.
for column in df.columns:
    strategy = 'most_frequent' if df[column].dtype == 'object' else 'median'
    imputer = SimpleImputer(strategy=strategy)
    # fit_transform returns a 2-D array of shape (n_rows, 1). Flatten it before
    # assigning: writing a 2-D object array into a single column raises
    # "ValueError: 2" on recent pandas versions.
    df[column] = imputer.fit_transform(df[[column]]).ravel()

In [None]:
# Final missing-value map, drawn after imputation to confirm nothing is left.
remaining_na = df.isna()
plt.figure(figsize=(20, 15))
sns.heatmap(remaining_na, cbar=False, annot=False)
plt.show()

### All missing values handled

In [None]:
# Preview the frame after the drops and imputation above.
df.head()

# Removing outliers

In [None]:
# making a list of continuous variable columns

In [None]:
# Treat a column as continuous when it is numeric and has more than 50 distinct
# values. The dtype check keeps high-cardinality text columns out of the list,
# because the quantile-based outlier step that consumes it only works on numbers.
continuous_var = [
    column for column in df.columns
    if pd.api.types.is_numeric_dtype(df[column])
    and df[column].nunique(dropna=False) > 50
]

In [None]:
# Box plot of every continuous column before outlier removal.
# `x=` is given by keyword because seaborn >= 0.12 reads the first positional
# argument as `data`, which changes how a bare Series is drawn.
for column in continuous_var:
    sns.boxplot(x=df[column])
    plt.show()

In [None]:
# Remove outliers with the 1.5 * IQR rule.
# Every fence is computed on the frame as it is *before* any row is removed,
# and the per-column checks are combined into a single mask. Filtering inside
# the loop would compute the quartiles of later columns on data already
# truncated by earlier ones, making the result depend on column order and
# discarding rows that are not outliers of the full sample.
within_fences = pd.Series(True, index=df.index)
for column in continuous_var:
    first = df[column].quantile(0.25)
    third = df[column].quantile(0.75)
    iqr = third - first
    lower = first - 1.5 * iqr
    upper = third + 1.5 * iqr
    # between() is inclusive on both ends, i.e. lower <= value <= upper
    within_fences &= df[column].between(lower, upper)
df = df[within_fences]

In [None]:
# Box plot of every continuous column after outlier removal.
# `x=` is given by keyword because seaborn >= 0.12 reads the first positional
# argument as `data`, which changes how a bare Series is drawn.
for column in continuous_var:
    sns.boxplot(x=df[column])
    plt.show()

### Hurray!! Outliers removed from all the continuous columns

In [None]:
# Columns that survived the cleaning steps: count first, then the labels.
n_columns = df.shape[1]
print('Total no. of columns are:', n_columns)
print(df.columns)

# To be continued......
### Any suggestions are welcome