# Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import Dataset

In [None]:
# Read the training CSV into the working DataFrame used by every later cell.
df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv'
)

# Overview

In [None]:
# Preview the first rows of the freshly loaded frame (head() shows five by default).
df.head()

In [None]:
# Rank the numeric features by their correlation with SalePrice (ascending).
# The frame still contains string columns here, and on pandas >= 2.0 a bare
# df.corr() raises on them instead of silently skipping them, so the frame is
# narrowed to numeric/bool columns first. This works on old and new pandas alike.
df.select_dtypes(include=['number', 'bool']).corr()['SalePrice'].sort_values()

In [None]:
# SalePrice against OverallQual, with a red guide line at 300000 to make
# unusually cheap high-quality houses stand out.
ax = sns.scatterplot(x='OverallQual', y='SalePrice', data=df)
ax.axhline(y=300000, color='r')

# Identify the Noisy Data Rows

In [None]:
# Rows with OverallQual above 9 that nevertheless sold for under 300000.
df.loc[
    (df['OverallQual'] > 9) & (df['SalePrice'] < 300000),
    ['SalePrice', 'OverallQual'],
]

# Scatterplot for GrLivArea & SalePrice

In [None]:
# SalePrice against GrLivArea; the green (price = 200000) and red (area = 4000)
# guide lines box in the bottom-right corner where the suspect points sit.
ax = sns.scatterplot(x='GrLivArea', y='SalePrice', data=df)
ax.axhline(y=200000, color='g')
ax.axvline(x=4000, color='r')

In [None]:
# Rows with more than 4000 of living area that sold for under 200000.
df.loc[
    (df['GrLivArea'] > 4000) & (df['SalePrice'] < 200000),
    ['SalePrice', 'GrLivArea'],
]

# Drop Outliers

In [None]:
# Index labels of the large-but-cheap outliers (GrLivArea > 4000, SalePrice < 200000).
index_drop = df.loc[
    (df['GrLivArea'] > 4000) & (df['SalePrice'] < 200000)
].index

In [None]:
# Display the index labels that were selected for removal.
index_drop

In [None]:
# Remove the outlier rows by index label; df is rebound to the filtered frame.
df = df.drop(index=index_drop)

In [None]:
# Same GrLivArea scatter after the drop: the region right of the red line and
# below the green line should now be empty.
ax = sns.scatterplot(x='GrLivArea', y='SalePrice', data=df)
ax.axhline(y=200000, color='g')
ax.axvline(x=4000, color='r')

In [None]:
# Same OverallQual scatter after the drop, with the 300000 guide line.
ax = sns.scatterplot(x='OverallQual', y='SalePrice', data=df)
ax.axhline(y=300000, color='r')

In [None]:
# Distribution of SalePrice within each OverallQual level.
sns.boxplot(x='OverallQual', y='SalePrice', data=df)

# Load the Data Description Text File

In [None]:
# Print the dataset's column documentation so it can be consulted while
# deciding how to treat each feature.
description_path = '../input/house-prices-advanced-regression-techniques/data_description.txt'
with open(description_path, 'r') as description_file:
    print(description_file.read())

In [None]:
# Column dtypes and non-null counts after the outlier rows were dropped.
df.info()

In [None]:
# Look at the first rows again before removing the identifier column.
df.head()

In [None]:
# Drop the 'Id' column from the working frame.
df = df.drop(columns='Id')

# Find Missing Data

In [None]:
# Percentage of missing values in every column (null count / row count * 100).
df.isnull().sum().div(len(df)).mul(100)

In [None]:
def missing_percent(df):
    """Return the percentage of missing values for each column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name, holding the share of null entries as a
        percentage of ``len(df)``. Columns with no nulls are left out and the
        remaining values are sorted in ascending order.
    """
    null_share = df.isnull().sum().div(len(df)).mul(100)
    return null_share.loc[null_share.gt(0)].sort_values()

In [None]:
# Per-column missing percentages for the current frame (only columns with gaps).
nan_percent=missing_percent(df)

In [None]:
# Display the missing percentages, sorted from least to most missing.
nan_percent

In [None]:
# Bar chart of the missing percentage per column; x labels are rotated so the
# column names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

In [None]:
# Same bar chart, with the y axis clipped to 0-1 so the columns that are
# missing less than 1% of their values become visible.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

plt.ylim(0, 1)

In [None]:
# Rows where 'Electrical' is missing.
df.loc[df['Electrical'].isnull()]

In [None]:
# Rows where 'MasVnrType' is missing.
df.loc[df['MasVnrType'].isnull()]

In [None]:
# Drop every row that lacks a value in 'Electrical' or 'MasVnrType'.
# NOTE(review): pandas >= 2.0 treats the literal string "None" as missing when
# reading CSVs. If 'MasVnrType' uses "None" as a real category, this may drop
# far more rows than intended on those versions — confirm the row count.
df = df.dropna(subset=['Electrical', 'MasVnrType'])

In [None]:
# Recompute the missing percentages after the row drop and re-draw the
# zoomed (0-1) bar chart.
nan_percent = missing_percent(df)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

plt.ylim(0, 1)

In [None]:
# Rows where 'BsmtFinType1' is missing.
df.loc[df['BsmtFinType1'].isnull()]

In [None]:
# Rows where 'BsmtQual' is missing.
df.loc[df['BsmtQual'].isnull()]

In [None]:
# Rows where 'BsmtCond' is missing.
df.loc[df['BsmtCond'].isnull()]

In [None]:
# Numeric basement columns: a missing value is replaced with 0.
# (Presumably missing means the house has no basement — see data_description.txt.)
bsmt_num_cols = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
]
df[bsmt_num_cols] = df[bsmt_num_cols].fillna(value=0)

In [None]:
# Categorical basement columns: a missing value becomes the explicit label 'None'
# so it is kept as its own category rather than treated as a gap.
bsmt_str_cols = [
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
]
df[bsmt_str_cols] = df[bsmt_str_cols].fillna(value='None')

In [None]:
# Recompute the missing percentages after the basement fills and re-draw the
# zoomed (0-1) bar chart.
nan_percent = missing_percent(df)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

plt.ylim(0, 1)

In [None]:
# Missing garage build years are replaced with 0.
# NOTE(review): 0 is not a real year, so it acts as a sentinel — confirm that the
# downstream model tolerates it.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(value=0)

In [None]:
# Categorical garage columns: a missing value becomes the explicit label 'None'.
grg_str_cols = [
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
]
df[grg_str_cols] = df[grg_str_cols].fillna(value='None')

In [None]:
# Recompute the missing percentages after the garage fills; full y range this
# time, since the remaining columns have large gaps.
nan_percent = missing_percent(df)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

In [None]:
# Remove the 'Fence', 'Alley', 'MiscFeature' and 'PoolQC' columns entirely
# rather than trying to fill them.
df = df.drop(columns=['Fence', 'Alley', 'MiscFeature', 'PoolQC'])

In [None]:
# Recompute and plot the missing percentages after the column drop.
nan_percent = missing_percent(df)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

In [None]:
# Missing fireplace quality becomes the explicit label 'None'.
df['FireplaceQu'] = df['FireplaceQu'].fillna(value='None')

In [None]:
# Recompute and plot the missing percentages after filling 'FireplaceQu'.
nan_percent = missing_percent(df)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

In [None]:
# LotFrontage distribution per neighborhood, one horizontal box per
# neighborhood, on a tall figure so every label fits.
fig, ax = plt.subplots(figsize=(8, 12))
sns.boxplot(x='LotFrontage', y='Neighborhood', data=df, ax=ax)

In [None]:
# Average LotFrontage within each neighborhood.
df['LotFrontage'].groupby(df['Neighborhood']).mean()

In [None]:
# Fill each missing LotFrontage with the mean frontage of that house's own
# neighborhood. transform() returns a new Series and does not modify df, so the
# result has to be assigned back. Without the assignment the imputation is
# thrown away and every gap is later filled with 0 instead.
df['LotFrontage'] = (
    df.groupby('Neighborhood')['LotFrontage']
      .transform(lambda val: val.fillna(val.mean()))
)

In [None]:
# Recompute and plot whatever missing percentages remain at this point.
nan_percent = missing_percent(df)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent, ax=ax)
plt.xticks(rotation=90)

In [None]:
# Replace any LotFrontage value that is still missing with 0.
df['LotFrontage'] = df['LotFrontage'].fillna(value=0)

In [None]:
# Recompute the missing percentages as a final check after all fills and drops.
nan_percent=missing_percent(df)

In [None]:
# Display the result; any column listed here still contains missing values.
nan_percent

# One-Hot Encoding

In [None]:
# Preview the cleaned frame before encoding.
df.head()

In [None]:
# Check each column's dtype to see which ones will need encoding.
df.info()

In [None]:
# Convert 'MSSubClass' values to strings so the column is handled with the
# object (categorical) columns rather than as a number.
df['MSSubClass'] = df['MSSubClass'].astype(str)

In [None]:
# Re-check the dtypes after converting 'MSSubClass' to strings.
df.info()

In [None]:
# Show only the object-dtype columns, i.e. the ones that will be one-hot encoded.
df.select_dtypes(include=['object'])

# Numeric & Object Column Separation

In [None]:
# Split the frame by dtype: object columns go to df_obj, everything else to df_num.
df_obj = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

In [None]:
# Check the dtypes and non-null counts of the non-object half.
df_num.info()

In [None]:
# One-hot encode the object columns. drop_first=True omits the first level of
# each column, giving k-1 indicator columns for k categories.
df_obj = pd.get_dummies(data=df_obj, drop_first=True)

In [None]:
# (rows, columns) of the encoded categorical part.
df_obj.shape

In [None]:
# (rows, columns) of the numeric part.
df_num.shape

In [None]:
# Join the numeric and the encoded categorical parts column-wise into the
# final feature table.
Final_df = pd.concat([df_num, df_obj], axis='columns')

In [None]:
# (rows, columns) of the combined table.
Final_df.shape