# Import Necessary Libraries


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import DataSet

In [None]:
# Read the training CSV into df; the relative ../input/... path must exist in the runtime environment.
df=pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

# Overview

In [None]:
df.head()

In [None]:
# Correlation of every numeric column with SalePrice, sorted ascending.
# numeric_only=True is needed because df still contains string columns here:
# on pandas >= 2.0 DataFrame.corr() no longer drops them silently and raises instead.
df.corr(numeric_only=True)['SalePrice'].sort_values()

In [None]:
# SalePrice against OverallQual, with a green horizontal reference line at 300000.
qual_ax = sns.scatterplot(x='OverallQual', y='SalePrice', data=df)
qual_ax.axhline(y=300000, color='g')

# Scatterplot for GrLivArea & SalePrice

In [None]:
# SalePrice against GrLivArea. The two reference lines mark off the region
# GrLivArea > 4000 / SalePrice < 200000 that is inspected in the next cell.
area_ax = sns.scatterplot(x='GrLivArea', y='SalePrice', data=df)
area_ax.axhline(y=200000, color='black')
area_ax.axvline(x=4000, color='red')

In [None]:
# Show the rows with GrLivArea above 4000 and SalePrice below 200000
# (the region delimited by the reference lines in the scatter plot above).
df[(df['GrLivArea']>4000)&(df['SalePrice']<200000)][['SalePrice','GrLivArea']]

# Drop Outliers

In [None]:
# Index labels of the rows with GrLivArea > 4000 and SalePrice < 200000;
# these rows are removed as outliers in a following cell.
is_large_but_cheap = (df['GrLivArea'] > 4000) & (df['SalePrice'] < 200000)
index_drop = df.loc[is_large_but_cheap].index

In [None]:
index_drop

In [None]:
# Remove the outlier rows collected in index_drop.
df = df.drop(index=index_drop)

In [None]:
# Redraw the GrLivArea/SalePrice scatter with the same reference lines to check the effect of the row drop.
sns.scatterplot(data=df , x='GrLivArea',y='SalePrice')
plt.axhline(y=200000, color='black')
plt.axvline(x=4000, color='red')

In [None]:
# Redraw the OverallQual/SalePrice scatter (green reference line at 300000) on the frame without the dropped rows.
sns.scatterplot(data=df,x='OverallQual',y='SalePrice')
plt.axhline(y=300000 , color='g')

In [None]:
# Distribution of SalePrice within each OverallQual level.
sns.boxplot(data=df,x='OverallQual',y='SalePrice')

In [None]:
# Print the data_description.txt file that sits next to the CSV, for reference
# while deciding how to treat each column.
description_path = '../input/house-prices-advanced-regression-techniques/data_description.txt'
with open(description_path, 'r') as description_file:
    print(description_file.read())

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Remove the Id column from the working frame.
df = df.drop(columns='Id')

# Find Missing Data

In [None]:
# Percentage of missing values in every column (columns without gaps show 0).
((df.isnull().sum())/len(df))*100

In [None]:
def missing_percent(df):
    """Return the percentage of missing values per column.

    Only columns that contain at least one missing value are kept, ordered
    from the smallest to the largest percentage.
    """
    null_counts = df.isnull().sum()
    share = (null_counts / len(df)) * 100
    has_gaps = share > 0
    return share[has_gaps].sort_values()

In [None]:
nan_percent=missing_percent(df)

In [None]:
nan_percent

In [None]:
# Bar chart of the missing-value percentage per column; x labels are rotated 90 degrees.
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
# The trailing semicolon suppresses the repr of the xticks return value.
plt.xticks(rotation=90);

In [None]:
df[df['Electrical'].isnull()]

In [None]:
df[df['MasVnrType'].isnull()]

In [None]:
# Drop the rows where Electrical or MasVnrType is missing instead of imputing them.
df=df.dropna(subset=['Electrical' , 'MasVnrType'],axis=0)

In [None]:
# Recompute and re-plot the missing-value percentages after dropping the Electrical/MasVnrType rows.
nan_percent=missing_percent(df)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
plt.xticks(rotation=90);

In [None]:
df[df['BsmtFinType1'].isnull()]

In [None]:
df[df['BsmtQual'].isnull()]

In [None]:
df[df['BsmtCond'].isnull()]

In [None]:
# Numeric basement columns: replace missing values with 0.
# NOTE(review): presumably NaN here means the house has no basement — confirm against data_description.txt.
bsmt_num_cols=['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']
df[bsmt_num_cols]=df[bsmt_num_cols].fillna(0)

In [None]:
# Categorical basement columns: replace missing values with the literal string 'None',
# so that "missing" becomes its own category.
# NOTE(review): assumes NaN means "no basement" rather than unknown — confirm against data_description.txt.
bsmt_str_cols=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']
df[bsmt_str_cols]=df[bsmt_str_cols].fillna('None')

In [None]:
# Recompute and re-plot the missing-value percentages after filling the basement columns.
nan_percent=missing_percent(df)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
plt.xticks(rotation=90);

In [None]:
# Replace missing GarageYrBlt values with 0.
# NOTE(review): 0 is far outside the range of real years and GarageYrBlt stays numeric,
# so this sentinel enters the linear models as a value — confirm this is intended.
df['GarageYrBlt']=df['GarageYrBlt'].fillna(0)

In [None]:
# Categorical garage columns: replace missing values with the literal string 'None'.
# NOTE(review): assumes NaN means "no garage" — confirm against data_description.txt.
grg_str_cols=['GarageType','GarageFinish','GarageQual','GarageCond']
df[grg_str_cols]=df[grg_str_cols].fillna('None')

In [None]:
# Recompute and re-plot the missing-value percentages after filling the garage columns.
nan_percent=missing_percent(df)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
plt.xticks(rotation=90);

In [None]:
# Drop Fence, Alley, MiscFeature and PoolQC entirely rather than imputing them.
df = df.drop(columns=['Fence', 'Alley', 'MiscFeature', 'PoolQC'])

In [None]:
# Recompute and re-plot the missing-value percentages after dropping Fence/Alley/MiscFeature/PoolQC.
nan_percent=missing_percent(df)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
plt.xticks(rotation=90);

In [None]:
# Replace missing FireplaceQu values with the literal string 'None'.
# NOTE(review): assumes NaN means "no fireplace" — confirm against data_description.txt.
df['FireplaceQu']=df['FireplaceQu'].fillna('None')

In [None]:
# Recompute and re-plot the missing-value percentages after filling FireplaceQu.
nan_percent=missing_percent(df)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
plt.xticks(rotation=90);

In [None]:
# Horizontal box plots of LotFrontage, one per Neighborhood, on a tall (8x12) figure.
plt.figure(figsize=(8,12))
sns.boxplot(data=df , x='LotFrontage', y='Neighborhood')

In [None]:
# Mean LotFrontage per Neighborhood.
df.groupby('Neighborhood')['LotFrontage'].mean()

In [None]:
# LotFrontage with each gap replaced by the mean LotFrontage of the same Neighborhood.
# NOTE(review): the returned Series is only displayed by this cell — it is not assigned back to df.
df.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))

In [None]:
# Recompute and re-plot the missing-value percentages that remain at this point.
nan_percent=missing_percent(df)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index , y=nan_percent)
# Trailing semicolon added so the cell does not print the xticks return value,
# consistent with the other missing-value plots in this notebook.
plt.xticks(rotation=90);

In [None]:
# Impute missing LotFrontage with the mean LotFrontage of the row's Neighborhood.
# The transform() result has to be assigned back: computing it without storing it
# leaves df unchanged, and every gap would then simply be zero-filled below.
df['LotFrontage']=df.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))
# Fallback: a neighborhood with no LotFrontage values at all has a NaN mean,
# so anything still missing is set to 0.
df['LotFrontage']=df['LotFrontage'].fillna(0)

In [None]:
nan_percent=missing_percent(df)

In [None]:
nan_percent

# Numerical Columns to Categorical

In [None]:
df.info()

In [None]:
df['MSSubClass']

In [None]:
# Convert MSSubClass to strings so that it becomes an object column and is
# one-hot encoded with the other object columns further down.
df['MSSubClass']= df['MSSubClass'].apply(str)

In [None]:
# Verify the dtype change of MSSubClass in the column overview.
df.info()
#or: df['MSSubClass'].dtype

# Creating Dummy Variables

In [None]:
df.select_dtypes(include='object')

# Number & Object Separation

In [None]:
# Split the frame by dtype: df_num gets every non-object column, df_obj every object column.
df_num=df.select_dtypes(exclude='object')
df_obj=df.select_dtypes(include='object')
df_num.info()

In [None]:
df_obj.info()

In [None]:
# One-hot encode the object columns; drop_first=True omits the first level of each column.
df_obj=pd.get_dummies(df_obj,drop_first=True)

In [None]:
df_obj.shape

In [None]:
df_num.shape

In [None]:
# Join the numeric columns and the dummy columns side by side (axis=1) and show the resulting shape.
Final_df=pd.concat([df_num,df_obj],axis=1)
Final_df.shape

In [None]:
Final_df.head()

# Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNetCV

In [None]:
# Target is SalePrice; every other column of Final_df is a feature.
y = Final_df['SalePrice']
x = Final_df.drop(columns='SalePrice')

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=101
)

# Plain linear regression fitted on the training split, predicted on the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Test-split error metrics of the linear model; RMSE is the square root of MSE.
MAE=metrics.mean_absolute_error(y_test,y_pred)
MSE=metrics.mean_squared_error(y_test,y_pred)
RMSE=np.sqrt(MSE)

In [None]:
# One-column table of the linear model's metrics, built from a dict like the
# comparison tables further down.
pd.DataFrame({"LinearRegression": [MAE, MSE, RMSE]}, index=["MAE", "MSE", "RMSE"])


# Regularization

In [None]:
# Ridge regression with a fixed regularisation strength alpha=10.
ridge_model=Ridge(alpha=10)

In [None]:
ridge_model.fit(X_train,y_train)

In [None]:
# NOTE: this reuses the name y_pred, so the linear model's predictions are overwritten
# from here on; the linear metrics (MAE/MSE/RMSE) were computed before this point.
y_pred=ridge_model.predict(X_test)

In [None]:
# Test-split error metrics of the fixed-alpha ridge model (y_pred holds the ridge predictions here).
MAE_new=metrics.mean_absolute_error(y_test,y_pred)
MSE_new=metrics.mean_squared_error(y_test,y_pred)
RMSE_new=np.sqrt(MSE_new)

In [None]:
# Side-by-side metrics: one column per model, one row per metric.
data={'LinearRegression':[MAE,MSE,RMSE],"Ridge regression":[MAE_new,MSE_new,RMSE_new]}
pd.DataFrame(data,index=["MAE","MSE","RMSE"])

In [None]:
# Ridge with alpha selected from the candidates (0.1, 4.0, 10.0);
# candidates are scored with negative mean absolute error.
ridge_cv_model=RidgeCV(alphas=(0.1, 4.0, 10.0),scoring="neg_mean_absolute_error")

In [None]:
ridge_cv_model.fit(X_train,y_train)

In [None]:
ridge_cv_model.alpha_

In [None]:
y_pred_ridge=ridge_cv_model.predict(X_test)

In [None]:
# Test-split error metrics of the RidgeCV model.
MAE_n1=metrics.mean_absolute_error(y_test,y_pred_ridge)
MSE_n1=metrics.mean_squared_error(y_test,y_pred_ridge)
RMSE_n1=np.sqrt(MSE_n1)

In [None]:
# Comparison table extended with the RidgeCV column.
data={'LinearRegression':[MAE,MSE,RMSE],"Ridge regression":[MAE_new,MSE_new,RMSE_new],"Ridgecv":[MAE_n1,MSE_n1,RMSE_n1]}
pd.DataFrame(data,index=["MAE","MSE","RMSE"])

In [None]:
 #Lasso regression
# LassoCV with cv=5; eps=0.1 and n_alphas=100 are the settings passed for the
# alpha candidates it evaluates (see the scikit-learn LassoCV documentation for their exact meaning).
lasso_cv_model=LassoCV(eps=0.1,n_alphas=100,cv=5) 
# Fit on the training split only.
lasso_cv_model.fit(X_train,y_train)

In [None]:
lasso_cv_model.alpha_

In [None]:
y_pred_lasso=lasso_cv_model.predict(X_test)

In [None]:
# Test-split error metrics of the LassoCV model.
MAE_n2=metrics.mean_absolute_error(y_test,y_pred_lasso)
MSE_n2=metrics.mean_squared_error(y_test,y_pred_lasso)
RMSE_n2=np.sqrt(MSE_n2)
# Comparison table extended with the LassoCV column.
data={'LinearRegression':[MAE,MSE,RMSE],"Ridge regression":[MAE_new,MSE_new,RMSE_new],"Ridgecv":[MAE_n1,MSE_n1,RMSE_n1],"Lassocv":[MAE_n2,MSE_n2,RMSE_n2]}
pd.DataFrame(data,index=["MAE","MSE","RMSE"])

In [None]:
# Elastic Net with cv=5: l1_ratio is selected from the listed candidates
# (the last candidate, 1, is the pure-L1 end of the range). max_iter is set to 100000.
elastic_model=ElasticNetCV(l1_ratio=[0.1,0.5,0.7,0.9,0.95,0.99,1],cv=5,max_iter=100000)

In [None]:
# Fit on the training split, then display the l1_ratio_ attribute of the fitted model.
elastic_model.fit(X_train,y_train)
elastic_model.l1_ratio_

In [None]:
y_pred_elastic=elastic_model.predict(X_test)

In [None]:
# Test-split error metrics of the ElasticNetCV model.
MAE_n3=metrics.mean_absolute_error(y_test,y_pred_elastic)
MSE_n3=metrics.mean_squared_error(y_test,y_pred_elastic)
RMSE_n3=np.sqrt(MSE_n3)

In [None]:
# Final comparison table: one column per fitted model, one row per metric.
data={'LinearRegression':[MAE,MSE,RMSE],"Ridge regression":[MAE_new,MSE_new,RMSE_new],"Ridgecv":[MAE_n1,MSE_n1,RMSE_n1],"Lassocv":[MAE_n2,MSE_n2,RMSE_n2],"elasticnet":[MAE_n3,MSE_n3,RMSE_n3]}
pd.DataFrame(data,index=["MAE","MSE","RMSE"])