# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_object_dtype
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from xgboost import XGBRegressor

from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import StandardScaler,RobustScaler,PolynomialFeatures,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,ElasticNetCV
from sklearn.ensemble import RandomForestRegressor


Loading Dataset

In [None]:
# Read the Kaggle train and test CSVs, then keep a stacked copy of both frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)
combine = pd.concat(objs=[train, test])

In [None]:
# Dropping the 'Id' column, which this notebook treats as unnecessary for modelling.
# Re-binding with errors='ignore' keeps the cell safe to re-run; the previous
# in-place drop raised KeyError on a second execution because 'Id' was already gone.
train = train.drop(columns='Id', errors='ignore')
test = test.drop(columns='Id', errors='ignore')

# EDA,Visualization and PreProcessing

In [None]:
train.head()

In [None]:
test.head()

In [None]:
test.shape,train.shape

In [None]:
train.describe().T

In [None]:
train['SalePrice']

In [None]:
# Relation between SalePrice and the other numeric features.
# Numeric columns are selected explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0, whereas older versions silently skipped them.
correlation_num = train.select_dtypes(include='number').corr()
correlation_num = correlation_num.sort_values('SalePrice', ascending=True)
correlation_num.SalePrice

After checking the correlation of SalePrice with all features, we can see that OverallQual, GrLivArea, GarageCars and GarageArea have the strongest correlation with it.

We will confirm it by plotting a correlation heatmap below

In [None]:
# Check for correlation between features.
# Only numeric columns are passed to corr(): pandas >= 2.0 raises on string
# columns instead of silently dropping them.
plt.figure(figsize=(20, 10))
sns.heatmap(train.select_dtypes(include='number').corr(), yticklabels=True, cbar=True, cmap='ocean');

In [None]:
#Function for printing null_values and related info
def descr(train_num):
    """Build a per-column summary of a DataFrame's types and missing values.

    Parameters
    ----------
    train_num : pandas.DataFrame
        Frame to summarise (any mix of dtypes).

    Returns
    -------
    pandas.DataFrame
        One row per input column (indexed by column name) with the columns
        'dtypes', 'counts' (non-null count), 'distincts' (number of unique
        values, NaN included), 'nulls', 'missing_percent' and 'uniques'
        (array of the unique values).
    """
    no_rows = train_num.shape[0]
    types = train_num.dtypes
    counts = train_num.count()
    nulls = train_num.isnull().sum()

    # Collect the unique values column by column into an object array.
    # DataFrame.apply(lambda x: x.unique()) returns a DataFrame instead of a
    # Series whenever every column happens to have the same number of unique
    # values, which broke the summary table for such inputs.
    unique_values = np.empty(train_num.shape[1], dtype=object)
    for position in range(train_num.shape[1]):
        unique_values[position] = train_num.iloc[:, position].unique()
    uniques = pd.Series(unique_values, index=train_num.columns)

    distincts = uniques.map(len)
    nan_percent = (nulls / no_rows) * 100
    cols = {'dtypes': types, 'counts': counts, 'distincts': distincts, 'nulls': nulls,
            'missing_percent': nan_percent, 'uniques': uniques}
    return pd.DataFrame(data=cols)


In [None]:
# Summarise train column by column, expose the column names as an 'index'
# column, and list the columns with the largest share of missing values first.
details_tr = descr(train).reset_index(level=[0])
details_tr.sort_values(by='missing_percent', ascending=False)

In [None]:
# Bar chart of the percentage of missing values per train column.
# details_tr is narrowed here to the columns that actually contain NaNs.
details_tr = (
    details_tr
    .sort_values(by='missing_percent', ascending=False)
    .loc[lambda summary: summary['missing_percent'] > 0]
)

plt.figure(figsize=(10, 4), dpi=100)
sns.barplot(x='index', y='missing_percent', data=details_tr)
plt.xticks(rotation=90)
plt.show()

So we can deduce from the above bar graph that PoolQC, MiscFeature, Alley and Fence have the highest share of NaN values.

In [None]:
# Checking null values in test: per-column summary with the column names
# exposed as an 'index' column, most-missing columns first.
details_test = descr(test).reset_index(level=[0])
details_test.sort_values(by='missing_percent', ascending=False)

In [None]:
train.isnull().values.any()

In [None]:
test.isnull().values.any()

In [None]:
#From above table we know electrical has only 1 missing value so its better to replace nan with mode
train['Electrical'].mode()

# Imputing NaN Values in Train and Test


In [None]:
#Filling NaN values according to datatype and category in train dataframe
#
# Fixes over the previous version of this cell:
#  * columns are re-assigned (train[col] = train[col].fillna(...)) instead of
#    calling fillna(inplace=True) on a column selection, which is chained
#    assignment and does not update the frame under pandas Copy-on-Write;
#  * numeric basement columns are filled with 0 rather than the string 'None',
#    which would have turned a numeric column into an object column;
#  * the misleading for/else blocks (whose else branch always ran) are
#    replaced by explicit lookup tables with a default.

n = []  # details_tr row labels of the numerical columns processed
c = []  # details_tr row labels of the categorical columns processed
bsmt_str_cols =  ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
bsmt_num_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']

# Column-specific replacements; any other categorical column gets
# "NotAvailable" and any other numerical column gets its own median.
cat_fill_values = {
    'Electrical': 'SBrkr',
    'MasVnrType': 'None',
    'GarageType': 'Attchd',
    'GarageCond': 'TA',
    'GarageFinish': 'Unf',
    'GarageQual': 'TA',
    'FireplaceQu': 'None',
}
cat_fill_values.update(dict.fromkeys(bsmt_str_cols, 'None'))
num_fill_values = {'MasVnrArea': 0}
num_fill_values.update(dict.fromkeys(bsmt_num_cols, 0))

for col, col_df in details_tr.iterrows():
    row = col_df['index']  # name of the train column described by this summary row
    if pd.api.types.is_numeric_dtype(train[row]):
        n.append(col)
        fill_value = num_fill_values[row] if row in num_fill_values else train[row].median()
    else:
        c.append(col)
        fill_value = cat_fill_values.get(row, "NotAvailable")
    train[row] = train[row].fillna(fill_value)


print("\nNumerical Features   -->", len(n))
print("Categorical Features -->", len(c))
        


In [None]:
#Filling NaN values according to datatype and category in test dataframe
#
# Fixes over the previous version of this cell:
#  * columns are re-assigned instead of calling fillna(inplace=True) on a
#    column selection (chained assignment; a no-op under Copy-on-Write);
#  * basement string columns now get 'None', matching the train imputation;
#    previously the generic "NotAvailable" fill ran first, so the 'None' fill
#    never applied and train/test used different labels for the same case;
#  * numeric basement columns get 0; the old fillna('None') on them ran after
#    the median fill and so never took effect (and would have produced an
#    object column if it had).
nt = []  # details_test row labels of the numerical columns processed
ct = []  # details_test row labels of the categorical columns processed
bsmt_str_cols =  ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
bsmt_num_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']

# Column-specific replacements; any other categorical column gets
# "NotAvailable" and any other numerical column gets its own median.
cat_fill_values = {
    'Electrical': 'SBrkr',
    'MasVnrType': 'None',
    'GarageType': 'Attchd',
    'GarageCond': 'TA',
    'GarageFinish': 'Unf',
    'GarageQual': 'TA',
    'FireplaceQu': 'None',
}
cat_fill_values.update(dict.fromkeys(bsmt_str_cols, 'None'))
num_fill_values = {'MasVnrArea': 0}
num_fill_values.update(dict.fromkeys(bsmt_num_cols, 0))

for col, col_df in details_test.iterrows():
    row = col_df['index']  # name of the test column described by this summary row
    if pd.api.types.is_numeric_dtype(test[row]):
        nt.append(col)
        fill_value = num_fill_values[row] if row in num_fill_values else test[row].median()
    else:
        ct.append(col)
        fill_value = cat_fill_values.get(row, "NotAvailable")
    test[row] = test[row].fillna(fill_value)


print("\nNumerical Features   -->", len(nt))
print("Categorical Features -->", len(ct))

In [None]:
# Re-run the summary on train after imputation; the top rows show whether
# any column still has missing values.
details_tr = descr(train)
details_tr.sort_values('missing_percent', ascending=False).iloc[:5]

In [None]:
train.isnull().values.any()

In [None]:
#Checking if there are any remaining Null Values In Test
# Sorted by missing_percent (descending), like the train check, so that any
# column that still contains NaNs appears in the first rows; the previous
# sort by 'dtypes' did not surface remaining nulls.
details_test = descr(test).reset_index(level=[0])
details_test.sort_values(by='missing_percent', ascending=False).head()

In [None]:
test.isnull().values.any()

In [None]:
# Split train and test into a non-object (numerical) frame and an
# object-dtype (categorical) frame each.
train_num, train_cat = (
    train.select_dtypes(exclude='object'),
    train.select_dtypes(include='object'),
)

test_num, test_cat = (
    test.select_dtypes(exclude='object'),
    test.select_dtypes(include='object'),
)

Plotting numerical features with SalePrice

In [None]:
#Plotting numerical features with SalePrice
# sns.jointplot always creates its own figure, so the plt.figure(...) call
# that used to precede it only left one empty 10x10 figure per column behind.
# The style is also set once instead of on every iteration.
sns.set_style('whitegrid')
for i in train_num.columns:
    sns.jointplot(x=i, y='SalePrice', data=train_num)

# Plotting categorical features with SalePrice


In [None]:
#Plotting categorical features with SalePrice
# sns.jointplot always creates its own figure, so the plt.figure(...) call
# that used to precede it only left one empty 15x15 figure per column behind.
# x and y are passed as vectors because they live in two different frames;
# the former data= argument was therefore unused and has been dropped.
sns.set_style('whitegrid')
for i in train_cat.columns:
    sns.jointplot(x=train_cat[i], y=train_num['SalePrice'])

In [None]:
# Median sale price per year of sale, drawn as a line chart.
median_price_by_year = train.groupby('YrSold')['SalePrice'].median()
median_price_by_year.plot()
plt.xlabel('Year Sold')
plt.ylabel('Median House Price')
plt.title("House Price vs YearSold")

We can observe that the median SalePrice of houses has been decreasing in the most recent years of the data.

In [None]:
# Work on copies so the encoding below leaves train and test untouched.
train_map, test_map = train.copy(), test.copy()

In [None]:
train_map.head()

Mapping the Categorical Features to Numeric Codes for Model Processing

In [None]:
# Ordinal-encode every categorical column. A label's code is its rank when the
# labels are ordered by mean SalePrice in the training data (0 = lowest mean).
#
# The mapping learned on train is re-used for test. Previously test was encoded
# independently, ranking labels by their mean LotFrontage within test, so the
# same label could receive different codes in the two frames and the model
# saw inconsistent features at prediction time.
for feature in train_map.select_dtypes(include="object").columns:
    labels_ordered = train_map.groupby(feature)['SalePrice'].mean().sort_values().index
    labels_ordered = {k: i for i, k in enumerate(labels_ordered)}
    # Labels that occur only in test have no learned code; they fall back to
    # the code of the most frequent training label. Computed before the train
    # column is overwritten with codes.
    fallback_code = labels_ordered[train_map[feature].mode().iloc[0]]
    train_map[feature] = train_map[feature].map(labels_ordered)
    if feature in test_map.columns:
        test_map[feature] = (
            test_map[feature].map(labels_ordered).fillna(fallback_code).astype(int)
        )

In [None]:
test_map.head()

In [None]:
train_map.head()

# Feature Scaling

In [None]:

# Drop the four columns that the missing-value plot showed to have the most NaNs.
# errors="ignore" keeps the cell re-runnable; the plain drop raised KeyError
# on a second execution because the columns were already gone.
sparse_cols = ["PoolQC", "MiscFeature", "Alley", "Fence"]
test_map = test_map.drop(columns=sparse_cols, errors="ignore")
train_map = train_map.drop(columns=sparse_cols, errors="ignore")

# Features and target. Every training row is kept: the previous version also
# dropped the last row of train_map, discarding a sample without any stated
# reason.
X = train_map.drop(columns=["SalePrice"])
Y = train_map['SalePrice']

#Train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=101)


# Standard scaling our data: statistics come from the training split only and
# are then applied to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train.shape , Y_train.shape,X_test.shape,Y_test.shape

# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge,RidgeCV

# Baseline ridge regression with a hand-picked penalty of 100.
rid_reg = Ridge(alpha=100)
rid_reg.fit(X_train, Y_train)
Y_pred = rid_reg.predict(X_test)

# testing the model
baseline_ridge_mse = mean_squared_error(Y_test, Y_pred)
for label, value in (
    ("MAE : ", mean_absolute_error(Y_test, Y_pred)),
    ('R2 SCORE : ', r2_score(Y_test, Y_pred)),
    ('Score :', rid_reg.score(X_test, Y_test)),
    ('MSE :', baseline_ridge_mse),
    ('RMSE :', np.sqrt(baseline_ridge_mse)),
):
    print(label, value)

**HyperParameterTuning For alpha**

In [None]:
# let's find best values for alpha by crossvalidating
# * `normalize` was removed from RidgeCV in scikit-learn 1.2 and is not needed
#   here because X_train has already been standardised.
# * The search now covers 50 log-spaced candidates between the two original
#   end points (0.01 and 400) instead of only those two values.
ridgecv = RidgeCV(alphas=np.logspace(np.log10(0.01), np.log10(400.0), 50),
                  scoring='neg_mean_squared_error')
ridgecv.fit(X_train, Y_train)
ridgecv.alpha_

In [None]:
# Create the Ridge model using the alpha selected by the RidgeCV search.
# The value is read from the fitted search object instead of being copied by
# hand, so the cell stays correct when the data or the search grid changes.
# (Ridge is already imported in the baseline ridge cell; the duplicate import
# that used to sit here has been removed.)
rid_reg = Ridge(alpha=ridgecv.alpha_)
rid_reg.fit(X_train, Y_train)

Y_pred_ridge = rid_reg.predict(X_test)

# testing the model

ridge_mae = mean_absolute_error(Y_test, Y_pred_ridge)
ridge_r2_score = r2_score(Y_test, Y_pred_ridge)
ridge_mse = mean_squared_error(Y_test, Y_pred_ridge)
ridge_rmse = np.sqrt(ridge_mse)

print("MAE for Ridge : ",ridge_mae)
print('R2 SCORE for Ridge: ',ridge_r2_score)
print('Score for Ridge:',rid_reg.score(X_test,Y_test))
print("MSE for Ridge : ",ridge_mse)
print("RMSE for Ridge : ",ridge_rmse)

In [None]:
Y_pred_ridge.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the ridge model, with a fitted
# trend line. Axis labels and a title are set so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=Y_pred_ridge, y=Y_test, color='springgreen', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='Ridge: predicted vs. actual');

# Lasso Regression

In [None]:
# Baseline lasso regression with a hand-picked penalty of 0.8.
from sklearn.linear_model import Lasso,LassoCV
ls = Lasso(alpha=0.8)
ls.fit(X_train, Y_train)
Y_pred = ls.predict(X_test)

# testing the model
baseline_lasso_mse = mean_squared_error(Y_test, Y_pred)
for label, value in (
    ("MAE : ", mean_absolute_error(Y_test, Y_pred)),
    ('R2 SCORE : ', r2_score(Y_test, Y_pred)),
    ('Score :', ls.score(X_test, Y_test)),
    ('MSE :', baseline_lasso_mse),
    ('RMSE :', np.sqrt(baseline_lasso_mse)),
):
    print(label, value)

Hyper Parameter Tuning For alpha

In [None]:
#1. LASSOCV
# `normalize` was removed from LassoCV in scikit-learn 1.2 and is not needed
# here because X_train has already been standardised.
lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000)
lassocv.fit(X_train, Y_train)

# Refit the existing lasso model with the cross-validated alpha and report
# its hold-out mean squared error.
ls.set_params(alpha=lassocv.alpha_)
ls.fit(X_train, Y_train)
mean_squared_error(Y_test, ls.predict(X_test))

In [None]:
# Create the Lasso model using the alpha selected by the LassoCV search.
# The value is read from the fitted search object instead of a hand-copied
# constant, which silently went stale whenever the data or search changed.

ls = Lasso(alpha=lassocv.alpha_)
ls.fit(X_train, Y_train)

Y_pred_lasso = ls.predict(X_test)

# testing the model
lasso_mae = mean_absolute_error(Y_test, Y_pred_lasso)
lasso_r2_score = r2_score(Y_test, Y_pred_lasso)
lasso_mse = mean_squared_error(Y_test, Y_pred_lasso)
lasso_rmse = np.sqrt(lasso_mse)

print("MAE for Lasso : ",lasso_mae)
print('R2 SCORE for Lasso : ',lasso_r2_score)
print('Score for Lasso:',ls.score(X_test,Y_test))
print('MSE for Lasso :',lasso_mse)
print('RMSE for Lasso :',lasso_rmse)

In [None]:
Y_pred_lasso.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the lasso model, with a fitted
# trend line. Axis labels and a title are set so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=Y_pred_lasso, y=Y_test, color='darkorchid', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='Lasso: predicted vs. actual');

# Polynomial Regression

In [None]:


#poly converter: all degree-2 terms of the scaled features, no bias column
polynomial_converter = PolynomialFeatures(degree=2,include_bias=False)

#fit the converter on the training split only, then apply it to both splits.
#The hold-out split must only be transformed; re-fitting a preprocessing step
#on it (as the previous fit_transform call did) is the pattern that leaks
#test information into the pipeline.
poly_features_train = polynomial_converter.fit_transform(X_train)
poly_features_test = polynomial_converter.transform(X_test)

In [None]:
# Fit a cross-validated elastic net on the polynomial training features.
# With l1_ratio=1 the penalty is a pure L1 (lasso) penalty.
elastic_model = ElasticNetCV(tol=0.01, l1_ratio=1)
elastic_model.fit(X=poly_features_train, y=Y_train)

In [None]:
# Predict hold-out sale prices from the polynomial test features.
Y_pred_poly = elastic_model.predict(poly_features_test)

In [None]:
# Hold-out error metrics for the polynomial model.
poly_mse = mean_squared_error(Y_test, Y_pred_poly)
poly_mae, poly_r2_score, poly_rmse = (
    mean_absolute_error(Y_test, Y_pred_poly),
    r2_score(Y_test, Y_pred_poly),
    np.sqrt(poly_mse),
)
for label, value in (
    ("MAE for Polynomial: ", poly_mae),
    ('R2 SCORE for Polynomial: ', poly_r2_score),
    ('MSE for Polynomial :', poly_mse),
    ('RMSE for Polynomial :', poly_rmse),
):
    print(label, value)

In [None]:
Y_pred_poly.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the polynomial model, with a
# fitted trend line. Axis labels and a title make the figure self-explanatory.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=Y_pred_poly, y=Y_test, color='coral', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='Polynomial: predicted vs. actual');

# Linear Regression

In [None]:
# Create the LinearRegression model.
# `normalize` was removed from LinearRegression in scikit-learn 1.2 (passing it
# raises TypeError) and is unnecessary here: X_train is already standardised.
lin_reg = LinearRegression()
lin_reg.fit(X_train,Y_train)

In [None]:
# Predictions on both splits, then hold-out error metrics for the linear model.
train_pred_lin = lin_reg.predict(X_train)
test_pred_lin = lin_reg.predict(X_test)

linear_mse = mean_squared_error(Y_test, test_pred_lin)
linear_mae, linear_r2_score, linear_rmse = (
    mean_absolute_error(Y_test, test_pred_lin),
    r2_score(Y_test, test_pred_lin),
    np.sqrt(linear_mse),
)
for label, value in (
    ("MAE for Linear : ", linear_mae),
    ('R2 SCORE for Linear : ', linear_r2_score),
    ('Score for Linear:', lin_reg.score(X_test, Y_test)),
    ('MSE for Linear :', linear_mse),
    ('RMSE for Linear :', linear_rmse),
):
    print(label, value)

In [None]:
test_pred_lin.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the linear model, with a fitted
# trend line. Axis labels and a title are set so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=test_pred_lin, y=Y_test, color='blue', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='Linear: predicted vs. actual');

# Random Forest Regression

In [None]:
# Create the RandomForestRegressor model.
# random_state is fixed so that the forest, and therefore every metric derived
# from it, is reproducible across kernel restarts.
RF_reg = RandomForestRegressor(n_estimators=1000, random_state=101)
RF_reg.fit(X_train,Y_train)

test_pred_RF = RF_reg.predict(X_test)
train_pred_RF= RF_reg.predict(X_train)

In [None]:
# Hold-out error metrics for the random forest.
RF_mse = mean_squared_error(Y_test, test_pred_RF)
RF_mae, RF_r2_score, RF_rmse = (
    mean_absolute_error(Y_test, test_pred_RF),
    r2_score(Y_test, test_pred_RF),
    np.sqrt(RF_mse),
)
for label, value in (
    ("MAE for RF : ", RF_mae),
    ('R2 SCORE for RF : ', RF_r2_score),
    ('Score for RF:', RF_reg.score(X_test, Y_test)),
    ('MSE for RF :', RF_mse),
    ('RMSE for RF :', RF_rmse),
):
    print(label, value)

In [None]:
test_pred_RF.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the random forest, with a
# fitted trend line. Axis labels and a title make the figure self-explanatory.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=test_pred_RF, y=Y_test, color='olivedrab', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='Random Forest: predicted vs. actual');

# SVM Regression

In [None]:
# RBF-kernel support vector regressor with a very large C and a narrow
# epsilon tube, followed by predictions on both splits.
svm_reg = SVR(C=1_000_000, epsilon=0.001, kernel='rbf')
svm_reg.fit(X_train, Y_train)

train_pred_svm = svm_reg.predict(X_train)
test_pred_svm = svm_reg.predict(X_test)

In [None]:
#Testing the model
# The printed labels previously said "RF" (copied from the random-forest cell)
# although the numbers belong to the SVM model.
SVM_mae = mean_absolute_error(Y_test, test_pred_svm)
SVM_r2_score= r2_score(Y_test, test_pred_svm)
SVM_mse = mean_squared_error(Y_test, test_pred_svm)
SVM_rmse = np.sqrt(SVM_mse)
print("MAE for SVM : ",SVM_mae)
print('R2 SCORE for SVM : ',SVM_r2_score)
print('Score for SVM:',svm_reg.score(X_test,Y_test))
print('MSE for SVM :',SVM_mse)
print('RMSE for SVM :',SVM_rmse)

In [None]:
test_pred_svm.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the SVM model, with a fitted
# trend line. Axis labels and a title are set so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=test_pred_svm, y=Y_test, color='lightseagreen', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='SVM: predicted vs. actual');

# ElasticNet Regression

In [None]:

from sklearn.linear_model import ElasticNet

# Elastic net with a 90% L1 share and seeded random coordinate selection,
# followed by predictions on both splits.
enet_reg = ElasticNet(
    alpha=0.1,
    l1_ratio=0.9,
    selection='random',
    random_state=42,
)
enet_reg.fit(X_train, Y_train)

train_pred_enet = enet_reg.predict(X_train)
test_pred_enet = enet_reg.predict(X_test)

In [None]:
#Testing the Model
# The printed labels previously said "RF" (copied from the random-forest cell)
# although the numbers belong to the ElasticNet model.
ENET_mae = mean_absolute_error(Y_test, test_pred_enet)
ENET_r2_score= r2_score(Y_test, test_pred_enet)
ENET_mse = mean_squared_error(Y_test, test_pred_enet)
ENET_rmse = np.sqrt(ENET_mse)
print("MAE for ElasticNet : ",ENET_mae)
print('R2 SCORE for ElasticNet : ',ENET_r2_score)
print('Score for ElasticNet:',enet_reg.score(X_test,Y_test))
print('MSE for ElasticNet :',ENET_mse)
print('RMSE for ElasticNet :',ENET_rmse)

In [None]:
test_pred_enet.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the ElasticNet model, with a
# fitted trend line. Axis labels and a title make the figure self-explanatory.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=test_pred_enet, y=Y_test, color='tomato', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='ElasticNet: predicted vs. actual');

# SGD Regression

In [None]:
from sklearn.linear_model import SGDRegressor
## Create the SGDRegressor model
# SGD shuffles the training data between epochs, so random_state is fixed to
# make the fitted coefficients and the reported metrics reproducible.
sgd_reg = SGDRegressor(n_iter_no_change=250, penalty=None, eta0=0.0001,
                       max_iter=100000, random_state=42)
sgd_reg.fit(X_train, Y_train)

test_pred_sgd = sgd_reg.predict(X_test)
train_pred_sgd = sgd_reg.predict(X_train)

In [None]:
#Testing the Model
# The printed labels previously said "RF" (copied from the random-forest cell)
# although the numbers belong to the SGD model.
SGD_mae = mean_absolute_error(Y_test, test_pred_sgd)
SGD_r2_score= r2_score(Y_test, test_pred_sgd)
SGD_mse = mean_squared_error(Y_test, test_pred_sgd)
SGD_rmse = np.sqrt(SGD_mse)
print("MAE for SGD : ",SGD_mae)
print('R2 SCORE for SGD : ',SGD_r2_score)
print('Score for SGD:',sgd_reg.score(X_test,Y_test))
print('MSE for SGD :',SGD_mse)
print('RMSE for SGD :',SGD_rmse)

In [None]:
test_pred_sgd.min()

In [None]:
# Predicted vs. actual hold-out sale prices for the SGD model, with a fitted
# trend line. Axis labels and a title are set so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(x=test_pred_sgd, y=Y_test, color='yellow', ax=ax)
ax.set(xlabel='Predicted SalePrice', ylabel='Actual SalePrice', title='SGD: predicted vs. actual');

# Comparing All Regressions

In [None]:
# One record per model: (name, MAE, R2, RMSE), all measured on the hold-out split.
model_scores = [
    ('Ridge', ridge_mae, ridge_r2_score, ridge_rmse),
    ('Lasso', lasso_mae, lasso_r2_score, lasso_rmse),
    ('Polynomial', poly_mae, poly_r2_score, poly_rmse),
    ('Linear', linear_mae, linear_r2_score, linear_rmse),
    ('SVM', SVM_mae, SVM_r2_score, SVM_rmse),
    ('RandomForest', RF_mae, RF_r2_score, RF_rmse),
    ('ElasticNet', ENET_mae, ENET_r2_score, ENET_rmse),
    ('SGD', SGD_mae, SGD_r2_score, SGD_rmse),
]
models = pd.DataFrame(
    model_scores,
    columns=['Regression Model', 'MAE Score', 'R2 Score', 'RMSE'],
)
print("-----------MODEL EVALUATION-----------")
# Lowest mean absolute error first.
models.sort_values(by='MAE Score', ascending=True)

In [None]:
models.sort_values(by='RMSE', ascending=True)

In [None]:
# Horizontal bar chart of the R2 score per model.
# The index is set on a temporary copy: the previous in-place set_index
# consumed the 'Regression Model' column, so re-running the cell raised KeyError.
models.set_index('Regression Model')['R2 Score'].plot(kind='barh', figsize=(10, 6));

* From the above data we can infer that RandomForestRegressor is performing best, so we will use it for the final prediction.

# Applying Model On Test Data

In [None]:
# Refit the selected model on all training rows and predict the test set.
# * The hyper-parameters match the forest evaluated earlier (1000 trees) and
#   random_state is fixed, so the submission is reproducible and corresponds
#   to the model that was actually compared.
# * Test columns are selected in the training column order so each feature
#   reaches the model in the position it was fitted on.
Model = RandomForestRegressor(n_estimators=1000, random_state=101)
Model.fit(X,Y)
Prediction = Model.predict(test_map[X.columns])
Prediction

In [None]:
# Load the submission template and overwrite its SalePrice column with the
# model's predictions.
sample = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
).assign(SalePrice=Prediction)

In [None]:
sample

In [None]:
# Write the submission file to the working directory without the row index.
sample.to_csv('Submission.csv',index=False)