In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import Ridge, LinearRegression, SGDRegressor, Lasso, ElasticNet
from xgboost import XGBRegressor

# To avoid unnecessary warnings
# NOTE(review): with no category or module given, this filter silences every warning
# for the rest of the notebook, including deprecation warnings from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Data preparation and preprocessing

In [None]:
# Read the competition's training and test CSV files into DataFrames
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Preview the first rows of the training frame
train_data.head()

In [None]:
# 'Id' is only a row identifier, so remove it from both frames before modelling
train_data = train_data.drop(columns=['Id'])
test_data = test_data.drop(columns=['Id'])

In [None]:
# Report the dimensions of the training frame
n_rows, n_cols = train_data.shape

print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

In [None]:
# Summary of the training frame: column dtypes and non-null counts
train_data.info()

In [None]:
# Keep only the int64 / float64 columns in both frames (non-numeric columns are discarded)
numeric_dtypes = ['int64', 'float64']
train_data = train_data.select_dtypes(include=numeric_dtypes)
test_data = test_data.select_dtypes(include=numeric_dtypes)

In [None]:
# Preview the training frame after restricting it to numeric columns
train_data.head()

In [None]:
# Exploratory data analysis (disabled): per-column histograms to inspect the feature distributions
# train_data.hist(figsize =(30,20), color = 'green', alpha = 0.5);

In [None]:
# Count the missing values in each training column
train_data.isna().sum()

In [None]:
# Count the missing values in each test column
test_data.isna().sum()

### Replacing null values.

In [None]:
# Fill missing test values with the medians computed on the TRAINING data, so that the
# imputation statistics come from train only (the same convention the scaler follows:
# fitted on train, then applied to test). fillna aligns the medians on column names, so
# the median of a train-only column such as 'SalePrice' is simply ignored.
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
test_data = test_data.fillna(train_data.median())

# Verify that no missing values remain in the test frame
test_data.isnull().sum()

In [None]:
# Replace missing values in these three training columns with each column's own median
for col in ('LotFrontage', 'GarageYrBlt', 'MasVnrArea'):
    train_data[col] = train_data[col].fillna(train_data[col].median())

Both cells serve the same purpose, filling null values with a median; they are simply written in two different ways.

In [None]:
# Number of fully duplicated rows in the training frame
train_data.duplicated().sum()

In [None]:
# Positional split: every column except the last becomes the features, the last column the target
# (presumably 'SalePrice' is the last column; verify the column order)
x, y = train_data.iloc[:, :-1], train_data.iloc[:, -1]

## Skewness

In [None]:
# Collect the columns whose sample skewness exceeds 0.5 (right-skewed columns)
skewed_features = []
for column in train_data.columns:
    if train_data[column].skew() > 0.5:
        skewed_features.append(column)

print(len(skewed_features))

In [None]:
# Apply log(1 + value) to every skewed training column
train_data[skewed_features] = np.log1p(train_data[skewed_features])

Taking the log makes the data **less** skewed

In [None]:
# Exclude the target from the list before the list is applied to the test frame.
# Filtering, unlike list.remove, does not raise ValueError when 'SalePrice' is absent,
# so this cell can be re-run without failing.
skewed_features = [col for col in skewed_features if col != 'SalePrice']

In [None]:
# Apply the same log(1 + value) transform to the matching test columns
test_data[skewed_features] = np.log1p(test_data[skewed_features])

In [None]:
# from sklearn.model_selection import train_test_split
# x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
# Separate the target column from the feature columns of the training frame
y = train_data['SalePrice']
x = train_data.drop(columns='SalePrice')

## Scaling

In [None]:
# Standardise the training features: fit the scaler on x, then replace x with the scaled values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)

Many machine learning algorithms, such as linear regression, perform better when numerical input variables are scaled to a standard range.

In [None]:
# Scale the test features with the scaler that was fitted on the training features.
# NOTE(review): test_data is overwritten with the transformed output, so running this
# cell a second time would scale the data twice.
test_data = sc.transform(test_data)

# Linear Regression 

In [None]:
# 10-fold cross-validation of ordinary least squares, scored with negated RMSE and negated MSE
lr = LinearRegression()
cv_score = cross_validate(
    estimator=lr,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

In [None]:
# Inspect the distribution of the target. sns.distplot is deprecated in seaborn; a histogram
# on a density scale with a KDE overlay is its documented replacement.
# Note: y is taken from train_data after the log1p cell, so this is the transformed target.
sns.histplot(y, kde=True, stat='density');

Distribution shows positive skewness

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the whole training matrix and show the fitted estimator
model = LinearRegression().fit(x, y)
model

In [None]:
# Coefficient of determination (R^2) on the same data the model was fitted on
model.score(x, y)

In [None]:
# In-sample predictions: x is the same matrix the model was fitted on
y_pred = model.predict(x)
y_pred

In [None]:
# R^2 of the in-sample predictions y_pred against the target y
r2_score(y, y_pred)

In [None]:
# In-sample RMSE. The `squared=False` argument of mean_squared_error is deprecated and
# removed in recent scikit-learn releases, so take the square root of the MSE explicitly.
np.sqrt(mean_squared_error(y, y_pred))

In [None]:
# NOTE(review): despite the name, these predictions are made on the same x used for
# fitting, not on a held-out validation set.
valid_pred = model.predict(x)

In [None]:
# R^2 of valid_pred against the target y
r2_score(y,valid_pred)

# SGDRegressor

In [None]:
# 10-fold cross-validation of a linear model trained by stochastic gradient descent,
# scored with negated RMSE and negated MSE.
# SGD shuffles the training data, so a fixed random_state is set to make the scores
# reproducible from one run of the notebook to the next.
lr = SGDRegressor(random_state=0)
cv_score = cross_validate(lr, x, y,
                          cv=10,
                          scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'])
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

# Ridge

In [None]:
# 10-fold cross-validation of Ridge regression with its default settings,
# scored with negated RMSE and negated MSE
lr = Ridge()
cv_score = cross_validate(
    estimator=lr,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

# Lasso with the default regularization strength (alpha = 1.0)

In [None]:
# 10-fold cross-validation of Lasso with its default penalty strength,
# scored with negated RMSE and negated MSE
lr = Lasso()
cv_score = cross_validate(
    estimator=lr,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

# Lasso with a weaker regularization strength (alpha = 0.001)

In [None]:
# 10-fold cross-validation of Lasso with the penalty strength set to alpha = 0.001,
# scored with negated RMSE and negated MSE
lr = Lasso(alpha=0.001)
cv_score = cross_validate(
    estimator=lr,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score


In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

# ElasticNet

In [None]:
# 10-fold cross-validation of ElasticNet with its default settings,
# scored with negated RMSE and negated MSE
lr = ElasticNet()
cv_score = cross_validate(
    estimator=lr,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

# Kernel SVM (Linear, Poly, RBF)

In [None]:
from sklearn.svm import SVR

In [None]:
# 10-fold cross-validation of a support-vector regressor with a linear kernel (C = 1),
# scored with negated RMSE and negated MSE
svr_linear_model = SVR(C=1, kernel='linear')
cv_score = cross_validate(
    estimator=svr_linear_model,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

In [None]:
# 10-fold cross-validation of a support-vector regressor with a polynomial kernel (C = 1),
# scored with negated RMSE and negated MSE.
# NOTE: the variable keeps the name 'svr_linear_model' even though the kernel is 'poly'.
svr_linear_model = SVR(C=1, kernel='poly')
cv_score = cross_validate(
    estimator=svr_linear_model,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

In [None]:
# 10-fold cross-validation of a support-vector regressor with an RBF kernel (C = 1),
# scored with negated RMSE and negated MSE.
# NOTE: the variable keeps the name 'svr_linear_model' even though the kernel is 'rbf'.
svr_linear_model = SVR(C=1, kernel='rbf')
cv_score = cross_validate(
    estimator=svr_linear_model,
    X=x,
    y=y,
    scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'],
    cv=10,
)
cv_score

In [None]:
 # Average of the per-fold negated RMSE scores (closer to 0 is better)
 np.mean(cv_score['test_neg_root_mean_squared_error'])

In [None]:
# Fit an RBF-kernel SVR (C = 1) on the whole training matrix and show the fitted estimator.
# NOTE: the variable keeps the name 'svr_linear_model' even though the kernel is 'rbf'.
svr_linear_model = SVR(C=1, kernel='rbf').fit(x, y)
svr_linear_model

## Finally, fit the chosen model (Lasso) on the full training data to produce the final results

In [None]:
# Train Lasso (alpha = 0.001) on the whole training matrix and show the fitted estimator
LR = Lasso(alpha=0.001).fit(x, y)
LR

In [None]:
# Predict with LR, the Lasso model that the notebook names as its final choice and fits
# on the full training data. The original cell predicted with the RBF SVR instead, which
# left the Lasso model unused. The target was modelled on a log1p scale, so expm1 maps
# the predictions back to the original price scale.
y_pred = np.expm1(LR.predict(test_data))
y_pred

In [None]:
# Assemble the submission table. The 'Id' column was dropped from test_data earlier, so
# the ids are regenerated here (assumes the test ids are consecutive and start at 1461; verify).
first_id = 1461
submission = pd.DataFrame({
    'Id': range(first_id, first_id + len(test_data)),
    'SalePrice': y_pred,
})

In [None]:
# Write the submission table to 'submission.csv' in the working directory, without the index column
submission.to_csv('submission.csv', index = False)