In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.feature_selection import chi2
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import scipy.stats
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Read the competition's train and test tables into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Print pandas' per-column overview (dtype and non-null count) of the training frame.
train_df.info()

In [None]:
# Summary statistics for the columns that are neither int nor float typed.
train_df.describe(exclude = ['int', 'float'])

In [None]:
# Summary statistics for every column whose dtype is not object.
train_df.describe(exclude = 'object')

In [None]:
# Fraction of missing values per column; display only the columns where
# more than half of the rows are missing.
missings = train_df.isna().sum().div(len(train_df))
missings.loc[missings.gt(0.5)]

In [None]:
# Remove the columns the original analysis singled out as mostly missing
# (noted there as more than 80% missing) from both the train and test frames.
mostly_missing = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
for frame in (train_df, test_df):
    frame.drop(columns = mostly_missing, inplace = True)

In [None]:
# Partition the training columns by dtype: object columns are treated as
# categorical, everything else as numerical. Their relationship with the
# target variable is explored in the following cells.
numerical_features, categorical_features = [], []
for col, col_dtype in train_df.dtypes.items():
    if col_dtype == 'object':
        categorical_features.append(col)
    else:
        numerical_features.append(col)

In [None]:
# Check whether SalePrice is normally distributed (Shapiro-Wilk test)
from scipy.stats import shapiro
def check_normality(data, alpha = 0.05):
    """Run a Shapiro-Wilk test on `data` and print the statistic and verdict.

    Parameters
    ----------
    data : array-like of numbers
        Sample to test. Missing values (NaN) are removed before testing,
        because they would otherwise turn both the statistic and the p-value
        into NaN and force a meaningless "Not Normal." verdict.
    alpha : float, default 0.05
        Significance level; the sample is reported as normal when the
        p-value is greater than `alpha`.

    Returns
    -------
    None
        The result is only printed.
    """
    values = np.asarray(data, dtype = float)
    values = values[~np.isnan(values)]
    # The Shapiro-Wilk test needs at least three observations.
    if values.size < 3:
        print("Not enough non-missing values for the Shapiro-Wilk test.")
        return
    stat, p = shapiro(values)
    print("stat = %.2f, P-Value = %.2f" % (stat, p))
    if p > alpha:
        print("Normal Distribution")
    else:
        print("Not Normal.")
# Apply the Shapiro-Wilk check to the raw (untransformed) target column.
check_normality(train_df["SalePrice"])

In [None]:
# The test above is read as SalePrice not following a normal distribution;
# plot its histogram with a KDE overlay to inspect the shape.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density')
# is its documented replacement.
sns.histplot(train_df['SalePrice'], kde = True, stat = 'density')
plt.show()

In [None]:
# SalePrice looks positively skewed, so plot it again after a log1p
# transformation to see whether that makes the distribution more symmetric.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density')
# is its documented replacement.
sns.histplot(np.log1p(train_df['SalePrice']), kde = True, stat = 'density')
plt.show()

In [None]:
# Repeat the Shapiro-Wilk check for every numerical column.
separator = "============================="
for col in numerical_features:
    print(f"shapiro-wilk test for {col}")
    check_normality(train_df[col])
    print(separator)

In [None]:
# The per-column tests above are read here as showing that the numerical
# features are not normally distributed either, so they will need to be
# transformed as well.
# Next, rank the numerical columns by their correlation with the target.
numeric_corr = train_df[numerical_features].corr()
numeric_corr['SalePrice'].sort_values(ascending = False)

In [None]:
# Annotated heatmap of the pairwise correlations between numerical columns.
fig, ax = plt.subplots(figsize = (25, 25))
sns.heatmap(train_df[numerical_features].corr(), annot = True, ax = ax)
plt.show()

In [None]:
# Exclude the identifier column and the features the analysis above chose to
# remove as correlated with others: Id, GarageYrBlt, GarageArea, 1stFlrSF.
dropped_numerical = {'Id', 'GarageYrBlt', 'GarageArea', '1stFlrSF'}
# Rebuild the list instead of calling list.remove() so that re-running this
# cell is a no-op rather than a ValueError on the already-removed names.
numerical_features = [col for col in numerical_features if col not in dropped_numerical]

In [None]:
# Target vector and numerical design matrix (target column removed).
target_name = 'SalePrice'
y = train_df[target_name]
X = train_df[numerical_features].drop(columns = target_name)

In [None]:
# Add square, cube and square-root terms of OverallQual as extra features.
quality = X['OverallQual']
engineered = {
    'OverallQual^2': quality ** 2,
    'OverallQual^3': quality ** 3,
    'OverallQual^1/2': np.sqrt(quality),
}
for new_name, new_values in engineered.items():
    X[new_name] = new_values

In [None]:
# Collect the features whose skewness exceeds 0.5; these are the ones that
# get a log transformation in the next step.
skewed_features = []
for col in X.columns:
    if X[col].skew() > 0.5:
        skewed_features.append(col)
print(len(skewed_features))

In [None]:
# Log-transform (log1p) the target and every skewed feature column.
y = y.map(np.log1p)
for col in skewed_features:
    X[col] = np.log1p(X[col])

In [None]:
# Replace missing values in each column with that column's median.
column_medians = X.median()
X = X.fillna(column_medians)

In [None]:
# Select the numerical test columns, leaving out the same four columns that
# were removed from the training feature list.
excluded_numerical = ['Id', 'GarageYrBlt', 'GarageArea', '1stFlrSF']
test_numerical = [
    col for col in test_df.columns
    if (test_df[col].dtype != 'object') and (col not in excluded_numerical)
]
# Take an explicit copy: X_test gets new columns and in-place fills later on,
# and doing that on a slice of test_df triggers SettingWithCopyWarning.
X_test = test_df[test_numerical].copy()
# NOTE(review): at this point X is already log-transformed, imputed and has the
# extra OverallQual columns, while X_test is still raw, so the stacked frame
# mixes two different scales (and has NaN for the columns X_test lacks).
all_features = pd.concat([X, X_test])

In [None]:
# Standardize with statistics learned from the training matrix only.
# The stacked train+test frame built earlier combines log-transformed, imputed
# training rows with raw test rows, so its means and variances describe
# neither set; fitting on test rows also leaks test information into the model.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Now apply 5-fold cross-validation to a linear regression model
def LinearRegCV(features, target):
    """Cross-validate an ordinary least-squares model with 5 folds.

    Returns the dictionary produced by `cross_validate`, scored with negated
    MAE, negated MSE, negated RMSE and R^2.
    """
    metrics = ['neg_mean_absolute_error',
               'neg_mean_squared_error',
               'neg_root_mean_squared_error',
               'r2']
    return cross_validate(LinearRegression(),
                          features,
                          target,
                          cv = 5,
                          scoring = metrics)

In [None]:
# Cross-validate linear regression and show the mean negated RMSE over the folds.
LR_scores = LinearRegCV(features = X, target = y)
np.mean(LR_scores['test_neg_root_mean_squared_error'])

In [None]:
# Print the fold-averaged negated MAE, MSE and RMSE, in that order.
for metric_key in ('test_neg_mean_absolute_error',
                   'test_neg_mean_squared_error',
                   'test_neg_root_mean_squared_error'):
    print(LR_scores[metric_key].mean())

In [None]:
# Mirror the training-set preprocessing on the test features: add the
# OverallQual terms, log-transform the skewed columns, impute with the test
# medians and apply the fitted scaler.
# assign()/fillna() return new frames, so this never mutates a slice of
# test_df in place (which is what raised SettingWithCopyWarning before).
X_test = X_test.assign(**{
    'OverallQual^2': X_test['OverallQual'] ** 2,
    'OverallQual^3': X_test['OverallQual'] ** 3,
    'OverallQual^1/2': np.sqrt(X_test['OverallQual']),
})
X_test[skewed_features] = np.log1p(X_test[skewed_features])
X_test = X_test.fillna(X_test.median())
X_test = scaler.transform(X_test)

In [None]:
# Fit plain linear regression on the full training set and map its
# predictions from log1p space back to prices.
lr = LinearRegression()
lr.fit(X, y)
lr_log_pred = lr.predict(X_test)
y_hat = np.expm1(lr_log_pred)
y_hat

In [None]:
# Search the ridge penalty over a fixed grid with 5-fold CV, scored by negated RMSE.
ridge_alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
ridgeCV = RidgeCV(alphas = ridge_alphas, cv = 5, scoring = 'neg_root_mean_squared_error')
ridgeCV.fit(X, y)

In [None]:
# Penalty strength selected by the ridge cross-validation search.
ridgeCV.alpha_

In [None]:
# Refit ridge regression with the penalty chosen by the CV search instead of
# a value copied by hand, so the model stays in sync if the search result changes.
RidgeReg = Ridge(alpha = ridgeCV.alpha_)
RidgeReg.fit(X, y)

In [None]:
# Ridge predictions, mapped from log1p space back to prices.
ridge_log_pred = RidgeReg.predict(X_test)
y_hat_ridge = np.expm1(ridge_log_pred)
y_hat_ridge

In [None]:
# Search the lasso penalty over a fixed grid with 5-fold CV.
lasso_alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01,
                0.03, 0.06, 0.1, 0.3, 0.6, 1]
lassoCV = LassoCV(alphas = lasso_alphas, cv = 5, max_iter = 5000)
lassoCV.fit(X, y)

In [None]:
# Penalty strength selected by the lasso cross-validation search.
lassoCV.alpha_

In [None]:
# Number of features the lasso kept, i.e. coefficients that are not zero.
# Comparing with `> 0` would miss every retained feature with a negative weight.
int(np.count_nonzero(lassoCV.coef_))

In [None]:
# Refit the lasso with the penalty chosen by the CV search instead of a value
# copied by hand; max_iter matches the setting used during the search.
lassoReg = Lasso(alpha = lassoCV.alpha_, max_iter = 5000)
lassoReg.fit(X, y)

In [None]:
# Lasso predictions, mapped from log1p space back to prices.
lasso_log_pred = lassoReg.predict(X_test)
y_hat_lasso = np.expm1(lasso_log_pred)
y_hat_lasso

In [None]:
def modelCV(features, target, model, cv = 5):
    """Cross-validate an arbitrary estimator on a fixed set of regression metrics.

    Parameters
    ----------
    features : array-like
        Design matrix passed to `cross_validate`.
    target : array-like
        Target values passed to `cross_validate`.
    model : estimator
        Unfitted estimator to evaluate.
    cv : int or cross-validation splitter, default 5
        Forwarded to `cross_validate`; the default keeps the original
        5-fold behaviour.

    Returns
    -------
    dict
        The `cross_validate` result, scored with negated MAE, negated MSE,
        negated RMSE and R^2.
    """
    scoring = ['neg_mean_absolute_error',
               'neg_mean_squared_error',
               'neg_root_mean_squared_error',
               'r2']
    return cross_validate(model,
                          features,
                          target,
                          cv = cv,
                          scoring = scoring)

In [None]:
# Jointly search the elastic-net penalty and L1/L2 mix with 5-fold CV.
enet_alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03,
               0.06, 0.1, 0.3, 0.6, 1, 3, 6]
enet_l1_ratios = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1]
elasticnetCV = ElasticNetCV(alphas = enet_alphas,
                            l1_ratio = enet_l1_ratios,
                            cv = 5,
                            max_iter = 5000)
elasticnetCV.fit(X, y)

In [None]:
# Penalty strength selected by the elastic-net cross-validation search.
elasticnetCV.alpha_

In [None]:
# L1/L2 mixing ratio selected by the elastic-net cross-validation search.
elasticnetCV.l1_ratio_

In [None]:
# Refit the elastic net with the hyper-parameters chosen by the CV search
# instead of values copied by hand; max_iter matches the search setting.
elasticNet = ElasticNet(alpha = elasticnetCV.alpha_,
                        l1_ratio = elasticnetCV.l1_ratio_,
                        max_iter = 5000)
elasticNet.fit(X, y)
# Map the predictions from log1p space back to prices.
y_hat_elastic = np.expm1(elasticNet.predict(X_test))
y_hat_elastic

In [None]:
# Cross-validate a linear-kernel SVR; show the mean negated RMSE over the folds.
svr = SVR(kernel = 'linear')
svr_scores = modelCV(features = X, target = y, model = svr)
np.mean(svr_scores['test_neg_root_mean_squared_error'])

In [None]:
# Cross-validate a degree-2 polynomial-kernel SVR; show the mean negated RMSE.
svr_poly = SVR(kernel = 'poly', degree = 2)
svr_poly_scores = modelCV(features = X, target = y, model = svr_poly)
np.mean(svr_poly_scores['test_neg_root_mean_squared_error'])

In [None]:
# Cross-validate an RBF-kernel SVR; show the mean negated RMSE over the folds.
svr_rbf = SVR(kernel = 'rbf')
svr_rbf_scores = modelCV(features = X, target = y, model = svr_rbf)
np.mean(svr_rbf_scores['test_neg_root_mean_squared_error'])

In [None]:
# Small fully-connected network that regresses the log1p-transformed price.
DNN = keras.Sequential([
    # Take the input width from the data instead of hard-coding the column count.
    layers.Dense(16, activation = 'relu', input_shape = (X.shape[1],)),
    layers.Dropout(0.2),
    layers.BatchNormalization(),
    layers.Dense(8, activation = 'relu'),
    layers.Dropout(0.2),
    layers.BatchNormalization(),
    # Linear output unit: a ReLU here clamps predictions at zero and can leave
    # the unit with no gradient, which is not appropriate for regression.
    layers.Dense(1)
])
# Stop when the validation loss has not improved by at least min_delta for
# `patience` epochs, then roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor = 'val_loss',
    min_delta = 0.001,
    patience = 5,
    restore_best_weights = True
)
DNN.compile(
    optimizer = 'adam',
    loss = 'mse'
)
DNN.fit(
    X, np.asarray(y),
    # EarlyStopping watches val_loss, which only exists when validation data
    # is supplied; without it the callback never fires.
    validation_split = 0.2,
    batch_size = 80,
    epochs = 200,
    callbacks=[early_stopping]
)

In [None]:
# The target was transformed with log1p, so expm1 (not exp) is the inverse
# that maps the network's predictions back to prices.
nn_y_hat = np.expm1(DNN.predict(X_test))
nn_y_hat

In [None]:
# Assemble the submission table: test Ids next to the linear-regression prices.
final_data = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_hat})
final_data.head()

In [None]:
# Write the submission table to submission.csv without the index column.
final_data.to_csv('submission.csv',index=False)