In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.feature_selection import chi2
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from category_encoders import TargetEncoder
import scipy.stats
import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
# Load the competition's training and test tables.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Column names, dtypes and non-null counts of the training table.
train_df.info()

In [None]:
# Summary statistics for the columns that are neither int nor float.
train_df.describe(exclude = ['int', 'float'])

In [None]:
# Summary statistics for every column whose dtype is not object.
train_df.describe(exclude = ['object'])

In [None]:
# Share of missing values per column, with a red guide line at the 50% mark.
fig, ax = plt.subplots(figsize=(20, 6))
missings = train_df.isnull().sum() / len(train_df)
missings.plot.bar(ax=ax)
ax.axhline(0.5, color='r')
plt.show()

<p>More than 80% of the values are missing in Alley, PoolQC, Fence and MiscFeature.</p>

<ul>
    <li>Alley has the categories Grvl, Pave and NA (no alley access), so all of its missing values can be treated as NA.</li>
    <li>PoolQC also has an NA category, so all of its missing values are NA.</li>
    <li>Fence: all of the missing values are NA as well.</li>
    <li>MiscFeature: all of the missing values are likewise NA.</li>
</ul>

<p>The other columns with missing values are each missing less than 50% of their entries:</p>
<ul>
    <li> Lot Frontage (numerical Column)</li>
    <li> MasVnrType (Categorical Column) replace missings with (None)</li>
    <li> MasVnrArea (numerical Column)</li>
    <li> BsmtQual (categorical Column) NA</li>
    <li> BsmtCond (categorical Column) NA</li>
    <li> BsmtExposure (categorical Column) NA</li>
    <li> BsmtFinType1 (categorical Column) NA</li>
    <li> BsmtFinType2 (categorical Column) NA</li>
    <li> FireplaceQu (categorical Column) NA</li>
    <li> GarageType (categorical Column) NA</li>
    <li> GarageFinish (categorical Column) NA</li>
    <li> GarageQual (categorical Column) NA</li>
    <li> GarageCond (categorical Column) NA</li>
</ul>

In [None]:
# Partition the columns by dtype: object columns are treated as categorical and
# everything else as numerical, so each group can be explored against the target.
column_dtypes = train_df.dtypes
numerical_features = [name for name, kind in column_dtypes.items() if kind != 'object']
categorical_features = [name for name, kind in column_dtypes.items() if kind == 'object']

In [None]:
# let's check the normality of SalesPrice (shapiro-wilk test)
from scipy.stats import shapiro
def check_normality(data):
    """Run a Shapiro-Wilk normality test on ``data`` and print the verdict.

    Missing values are dropped before testing: passing NaNs to ``shapiro``
    produces a NaN statistic, which makes the printed verdict meaningless.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sample (e.g. a DataFrame column).

    Returns
    -------
    None
        The statistic, the p-value and the verdict (at the 0.05 level) are
        printed rather than returned.
    """
    sample = pd.Series(data).dropna()
    if len(sample) < 3:
        # shapiro() requires at least three observations.
        print("Not enough non-missing observations for a Shapiro-Wilk test.")
        return
    stat, p = shapiro(sample)
    print("stat = %.2f, P-Value = %.2f" % (stat, p))
    if p > 0.05:
        print("Normal Distribution")
    else:
        print("Not Normal.")
# Test the untransformed target for normality.
check_normality(train_df["SalePrice"])

In [None]:
# SalePrice does not follow a normal distribution, so plot it to see its shape.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; confirm the
# installed version before swapping in histplot/displot.
sns.distplot(train_df['SalePrice'])
plt.show()

In [None]:
# SalePrice is clearly positively skewed,
# so apply a log1p transformation and look at the distribution again.
sns.distplot(np.log1p(train_df['SalePrice']))
plt.show()

In [None]:
# Repeat the Shapiro-Wilk check for every numerical column.
for feature in numerical_features:
    print(f"shapiro-wilk test for {feature}")
    check_normality(train_df[feature])
    print("=============================")

In [None]:
# Annotated correlation heatmap of the numerical columns.
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(train_df[numerical_features].corr(), annot=True, ax=ax)
plt.show()

In [None]:
# Correlation of every numerical feature with the target, strongest first.
corr_matrix = train_df[numerical_features].corr()
target_corr = corr_matrix['SalePrice'].sort_values(ascending=False)
target_corr

In [None]:
# Remove the identifier and some correlated numerical features
# (Id, GarageYrBlt, GarageArea, 1stFlrSF) from both tables.
redundant_columns = ['Id', 'GarageYrBlt', 'GarageArea', '1stFlrSF']
for frame in (train_df, test_df):
    frame.drop(columns=redundant_columns, inplace=True)

In [None]:
# Box plot of SalePrice for every level of every categorical feature,
# one facet per feature.
cat_target = pd.melt(train_df, id_vars = 'SalePrice', value_vars = categorical_features)
# `height` is the supported name for the facet size; the older `size` keyword
# was renamed and is rejected by current seaborn releases.
g = sns.FacetGrid(cat_target, col='variable',  col_wrap=2, sharex=False, sharey=False, height=5, palette = 'tab10')
g = g.map(sns.boxplot, 'value', 'SalePrice')

<p>The distribution of SalePrice varies across the categories of most categorical features. The exception is Utilities, which has almost no variability (1459 AllPub, 1 NoSeWa).</p>

In [None]:
# Bar chart of how often each Utilities category occurs in the test table.
utilities_counts = test_df['Utilities'].value_counts()
utilities_counts.plot.bar()
plt.show()

<p>The test data also contains only a single Utilities category, so we can drop this column.</p>

In [None]:
# Remove the Utilities column from both tables.
for frame in (train_df, test_df):
    frame.drop(columns='Utilities', inplace=True)

In [None]:
# Fill missing values in the categorical columns with the "NoNe" placeholder.
# Utilities has been dropped from both tables, so it is excluded from the list.
# Filtering (instead of list.remove) keeps this cell safe to re-run:
# list.remove raises ValueError once the entry is already gone.
categorical_features = [col for col in categorical_features if col != 'Utilities']
train_df[categorical_features] = train_df[categorical_features].fillna("NoNe")
test_df[categorical_features] = test_df[categorical_features].fillna("NoNe")

In [None]:
# Impute missing numerical values: use the median when the column has values in
# its tails (below the 5th or above the 95th percentile), and the mean when all
# values lie inside that range.
# Applied to LotFrontage and MasVnrArea.
def outlier_detector(data, col_name):
    """Tell whether ``data[col_name]`` has values outside its 5th-95th percentile range.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the column.
    col_name : str
        Name of the numerical column to inspect.

    Returns
    -------
    bool
        True if at least one value is strictly below the 5th percentile or
        strictly above the 95th percentile, False otherwise.
    """
    column = data[col_name]
    lower_lim = column.quantile(.05)
    upper_lim = column.quantile(.95)
    # The two tail conditions must be OR-ed: no value can be below the lower
    # limit and above the upper limit at once, so AND-ing them never matches.
    return bool(((column < lower_lim) | (column > upper_lim)).any())


def handle_numerical(data, col_name):
    """Fill the missing values of ``data[col_name]`` in place.

    The median is used when ``outlier_detector`` reports tail values (the
    median is not dragged by them); otherwise the mean is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; the column is overwritten with its filled version.
    col_name : str
        Name of the numerical column to impute.

    Returns
    -------
    None
    """
    column = data[col_name]
    if outlier_detector(data, col_name):
        fill_value = column.median()
    else:
        fill_value = column.mean()
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column, so the update reliably lands in ``data``.
    data[col_name] = column.fillna(fill_value)
# Impute LotFrontage and MasVnrArea, first in the training table, then in the test table.
for frame in (train_df, test_df):
    for column in ('LotFrontage', 'MasVnrArea'):
        handle_numerical(frame, column)

In [None]:
# Generate new features from the existing ones, identically for both tables.
# The features listed here are the uncorrelated ones that additionally receive
# polynomial variants: squared (^2), cubed (^3) and square root (^1/2).
poly_sources = ['PoolArea', 'MoSold', '3SsnPorch', 'BsmtFinSF2',
                'BsmtHalfBath', 'MiscVal', 'LowQualFinSF']
for frame in (train_df, test_df):
    # Above-ground baths, bedrooms and kitchens per unit of living area.
    room_count = frame['FullBath'] + frame['BedroomAbvGr'] + frame['KitchenAbvGr'] + frame['HalfBath']
    frame['AboveGr'] = room_count / frame['GrLivArea']
    # Half baths count as 0.5 of a bathroom.
    frame["TotalBath"] = frame["BsmtFullBath"] + (0.5 * frame["BsmtHalfBath"]) + frame["FullBath"] + (0.5 * frame["HalfBath"])
    frame["AllSF"] = frame["GrLivArea"] + frame["TotalBsmtSF"]
    for source in poly_sources:
        frame[f'{source}^2'] = frame[source]**2
        frame[f'{source}^3'] = frame[source]**3
        frame[f'{source}^1/2'] = np.sqrt(frame[source])

In [None]:
def house_remodel(df):
    """Flag each house as remodeled or not.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``YearRemodAdd`` and ``YearBuilt`` columns.

    Returns
    -------
    list of str
        One entry per row: 'Yes' when YearRemodAdd is later than YearBuilt,
        'No' otherwise.
    """
    years_until_remodel = df['YearRemodAdd'] - df['YearBuilt']
    return ['Yes' if gap > 0 else 'No' for gap in years_until_remodel]
# Add the Yes/No remodel flag to both tables.
for frame in (train_df, test_df):
    frame['HouseRemodeled'] = house_remodel(frame)

In [None]:
# Recompute the column groups from the current training table (the target is
# excluded from the numerical list), then collect the numerical features whose
# absolute skewness exceeds 0.5 as candidates for a log transformation.
column_dtypes = train_df.dtypes
numerical_features = [name for name, kind in column_dtypes.items() if kind != 'object']
numerical_features.remove('SalePrice')
categorical_features = [name for name, kind in column_dtypes.items() if kind == 'object']
skewed_features = [name for name in numerical_features if abs(train_df[name].skew()) > 0.5]
print(len(skewed_features))

In [None]:
# Impute missing values in every numerical column of the test table.
for feature in numerical_features:
    handle_numerical(test_df, feature)

In [None]:
# Separate predictors and target, then apply log1p to the target and to the
# skewed numerical features of both tables.
X = train_df.drop(columns='SalePrice')
y = np.log1p(train_df['SalePrice'])
X[skewed_features] = np.log1p(X[skewed_features])
test_df[skewed_features] = np.log1p(test_df[skewed_features])

In [None]:
# Standardise the numerical features. The scaler is fitted on the train and test
# rows stacked together and then applied to each table. The scaled frames keep
# the original row index; their columns get default positional labels.
all_num_data = pd.concat([X[numerical_features], test_df[numerical_features]])
ss = StandardScaler().fit(all_num_data)
normalized_X = pd.DataFrame(ss.transform(X[numerical_features]), index=X.index)
normalized_test = pd.DataFrame(ss.transform(test_df[numerical_features]), index=test_df.index)

In [None]:
# "Good" categorical columns hold exactly the same set of values in train and
# test; every other categorical column is "bad".
good_label_cols = [name for name in categorical_features
                   if set(X[name]) == set(test_df[name])]
bad_label_cols = list(set(categorical_features) - set(good_label_cols))
print('good label cols \n', good_label_cols)
print('bad label cols \n', bad_label_cols)

In [None]:
# Slice both tables into their "good" and "bad" categorical column groups.
good_cat_x, bad_cat_x = X[good_label_cols], X[bad_label_cols]
good_cat_test, bad_cat_test = test_df[good_label_cols], test_df[bad_label_cols]

In [None]:
# Stack the categorical columns of train and test so an encoder can see every category.
all_cat_data = pd.concat([frame[categorical_features] for frame in (X, test_df)])

In [None]:
# One-hot encode the categorical features with an encoder fitted on the stacked
# train + test categories; the encoded frames reuse the row index of the tables
# they were derived from.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False).fit(all_cat_data)
oh_cat_x = pd.DataFrame(ohe.transform(train_df[categorical_features]), index=bad_cat_x.index)
oh_cat_test = pd.DataFrame(ohe.transform(test_df[categorical_features]), index=bad_cat_test.index)

In [None]:
# Final design matrices: scaled numerical block followed by the one-hot block.
X = pd.concat([normalized_X, oh_cat_x], axis='columns')
test_df = pd.concat([normalized_test, oh_cat_test], axis='columns')

In [None]:
# Candidate regressors and the hyper-parameter grid searched for each of them.
# The three regularised linear models share the same alpha grid.
alpha_grid = np.arange(0.01, 1, 0.02)
model = [
    dict(name='linear regression', estimator=LinearRegression(), hyperparameters={}),
    dict(name='ridge regression', estimator=Ridge(), hyperparameters={'alpha': alpha_grid}),
    dict(name='lasso regression', estimator=Lasso(), hyperparameters={'alpha': alpha_grid}),
    dict(
        name='ElasticNet',
        estimator=ElasticNet(),
        hyperparameters={
            'alpha': alpha_grid,
            'l1_ratio': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
        },
    ),
    dict(
        name='decision Tree',
        estimator=DecisionTreeRegressor(),
        hyperparameters={
            'max_depth': [2, 3, 4, 5, 6, 7],
            'criterion': ['mse', 'friedman_mse', 'mae'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt', 'log2'],
        },
    ),
]

# 5-fold grid search per candidate, scored by negative RMSE; report the winner of each.
for candidate in model:
    print(candidate['name'])
    gs = GridSearchCV(
        candidate['estimator'],
        param_grid=candidate['hyperparameters'],
        cv=5,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    gs.fit(X.values, y.values)
    print('best score: ', gs.best_score_)
    print('best parameters ; ', gs.best_params_)
    print('best model: ', gs.best_estimator_)
    print('---------------------------------\n')

In [None]:
# Fit an ElasticNet with fixed alpha=0.01 and l1_ratio=0.1 on the full training data.
final_model = ElasticNet(alpha=0.01, l1_ratio=0.1)
final_model.fit(X, y)

In [None]:
# Fit a plain linear regression on the full training data.
final_model2 = LinearRegression()
final_model2.fit(X, y)

In [None]:
# Blend an ElasticNet with an XGBoost regressor and estimate the blend's
# negative RMSE with 5-fold cross-validation.
elastic_net_member = ElasticNet(alpha=0.01, l1_ratio=0.1)
xgb_member = XGBRegressor(n_estimators=1000, learning_rate=0.05)
final_model3 = VotingRegressor([('en', elastic_net_member), ('xgb', xgb_member)])
score = cross_validate(
    final_model3,
    X.values,
    y.values,
    cv=5,
    scoring=['neg_root_mean_squared_error'],
)
score['test_neg_root_mean_squared_error'].mean()

In [None]:
# Refit the blended model on all training rows and predict the test set.
final_model3.fit(X.values, y.values)
# Predict from the raw array, mirroring fit(): the model was trained on
# X.values, and test_df carries default integer column labels that repeat after
# the column-wise concat, which estimators that inspect column names may reject.
# expm1 undoes the log1p applied to the target.
y_hat = np.expm1(final_model3.predict(test_df.values))
y_hat

In [None]:
# Build the submission table. The Id column was dropped from test_df earlier,
# so the ids are regenerated here.
# NOTE(review): assumes the test rows are still in file order with consecutive
# ids starting at 1461 — confirm against test.csv.
submission=pd.DataFrame({'Id':range(1461, 1461+len(test_df)),'SalePrice':y_hat})

In [None]:
# Write the predictions to submission.csv without the DataFrame index.
submission.to_csv('submission.csv',index=False)