# Hi! :)
This notebook was created by a beginner in ML and was inspired by code shared here by more experienced programmers :)

**Ok, let's start!**

In [None]:
#Import all necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Statistics
from scipy.stats import norm
from scipy import stats


# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

#ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

In [None]:
# read dataset
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')


In [None]:
# Description file: print data_description.txt line by line, with the
# surrounding whitespace of every line stripped
description_path = '../input/house-prices-advanced-regression-techniques/data_description.txt'
with open(description_path, encoding='utf8') as description_file:
    for raw_line in description_file:
        print(raw_line.strip())

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.info()

In [None]:
# How many unique variables does each column contain?
pd.options.display.max_rows = 100
train.nunique().sort_values(ascending = False)

* As you see, the **target variable 'SalePrice' is in the last column**.
* In total there are **81 columns** (including the `Id` column and the target) and **1460 samples**.
* There are **3 types of variables**: *float64(3), int64(35), object(43)* in the dataset.


Now look at **missing values** in each column
* Let's work with **NaNs** in the dataset.
* We have to get rid of columns with **more than 40%** of missing values.
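* If a column contains less than 40% of missing values, let's fill the missing cells instead of dropping the column (the strategies used further below are a random value within the interquartile range for `LotFrontage`, 0 for `MasVnrArea`, and the most frequent value for categorical columns).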


In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(train.isnull(), cbar = False, cmap="gray")

In [None]:
# Count the NaNs per column once, keep only the columns that have any,
# and express the count as a percentage of the number of rows.
null_counts = train.isnull().sum()
missing_val = (
    null_counts[null_counts != 0]
    .sort_values(ascending=False)
    .to_frame(name='num_miss')
)
missing_val['missing_perc'] = (missing_val['num_miss'] / train.shape[0] * 100).round(1)
# Only columns with more than 40% missing values are kept in this table
missing_val = missing_val.query('missing_perc > 40')

In [None]:
missing_val

In [None]:
drop_cols = missing_val.index.to_list()
drop_cols

Ok, we should drop columns **['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']**

Besides, the column **ID** is meaningless for our model. We'll drop it too.

In [None]:
# Drop the row identifier together with the sparse columns collected in
# `drop_cols`. A single non-inplace drop with errors='ignore' keeps this cell
# safe to re-run: the original pair of inplace drops raised KeyError on a
# second execution because 'Id' was already gone.
train = train.drop(columns=['Id', *drop_cols], errors='ignore')

In [None]:
train.shape

* Next let's explore features more precisely. We need to divide the dataset according to the types of variables into two groups: **numerical features** and **categorical features**. We will explore them *separately*.

In [None]:
# Split the frame by dtype: numeric columns vs. object columns
num_cols = train.select_dtypes(include=['number'])
cat_cols = train.select_dtypes(include=['object'])

n_numerical = num_cols.shape[1]
n_categorical = cat_cols.shape[1]
print('The dataset contains {} numerical columns and {} categorical columns'.format(n_numerical, n_categorical))

In [None]:
num_cols.head()

In [None]:
num_cols.describe()

In [None]:
cat_cols.head()

In [None]:
cat_cols.describe()

# Correlation

For our model we will keep only those numerical variables whose correlation with the target variable is **greater than 0.35 in absolute value**.

In [None]:
# Correlation of every numerical feature with the target. The target's own
# entry is removed by label instead of by position ([:-1]), so the result does
# not depend on 'SalePrice' being the last numerical column.
num_corr_price = num_cols.corr()['SalePrice'].drop('SalePrice')

In [None]:
#correlation with the target variable
num_corr_price

In [None]:
# Features whose absolute correlation with SalePrice exceeds 0.35,
# ordered from the most positive correlation downwards
strong_mask = num_corr_price.abs() > 0.35
best_features = num_corr_price[strong_mask].sort_values(ascending=False)
print(f"There are {len(best_features)} strongly correlated numerical features with SalePrice:\n{best_features}")

In [None]:
# Remove the strongly correlated features from num_corr_price in one call;
# what remains are the weakly correlated features.
num_corr_price = num_corr_price.drop(best_features.index)

In [None]:
# Drop every feature still listed in num_corr_price from both frames
weak_features = list(num_corr_price.index)
train = train.drop(columns=weak_features)
num_cols = num_cols.drop(columns=weak_features)

In [None]:
train.shape

Besides, we should get rid of strongly inter-correlated variables because they may worsen the output of the model. For each pair of features whose correlation is > 0.60 (the threshold used in the code below), we will drop one of the two.

In [None]:
# Pairwise correlations between the remaining numerical columns
num_corr = num_cols.corr()
# Keep only the strict upper triangle (k=1) so every pair appears once.
# The built-in `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (AttributeError on current releases).
corr_triu = num_corr.where(np.triu(np.ones(num_corr.shape), k=1).astype(bool))

plt.figure(figsize=(10,10))
sns.heatmap(num_corr,annot=True, square=True, fmt='.2f',\
            annot_kws={'size':9}, mask = np.triu(corr_triu), cmap= "coolwarm")

In [None]:
corr_triu_collinear = corr_triu.iloc[:-1,:-1]

In [None]:
# A column is flagged when any entry above the diagonal exceeds 0.60
exceeds_threshold = (corr_triu_collinear > 0.60).any()
collinear_features = exceeds_threshold[exceeds_threshold].index.tolist()
train = train.drop(columns = collinear_features)
num_cols = num_cols.drop(columns = collinear_features)

In [None]:
train.shape

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(num_cols.corr(),annot=True, square=True, fmt='.2f',\
            annot_kws={'size':9}, mask = np.triu(num_cols.corr()), cmap= "coolwarm")

In [None]:
num_cols.isna().sum()

We should fill the gaps in "LotFrontage"

In [None]:
num_cols['LotFrontage'].hist(bins = 40)

In [None]:
num_cols['LotFrontage'].describe()

So, 
* 25%        59.000000
* 75%        80.000000

Therefore, we will fill the gaps with random int between 59 and 80

In [None]:
# Impute the missing LotFrontage values with random integers from [59, 80),
# the same bounds the original np.random.randint(59, 80) call used.
# Differences from the original cell:
#   * values are written back with .loc instead of a chained
#     `train['LotFrontage'].fillna(..., inplace=True)`, which does not modify
#     `train` when pandas Copy-on-Write is active;
#   * a locally seeded generator makes the imputation reproducible;
#   * each missing row gets its own draw instead of one shared constant.
lot_frontage_rng = np.random.RandomState(0)
lot_frontage_missing = train['LotFrontage'].isna()
train.loc[lot_frontage_missing, 'LotFrontage'] = lot_frontage_rng.randint(
    59, 80, size=int(lot_frontage_missing.sum()))
train['LotFrontage'].isna().sum()

In [None]:
#MasVnrArea: Masonry veneer area in square feet --> let's fill missing values with median (0)
num_cols.MasVnrArea.hist(bins = 50)

In [None]:
# Assign the filled column back explicitly: an inplace fillna on the
# attribute-accessed column works on a temporary object under pandas
# Copy-on-Write and would leave num_cols unchanged.
num_cols['MasVnrArea'] = num_cols['MasVnrArea'].fillna(0)

In [None]:
print('Number of features left in numerical features:',len(num_cols.columns))
print('Numerical Features left:')
print(num_cols.columns.values)

Outliers may influence the output of the model. We should remove them.

In [None]:
# Plot SalePrice against the numerical features, five features per grid.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(...) call that used to precede it only produced an extra empty
# figure on every iteration and has been removed.
for i in range(0, len(num_cols.columns), 5):
    sns.pairplot(data=num_cols, x_vars=num_cols.columns[i:i+5], y_vars=['SalePrice'])

In [None]:
# (column, number of largest rows to remove), applied sequentially so each
# step sees the frame left by the previous one
outlier_rules = [
    ('LotFrontage', 2),
    ('BsmtFinSF1', 1),
    ('MasVnrArea', 1),
    ('TotalBsmtSF', 1),
    ('GrLivArea', 2),
]
for column_name, n_rows in outlier_rules:
    extreme_rows = train[column_name].sort_values(ascending = False).index[:n_rows]
    train = train.drop(extreme_rows)

In [None]:
train.reset_index(drop=True,inplace=True)

In [None]:
train.shape

How is the target variable distributed?

In [None]:
# plt.title(f'Untransformed SalePrice, Skew: {stats.skew(train.SalePrice):.3f}')
# sns.distplot(train.SalePrice,fit=norm)
# plt.axvline(train.SalePrice.mode().to_numpy(), linestyle='--', color='green', label='mode')
# plt.axvline(train.SalePrice.median(), linestyle='--', color='blue', label='median')
# plt.axvline(train.SalePrice.mean(), linestyle='--', color='red', label='mean')
# plt.grid(alpha = 0.3)
# plt.legend()

In [None]:
# train.SalePrice.describe()

* The target variable SalePrice is **right-skewed**.
* The **mean is biased towards a higher price than the median**.
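* Therefore, we have to transform the target variable by using the **log function**. (Note: the transformation and plotting cells below are currently commented out, so the models in this notebook are trained on the untransformed `SalePrice`.)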

In [None]:
# train['SalePrice'] = np.log(train['SalePrice'])

In [None]:
# plt.title(f'Transformed SalePrice, Skew: {stats.skew(train.SalePrice):.3f}')
# sns.distplot(train.SalePrice,fit=norm)
# plt.axvline(train.SalePrice.mode().to_numpy(), linestyle='--', color='green', label='mode')
# plt.axvline(train.SalePrice.median(), linestyle='--', color='blue', label='median')
# plt.axvline(train.SalePrice.mean(), linestyle='--', color='red', label='mean')
# plt.grid(alpha = 0.3)
# plt.legend()

Much better!
Now, let's turn to categorical features.

In [None]:
cat_cols_missing = cat_cols.columns[cat_cols.isnull().any()]
cat_cols_missing

In [None]:
# Fill missing categorical values with the most frequent value of each column.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
for feature in cat_cols_missing:
    # fit_transform returns an (n, 1) array; flatten it before assigning it
    # to a single column
    cat_cols[feature] = imputer.fit_transform(cat_cols[feature].values.reshape(-1,1)).ravel()
    train[feature] = imputer.fit_transform(train[feature].values.reshape(-1,1)).ravel()

In [None]:
cat_cols.nunique().sort_values(ascending = False)

In [None]:
# Integer-encode every categorical column of both cat_cols and train.
# NOTE: LabelEncoder is already imported in the first code cell; this
# import is a repeat.
from sklearn.preprocessing import LabelEncoder
# A single encoder object is reused: each fit_transform call below re-fits it,
# so after the loop `le` only holds the classes of the last column it saw.
le = LabelEncoder()

for feature in cat_cols.columns:
    # cat_cols and train are encoded by two independent fits of `le`
    cat_cols[feature]=le.fit_transform(cat_cols[feature])
    train[feature]=le.fit_transform(train[feature])

In [None]:
plt.figure(figsize=(15,15))
sns.heatmap(cat_cols.corr(), square=True, mask = np.triu(cat_cols.corr()), cmap= "coolwarm")

In [None]:
# Correlations between the label-encoded categorical columns
cat_corr = cat_cols.corr()
# Strict upper triangle only. The built-in `bool` replaces the `np.bool`
# alias, which was removed in NumPy 1.24.
cat_corr_triu = cat_corr.where(np.triu(np.ones(cat_corr.shape), k=1).astype(bool))

# Flag a column when any entry above the diagonal exceeds 0.60
cat_collinear_features = [column for column in cat_corr_triu.columns if any(cat_corr_triu[column] > 0.60)]
train.drop(columns = cat_collinear_features,inplace=True)
cat_cols.drop(columns = cat_collinear_features,inplace=True)

In [None]:
train.head()

In [None]:
# DataFrame.replace returns a new frame; the original call discarded the
# result, so infinities were never converted. Assign it back to `train`.
train = train.replace([np.inf, -np.inf], np.nan)
train.isna().sum().sort_values(ascending = False)

In [None]:
train.MasVnrArea.describe()

In [None]:
# Assign the filled column back explicitly: an inplace fillna on the
# attribute-accessed column works on a temporary object under pandas
# Copy-on-Write, which would leave NaNs in `train` for the model fit below.
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

# Linear Regression

In [None]:
# Target and feature matrix. The target is removed by name rather than with
# iloc[:, :-1], so X is correct even if 'SalePrice' is not the last column.
y = train['SalePrice']
X = train.drop(columns='SalePrice')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
print("Training set score: {:.2f}".format(linreg.score(X_train, y_train))) 
print("Test set score: {:.2f}".format(linreg.score(X_test, y_test)))

In [None]:
# print the intercept
print(linreg.intercept_)

In [None]:
coeff_df = pd.DataFrame(linreg.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
pred = linreg.predict(X_test)

In [None]:
!pip install hvplot
import hvplot.pandas

pd.DataFrame({'True Values': y_test, 'Predicted Values': pred}).hvplot.scatter(x='True Values', y='Predicted Values')

In [None]:
pd.DataFrame({'Error Values': (y_test - pred)}).hvplot.kde()

In [None]:
from sklearn import metrics

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values to compare against `true`.

    Returns
    -------
    None
        The metrics are only written to standard output.
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above; reuse it instead of
    # calling mean_squared_error a second time.
    rmse = np.sqrt(mse)
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')

In [None]:
test_pred = linreg.predict(X_test)
train_pred = linreg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
#Robust Regression

from sklearn.linear_model import RANSACRegressor

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` (the old name was removed in scikit-learn 1.3),
# while the first positional slot is the wrapped estimator under both names.
# random_state fixes the random sample selection so the scores are reproducible.
model = RANSACRegressor(LinearRegression(), max_trials=100, random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
#Elastic Net

from sklearn.linear_model import ElasticNet

model = ElasticNet(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# Stochastic Gradient Descent

from sklearn.linear_model import SGDRegressor

# random_state fixes the data shuffling between epochs so that repeated runs
# of this cell report the same scores.
sgd_reg = SGDRegressor(n_iter_no_change=250, penalty=None, eta0=0.0001, max_iter=100000,
                       random_state=42)
sgd_reg.fit(X_train, y_train)

test_pred = sgd_reg.predict(X_test)
train_pred = sgd_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# Artificial Neural Network
# Builds and trains a fully connected Keras regression network on the same
# train/test split used by the models above.

# NOTE(review): Input and Activation are imported but not used in this cell.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

# Rebind the split data as NumPy arrays; every later cell that uses
# X_train / X_test / y_train / y_test receives these arrays.
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()

# First hidden layer: one unit per column of X_train
model.add(Dense(X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
# model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))

model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.2))

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.1))
# Single linear output unit for the regression target
model.add(Dense(1))

# Mean squared error loss, Adam optimizer constructed with 0.00001
model.compile(optimizer=Adam(0.00001), loss='mse')

# The hold-out split (X_test, y_test) doubles as validation data here;
# `r` keeps the training history that is plotted in the next cell.
r = model.fit(X_train, y_train,
              validation_data=(X_test,y_test),
              batch_size=1,
              epochs=100)

In [None]:
pd.DataFrame(r.history).hvplot.line(y=['loss', 'val_loss'])

In [None]:
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor

# random_state fixes the bootstrap samples and feature sub-sampling so that
# repeated runs of this cell report the same scores.
rf_reg = RandomForestRegressor(n_estimators=1000, random_state=42)
rf_reg.fit(X_train, y_train)

test_pred = rf_reg.predict(X_test)
train_pred = rf_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# Getting submissions

In [None]:
test.shape

In [None]:
train.shape

In [None]:
# Keep only the test columns that are also present in train
extra_columns = [name for name in test.columns if name not in train.columns]
test = test.drop(columns=extra_columns)

In [None]:
test.shape

In [None]:
test.isna().sum().sort_values(ascending = False)

In [None]:
num_cols_test = test.select_dtypes(include=['number'])
cat_cols_test = test.select_dtypes(include=['object'])

In [None]:
num_cols_test.isna().sum()

In [None]:
num_cols_test['LotFrontage'].describe()

In [None]:
# Impute the missing LotFrontage values of the test set with random integers
# from [58, 80), the same bounds the original np.random.randint(58, 80) call
# used. Differences from the original cell:
#   * values are written back with .loc instead of a chained
#     `test['LotFrontage'].fillna(..., inplace=True)`, which does not modify
#     `test` when pandas Copy-on-Write is active;
#   * a locally seeded generator makes the imputation reproducible;
#   * each missing row gets its own draw instead of one shared constant.
lot_frontage_test_rng = np.random.RandomState(0)
lot_frontage_test_missing = test['LotFrontage'].isna()
test.loc[lot_frontage_test_missing, 'LotFrontage'] = lot_frontage_test_rng.randint(
    58, 80, size=int(lot_frontage_test_missing.sum()))
test['LotFrontage'].isna().sum()

In [None]:
# Fill the remaining numerical gaps with each column's own median.
# The result is assigned back to the column: a chained
# `test[col].fillna(..., inplace=True)` does not modify `test` when pandas
# Copy-on-Write is active.
for column_name in ['MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF']:
    test[column_name] = test[column_name].fillna(test[column_name].median())

In [None]:
cat_cols_missing_test = cat_cols_test.columns[cat_cols_test.isnull().any()]

In [None]:
# Fill missing categorical values of the test set with the most frequent
# value of each column.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
for feature in cat_cols_missing_test:
    # fit_transform returns an (n, 1) array; flatten it before assigning it
    # to a single column
    cat_cols_test[feature] = imputer.fit_transform(cat_cols_test[feature].values.reshape(-1,1)).ravel()
    test[feature] = imputer.fit_transform(test[feature].values.reshape(-1,1)).ravel()

In [None]:
# Integer-encode every categorical column of the test set.
# NOTE(review): `le` is re-fit here on the test values (fit_transform), so the
# integer codes are derived from the categories present in the test column
# only. If a column's set of categories differs from the one seen in train,
# the same category may receive a different code than the model was trained
# on -- TODO confirm and, if so, reuse per-column encoders fitted on train.
for feature in cat_cols_test.columns:
    cat_cols_test[feature]=le.fit_transform(cat_cols_test[feature])
    test[feature]=le.fit_transform(test[feature])

In [None]:
test.isna().sum().sort_values()

In [None]:
pred_y = linreg.predict(test)

In [None]:
pred_y

In [None]:
sample = pd.DataFrame()

In [None]:
sample['Id'] = range(1461,2920)

In [None]:
sample['SalePrice'] = pred_y

In [None]:
sample

In [None]:
# index=False keeps the DataFrame's positional index out of the file, so the
# CSV contains exactly the two columns 'Id' and 'SalePrice'.
sample.to_csv('my_subs.csv', index=False)

# Please, if you have any questions or remarks, comment on this code! :) I'll be glad to answer your questions!