# Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Importing the training dataset to do the EDA

In [None]:
# Read the House Prices training CSV into a DataFrame for the EDA below.
train_csv_path = "../input/house-prices-advanced-regression-techniques/train.csv"
train_dataset = pd.read_csv(train_csv_path)

## Look at the training dataset

In [None]:
# Display the training frame to get a first look at its rows and columns.
train_dataset

### Now that you have a first impression, it's better to look at what the data contains overall

In [None]:
# Print a summary of the training frame: column names, dtypes and non-null counts.
train_dataset.info()

- There are 1460 entries in total, i.e. 1460 rows, and 81 columns.
- We can also see that some columns are integers, some are floats, and some are objects, i.e. categorical values.

#### The objective of the assignment is to find the SalePrice for a dataset.
- It's better to check the SalePrice column first and get some more insight, like what the minimum and maximum prices are.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target column.
train_dataset["SalePrice"].describe()

In [None]:
# Histogram of the target variable.
# `sns.distplot` is deprecated in seaborn, so use its replacement `sns.histplot`
# and draw on the explicitly created Axes instead of relying on pyplot state.
f, ax = plt.subplots(figsize=(16, 16))
sns.histplot(train_dataset["SalePrice"], kde=False, ax=ax)
# Title and axis labels so the figure can be read on its own.
ax.set(title="Distribution of SalePrice", xlabel="SalePrice", ylabel="Count")
plt.show()

# Now lets look at the correlation between different features of the dataset

In [None]:
# Pairwise correlations between the numeric features.
# The frame still contains object (string) columns at this point, and
# DataFrame.corr() raises on those in pandas >= 2.0, so select the
# numeric/bool columns explicitly. This works on old and new pandas alike.
corrmat = train_dataset.select_dtypes(include=["number", "bool"]).corr()
f, ax = plt.subplots(figsize=(16, 16))
# Cap the colour scale at 0.8 so moderately strong correlations stay visible.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
plt.show()

#### It's hard to pick out the features that are most correlated with SalePrice from the full heatmap

# Let's find the top 10 features most correlated with SalePrice and plot a heatmap for them

In [None]:
# Annotated heatmap of the ten columns with the largest correlation to
# SalePrice (SalePrice itself is among them, since it correlates 1.0 with itself).
plt.figure(figsize=(16, 16))
columns = corrmat["SalePrice"].nlargest(10).index
# np.corrcoef expects one variable per row, hence the transpose.
correlation_matrix = np.corrcoef(train_dataset[columns].to_numpy().T)
sns.set(font_scale=1.25)
tick_labels = columns.to_numpy()
heat_map = sns.heatmap(
    correlation_matrix,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

# Taking care of missing data
### Now since we got the most features, lets try to look deeper into the data and find other features that may affect the SalePrice
#### There may also be some features that may not affect the SalePrice due to less data or some other data discrepancy

### - The best way to find out is taking care of the missing data

In [None]:
# Taking care of missing data

# Number of missing values per column, columns with the most gaps first
total = train_dataset.isnull().sum().sort_values(ascending=False)

# Wrap the counts in a one-column DataFrame whose column is named "Total"
missing_data = total.to_frame(name="Total")

Lets view the columns that have the most missing data

In [None]:
# Show the 30 columns with the highest missing-value counts.
missing_data.head(30)

## There are three options to deal with the missing data
- Delete the columns
- Fill the missing data with mean or mode
- Delete only the row

Since columns like `PoolQC`, `MiscFeature`, `Alley` etc. have a lot of missing data, we can drop them.
I personally chose to delete every column that has more than one missing value.
For the `Electrical` feature, let's drop the affected row instead.

In [None]:
# Drop every column that has more than one missing value in the training data.
# `drop(labels, 1)` relied on the positional `axis` argument, which pandas 2.0
# removed; `columns=` is the explicit, version-independent spelling.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
columns_to_drop = missing_data[missing_data["Total"] > 1].index
train_dataset = train_dataset.drop(columns=columns_to_drop, errors="ignore")

# Drop the row(s) where `Electrical` is still missing
train_dataset = train_dataset.dropna(subset=["Electrical"])

### Check if there is any missing data remaining

In [None]:
# Largest per-column count of missing values; 0 means no missing data is left.
train_dataset.isna().sum().max()

### Look at the shape of the training data after removing the columns

In [None]:
# (rows, columns) of the training frame after the drops above.
train_dataset.shape

The number of columns has been reduced from 81 to 63.

### Taking care of the categorical data
#### Since we cannot measure categorical variables it makes sense to convert it into numbers to keep track or measure them
# Encoding the categorical variables

In [None]:
# Encoding the categorical variables with one hot encoding

# Names of all object-dtype (categorical) columns left in the training data
categories = list(train_dataset.select_dtypes(include=["object"]))

# Applying one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns and pass every other column through unchanged.
# handle_unknown='ignore': the default ('error') makes `ct.transform` raise as soon
# as the test data contains a category value that was not seen while fitting on the
# training data; with 'ignore' such a value is encoded as all zeros instead.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), categories)],
    remainder='passthrough',
)

In [None]:
# Feature matrix: everything except the row identifier and the target.
X = train_dataset.drop(columns=['Id', 'SalePrice'])

In [None]:
# Display the (not yet encoded) training features.
X

### It should be noted that test dataset and the train dataset should have same number of columns to transform the test data 

In [None]:
# Compare the training feature shape with the test frame after removing
# the same columns that were dropped from the training data.
print(X.shape)
test_dataset = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# `columns=` replaces the positional `axis` argument (`drop(labels, 1)`),
# which pandas 2.0 no longer accepts.
test_dataset = test_dataset.drop(columns=missing_data[missing_data["Total"] > 1].index)
print(test_dataset.shape)

In [None]:
# Fit the column transformer on the training features and replace X with the
# encoded result.
# NOTE(review): X is overwritten here, so this cell is presumably not safe to
# re-run on its own -- rebuild X from train_dataset first.
X = ct.fit_transform(X)

In [None]:
# Shape of the encoded training features.
X.shape

In [None]:
# Test features: the test frame without the row identifier.
X_test = test_dataset.drop(columns=["Id"])

In [None]:
# Column dtypes and non-null counts of the test frame (reveals its missing values).
test_dataset.info()

### Taking care of missing data in the test dataset 
### - If we removed columns based on the missing data inside the test dataset, the shape of the dataset could change, and we would no longer be able to apply the encoding to it
#### To tackle this situation we will fill the missing data with mean and mode
- Mean will be for integer or float datatypes and Mode will be for categorical variables

In [None]:
# Fill the gaps in the test features: column mean for numeric columns,
# most frequent value (mode) for categorical columns.
# Only visit columns that actually contain missing values; the original
# `X_test.isna().columns` built a full boolean frame just to list every column.
missing_per_column = X_test.isna().sum()
for i in missing_per_column[missing_per_column > 0].index:
    # is_numeric_dtype is more robust than comparing the dtype to "object":
    # it stays correct for string/category dtypes as well.
    if pd.api.types.is_numeric_dtype(X_test[i]):
        X_test[i] = X_test[i].fillna(X_test[i].mean())
    else:
        modes = X_test[i].mode()
        # mode() is empty when the whole column is missing; indexing it would
        # raise, so leave such a column untouched.
        if not modes.empty:
            X_test[i] = X_test[i].fillna(modes.iloc[0])
X_test.shape


### Check if there is any missing value inside X_test

In [None]:
# Largest per-column count of missing values in X_test; 0 means none are left.
X_test.isna().sum().max()

- Now since there is no missing value we can transform the data according to the encoding that we applied for the training dataset

In [None]:
# Encode the test features with the transformer that was fitted on the
# training features (transform only, no re-fit).
X_test = ct.transform(X_test)

In [None]:
# Shape of the encoded test features.
X_test.shape

Note that X and X_test have the same number of columns. It may happen that the test dataset does not contain all the categorical values that appear in the train dataset. In such situations, concatenate the training and test data before applying the encoding so that the number of columns stays the same.

In [None]:
# Target (dependent variable) from the training data: the sale price we have to predict
y = train_dataset["SalePrice"]

### Split the training data into training and validating dataset to train the model. Later depending on the score we can apply same model on the test dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed makes
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

# Training the Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, fitted on the training split only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predictions for the held-out validation split (not the rows the model was fitted on)
y_pred = regressor.predict(X_val)

In [None]:
# Score of the linear model on the validation split
# (for scikit-learn regressors, `score` is the R^2 coefficient of determination)
regressor.score(X_val, y_val)


In [None]:
# Mean squared error and RMSE between the logarithms of the actual and the
# predicted prices on the validation split.
from sklearn.metrics import mean_squared_error

# Use the natural log (the original used log2, which inflates the error by a
# factor of 1/ln(2) compared with the competition's log-RMSE scores quoted in
# this notebook).
log_y_val = np.log(y_val)
# A linear model can predict zero or negative prices, whose log is undefined
# (NaN/-inf) and would make mean_squared_error raise; clip to a minimum of 1.
log_y_pred = np.log(np.clip(y_pred, 1, None))

mse_log = mean_squared_error(log_y_val, log_y_pred)
print(f"Mean square error: {mse_log}")
# Take the square root explicitly: the `squared=False` argument is deprecated
# and has been removed from recent scikit-learn releases.
print(f"Root mean square error: {np.sqrt(mse_log)}")

# Applying the same model on test dataset and predicting the values

In [None]:
# Predict sale prices for the encoded test features with the linear model.
test_preds = regressor.predict(X_test)

In [None]:
# Number of predictions produced for the test set.
test_preds.shape

In [None]:
# Number of Ids in the test frame; compare it with the shape of test_preds above.
test_dataset.Id.shape

- Notice that the number of rows in the test dataset is the same before and after taking care of its missing data.

# Saving the dataset into csv file for submission

In [None]:
# output = pd.DataFrame({'Id': test_dataset.Id,
#                       'SalePrice': test_preds})
# output.to_csv('submission.csv', index=False)

##### - These are basic predictions of house prices using a basic regression technique. Decision Tree, Random Forest, Ridge, Lasso and Elastic Net are yet to be tried.

#### Linear Regression Scored 0.45

# Training the DecisionTreeRegressor model on the training split

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited to depth 10, fitted on the training split;
# the fixed seed keeps the result reproducible.
dregressor = DecisionTreeRegressor(
    max_depth=10,
    random_state=142,
)
dregressor.fit(X=X_train, y=y_train)

In [None]:
# Decision-tree predictions for the encoded test features.
y_preds = dregressor.predict(X_test)

In [None]:
# Score of the decision tree on the held-out validation split.
dregressor.score(X_val, y_val)

In [None]:
# output = pd.DataFrame({'Id': test_dataset.Id,
#                       'SalePrice': y_preds})
# output.to_csv('submission.csv', index=False)

- Previously, with max_depth set to 2, the score changed from 0.45 to 0.28
- With max_depth changed to 10, the score changed again, from 0.28 to 0.20037

# Ensemble Methods

# Applying Random Forest Regression on dataset

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees (depth <= 15), fitted on the full encoded training data
# (X, y) rather than on the train/validation split.
rf_regressor = RandomForestRegressor(
    max_depth=15,
    n_estimators=100,
    random_state=42,
)
rf_regressor.fit(X=X, y=y)

In [None]:
# Random-forest predictions for the encoded test features (overwrites y_preds).
y_preds = rf_regressor.predict(X_test)

- The RandomForestRegressor score was 0.153; when n_estimators was changed, the score changed to 0.15065
- n_estimators = 100 and max_depth=15 changed the score to 0.14542

## Applying Boosting Techniques like Gradient Boosting and XGBoost

# Gradient Boosting
- Applying it from hands on machine learning book

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees fitted on the full encoded training data (X, y).
# random_state is set, as for the other tree models in this notebook, so that
# repeated runs give the same model and the same predictions.
gbrt = GradientBoostingRegressor(
    max_depth=15,
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)
gbrt.fit(X, y)

In [None]:
# Gradient-boosting predictions for the encoded test features (overwrites y_preds).
y_preds = gbrt.predict(X_test)

- This is not better than random forest regressor

# Applying XGBoost

In [None]:
import xgboost

# Training the model on dataset

In [None]:
# XGBoost regressor with its default hyper-parameters, fitted on the full
# encoded training data (X, y).
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X=X, y=y)

In [None]:
# XGBoost predictions for the encoded test features (overwrites y_preds).
y_preds = xgb_reg.predict(X_test)

- The xgboost gives better score than any other model above.

# Applying regularized Linear models
- Ridge
- Lasso
- ElasticNet

The idea behind these is to constrain the weights of the model.

# Ridge Regression

### Making a linear ridge regressor model

In [None]:
# from sklearn.linear_model import Ridge
# ridge_reg = Ridge(alpha=1, solver="auto")

### Training the data 

In [None]:
# ridge_reg.fit(X, y)
# ridge_reg.score(X_val, y_val)

### Predicting the data

In [None]:
# y_preds = ridge_reg.predict(X_test)

# Lasso Regression
- Benefit is that it will try to completely eliminate the least important features

In [None]:
# from sklearn.linear_model import Lasso
# # alpha_values = np.arange(0, 1, 0.1).tolist()

### Finding the best alpha values

In [None]:
# Apply when you want to find the best max_score
# max_score = 0
# best_alpha_value = 0
# for i in alpha_values:
#     lasso_reg = Lasso(alpha=i)
#     lasso_reg.fit(X_train, y_train)
#     current_score = lasso_reg.score(X_val, y_val)
#     print(f"Score and alpha value: {current_score} ---- {i}")
#     if current_score > max_score:
#         max_score = current_score
#         best_alpha_value = i
# print(max_score, best_alpha_value)

### Training the model

In [None]:
# from sklearn.model_selection import GridSearchCV
# alpha_space = np.linspace(0, 1, 50)
# params_grid = {'alpha':alpha_space}
# lasso = Lasso()

# lasso_cv = GridSearchCV(lasso, params_grid, cv=10, scoring = 'neg_root_mean_squared_error')
# lasso_cv.fit(X_train,y_train)
# lasso_cv.score(X_train,y_train)

In [None]:
# y_preds = lasso_cv.predict(X_test)

# ElasticNet Regressor

In [None]:
# Uncomment when you want to apply elastic net strategy
# from sklearn.linear_model import  ElasticNet
# elastic_net_regressor = ElasticNet(alpha=0.1, l1_ratio=0.5)
# elastic_net_regressor.fit(X_train, y_train)
# y_preds = elastic_net_regressor.predict(X_test)

In [None]:
# Build the submission file: one row per test Id with its predicted SalePrice.
# `y_preds` holds the predictions of whichever model cell assigned it last
# (the XGBoost cell when the notebook is run top to bottom, since the later
# model cells are commented out).
submission_columns = {
    'Id': test_dataset["Id"],
    'SalePrice': y_preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

- So far, the best approach has been XGBoost.
- We applied almost all of the regressor models.
#### - It is worthy to note that to improve the accuracy of our model we can approach the problem a little differently while doing Data Preprocessing and feature engineering and then again apply the same model and see which one is better. 
#### Another approach can be applying the feature engineering and then training the data on a blend of model
