In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the House Prices training CSV into a DataFrame.
data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
# Preview only the first rows; displaying the whole frame bloats the notebook output.
data.head()

In [None]:
# Per-column histograms (50 bins each) drawn on a 20x20 figure.
data.hist(bins=50,figsize=(20,20))
plt.show()

In [None]:
# Distribution of SalePrice, the column later used as the prediction target.
data.SalePrice.hist()
plt.show()

In [None]:
# Column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a local test set; random_state=10 fixes the shuffle.
train_set,test_set = train_test_split(data,test_size=0.2,random_state=10)
len(test_set)  # number of held-out rows

In [None]:
len(train_set)  # number of rows left for training

In [None]:
# Alias, not a copy: `train` and `train_set` refer to the same DataFrame.
train = train_set

In [None]:
# Pairwise correlations between the numeric columns only. Selecting the numeric
# dtypes explicitly keeps this working on pandas >= 2.0, where DataFrame.corr()
# no longer silently drops string columns and raises instead.
corr_matrix = train.select_dtypes(include="number").corr()

In [None]:
# Correlation of every column in corr_matrix with SalePrice, strongest positive first.
(corr_matrix.SalePrice).sort_values(ascending=False)

In [None]:
# Positions 1..10 of the descending ranking; position 0 is skipped because it is
# expected to be SalePrice's correlation with itself.
corr_matrix["SalePrice"].sort_values(ascending=False).iloc[1:11]

In [None]:
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

# Splitting off the target

In [None]:
# Separate the label from the features: `target` is the SalePrice column,
# `housing` is every remaining column of the training split.
target = train["SalePrice"]
housing = train.drop(columns="SalePrice")

In [None]:
# Display the label series.
target

# Data cleaning

In [None]:
# Missing-value count per feature, most incomplete first.
housing.isna().sum().sort_values(ascending=False)

In [None]:
# Rows of `housing` that have a missing value in at least one column.
has_missing = housing.isna().any(axis="columns")
housing_incomplete_rows = housing.loc[has_missing]

In [None]:
# Preview the first incomplete rows.
housing_incomplete_rows.head()

In [None]:
housing_incomplete_rows.isna().sum().sort_values(ascending=False) # isna() and isnull() give the same result

In [None]:
from sklearn.impute import SimpleImputer  # lets sklearn fill in the median instead of imputing column by column
# NOTE(review): this standalone imputer is not referenced again in the visible
# notebook; the numeric pipeline further down constructs its own SimpleImputer.
imputer = SimpleImputer(strategy='median')


# Splitting the dataframe into two dataframes (categorical and numerical)

In [None]:
# Names of the numeric feature columns. The dtype test is made against `housing`
# itself (the frame whose columns are iterated), matching the string-column
# selection in the cell below.
temp = [col for col in housing.columns if is_numeric_dtype(housing[col])]

In [None]:
# Numeric features only. Take an explicit copy so that later column drops act on
# an independent frame and do not trigger pandas' SettingWithCopyWarning.
housing_num = housing[temp].copy()

In [None]:
# Names of the feature columns that pandas reports as string dtype.
temp2 = [name for name in housing.columns if is_string_dtype(housing[name])]

In [None]:
# Keep only the string-typed feature columns.
housing_string = housing[temp2]

In [None]:
housing_string.shape  # (rows, number of string columns)

In [None]:
housing_num.shape  # (rows, number of numeric columns)

In [None]:
housing.shape # sanity check that no column was lost in the split: 43 + 37 = 80

In [None]:
# Convert every string column to pandas' categorical dtype; the frame-level
# astype performs the conversion column by column.
housing_cat = housing_string.astype('category')

In [None]:
# Histograms of the numeric features (50 bins each) on a 20x20 figure.
housing_num.hist(bins=50,figsize=(20,20))
plt.show()

# Checking Correlation 

In [None]:
# Correlation of each numeric feature with the target, strongest positive first.
housing_num.corrwith(target,axis=0).sort_values(ascending=False)

## Irrelevant Columns 

In [None]:
# Numeric features to discard before modelling; each name appears exactly once.
num_drop_cols = ['BsmtFinSF2','BsmtUnfSF','KitchenAbvGr','YearBuilt','YrSold','LowQualFinSF','MoSold','GarageCars']

In [None]:
# Categorical features to discard before modelling.
cat_drop_cols = [
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'FireplaceQu',
]

In [None]:
(housing_num.isna().sum().sort_values(ascending=False)/len(housing_num)) * 100 # percentage of missing values per numeric column

In [None]:
housing_num.LotFrontage.describe()  # author's note: mean and median are about the same, so either can be used to impute the NaN values

In [None]:
(housing_cat.isna().sum().sort_values(ascending=False)[:15]/len(housing_cat)) *100   # percentage of missing values for the 15 most incomplete categorical columns

# Drop irrelevant columns 

In [None]:
# Remove the columns judged irrelevant. Reassigning (instead of inplace=True)
# avoids mutating a frame derived from `housing`, and errors="ignore" makes the
# cell safe to re-run after the columns are already gone.
housing_num = housing_num.drop(columns=num_drop_cols, errors="ignore")
housing_cat = housing_cat.drop(columns=cat_drop_cols, errors="ignore")

In [None]:
from sklearn.preprocessing import OneHotEncoder # handle_unknown='ignore': categories not seen during fit are encoded as all zeros instead of raising an error
cat_encoder = OneHotEncoder(handle_unknown = 'ignore')

# Use of Transformers and Pipeline

In [None]:
from sklearn.pipeline import Pipeline  # separate pipelines for categorical and numeric data
from sklearn.preprocessing import StandardScaler

# Numeric features: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode with the encoder configured above.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
     ('cat', cat_encoder)
])


* housing_num ----- dataset containing all numeric values
* housing_cat ----- dataset containing all categorical values

In [None]:
from sklearn.compose import ColumnTransformer  # combines num and cat data; columns not listed below (the dropped ones) are ignored

# Column names each sub-pipeline is applied to, taken from the reduced frames.
num_attribs = housing_num.columns
cat_attribs = housing_cat.columns

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs)
    ])

# Fit the preprocessing on the training features and transform them in one step.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
housing_prepared # a compressed sparse matrix, not the original DataFrame

# Train and Evaluate 

Loss function used - **root mean squared log error (RMSLE)**, as a large number of outliers are present in the data

Algorithms used:
* 1.Linear Regression
* 2.Decision Tree
* 3.RandomForest

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary linear regression fitted on the prepared training features.
lin_reg  = LinearRegression()
lin_reg.fit(housing_prepared,target)

In [None]:
# Sanity check: run the first five training rows through the fitted
# preprocessing and the linear model, and compare with their true prices.
some_data = housing.iloc[:5]
some_labels = target.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:",lin_reg.predict(some_data_prepared))
# Show only the five matching labels, not the whole target series.
print("Labels:",some_labels.values)

In [None]:
### Measure through RMSLE (root mean squared log error)

In [None]:
from sklearn.metrics import mean_squared_log_error
# RMSLE of the linear model on the same rows it was trained on (training error).
housing_predictions = lin_reg.predict(housing_prepared)
lin_mslr = mean_squared_log_error(target,housing_predictions)
lin_rmslr = np.sqrt(lin_mslr)  # square root of the MSLE gives the RMSLE
lin_rmslr

In [None]:
### Let's try a Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so that Restart & Run All reproduces the same model and scores;
# 42 matches the seed used for the grid-searched forest later in the notebook.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared,target)

In [None]:
# RMSLE of the decision tree on the rows it was trained on (training error).
housing_predictions = tree_reg.predict(housing_prepared)
tree_mslr = mean_squared_log_error(target,housing_predictions)
tree_rmslr = np.sqrt(tree_mslr)
tree_rmslr

# Cross validation 

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree, scored with negated MSLE.
scores = cross_val_score(tree_reg,housing_prepared,target,scoring= "neg_mean_squared_log_error",cv=10)
# The scorer returns negated MSLE, so flip the sign before taking the root.
tree_rmsle_scores= np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print an array of scores followed by its mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    Nothing is returned; the three lines are written to stdout.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard Deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)
  

In [None]:
# Cross-validated RMSLE of the decision tree.
display_scores(tree_rmsle_scores)

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error


def _clipped_msle(y_true, y_pred):
    """Mean squared log error with predictions floored at zero.

    A linear model can predict negative prices, for which the log error is
    undefined; flooring at 0 keeps the metric computable on every fold.
    """
    return mean_squared_log_error(y_true, np.clip(y_pred, 0, None))


# Score the linear model with the same metric as the tree-based models (RMSLE)
# so the cross-validation results are directly comparable and `lin_rmsle_scores`
# really holds RMSLE values rather than RMSE in price units.
# greater_is_better=False makes the scorer return the negated loss, hence the
# sign flip before the square root.
lin_scores = cross_val_score(lin_reg, housing_prepared, target,
                             scoring=make_scorer(_clipped_msle, greater_is_better=False),
                             cv=10)
lin_rmsle_scores = np.sqrt(-lin_scores)
display_scores(lin_rmsle_scores)

In [None]:
# NOTE(review): repeats the display already made at the end of the previous cell.
display_scores(lin_rmsle_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so that Restart & Run All reproduces the same model and scores;
# 42 matches the seed used for the grid-searched forest later in the notebook.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared,target)

In [None]:
# RMSLE of the random forest on the rows it was trained on (training error).
# The metrics are kept in plain variables, named like lin_*/tree_* above,
# instead of being attached as ad-hoc attributes to the estimator object.
housing_predictions = forest_reg.predict(housing_prepared)
forest_mslr = mean_squared_log_error(target,housing_predictions)
forest_rmslr = np.sqrt(forest_mslr)
forest_rmslr

In [None]:
# 10-fold cross-validation of the random forest, scored with negated MSLE.
forest_scores = cross_val_score(forest_reg,housing_prepared,target,scoring='neg_mean_squared_log_error',cv=10)
# Flip the sign of the negated MSLE before taking the root to obtain RMSLE.
forest_rmsle_scores= np.sqrt(-forest_scores)

In [None]:
# Cross-validated RMSLE of the random forest.
display_scores(forest_rmsle_scores)

# Use of GridSearchCV to identify the optimal hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids: forests with the default bootstrap setting over four max_features
# values, and non-bootstrapped forests over three. `bootstrap` must be the
# boolean False: the string 'False' is truthy (so it would not disable
# bootstrapping) and is rejected outright by recent scikit-learn versions.
param_grid = [{ 'max_features':[2,4,6,8]},
              {'bootstrap':[False], 'max_features':[2,3,4]}
]

# 5-fold search scored with negated MSLE; train scores are kept for inspection.
forest_reg = RandomForestRegressor(random_state=42)
grid_search= GridSearchCV(forest_reg,param_grid,cv=5,
                          scoring='neg_mean_squared_log_error',
                          return_train_score=True)

grid_search.fit(housing_prepared,target)

In [None]:
# Parameter combination with the best cross-validated score.
grid_search.best_params_

In [None]:
grid_search.best_estimator_  # the best estimator found by the search

In [None]:
# Use the best estimator from the grid search as the final model.
final_model = grid_search.best_estimator_

In [None]:
# Evaluate the tuned model on the held-out split.
y_test = test_set['SalePrice']
x_test = test_set.drop(columns="SalePrice")

# Only transform here: the preprocessing was already fitted on the training features.
x_test_prepared = full_pipeline.transform(x_test)
final_predictions = final_model.predict(x_test_prepared)

# MSLE first, then its square root for the RMSLE.
final_msle = mean_squared_log_error(y_test, final_predictions)
final_rmsle = np.sqrt(final_msle)

In [None]:
# RMSLE of the final model on the held-out split.
final_rmsle

In [None]:
from scipy import stats

# 95% t-based confidence interval for the held-out RMSLE. The interval is built
# on the squared *log* errors (log1p, as in mean_squared_log_error) so that it
# is expressed in the same units as `final_rmsle`, not in squared prices.
confidence = 0.95
squared_log_errors = (np.log1p(final_predictions) - np.log1p(y_test)) ** 2
np.sqrt(stats.t.interval(confidence,
        len(squared_log_errors) - 1,
        loc = squared_log_errors.mean(),
        scale=stats.sem(squared_log_errors)))

In [None]:
# Load the competition test CSV that predictions are generated for.
original_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Apply the already-fitted preprocessing to the competition test set.
# NOTE: this overwrites the `x_test_prepared` computed for the held-out split.
x_test_prepared = full_pipeline.transform(original_test)

In [None]:
# Display the prepared competition test features.
x_test_prepared

In [None]:
# Predict prices for the competition test set.
# NOTE: this overwrites the `final_predictions` made for the held-out split.
final_predictions = final_model.predict(x_test_prepared)

# Submission

In [None]:
# Two-column submission frame: the Id from the test file and the predicted SalePrice.
submission_2 = pd.DataFrame({"Id":original_test.Id,"SalePrice":final_predictions})

In [None]:
# Write the submission file without the DataFrame index column.
submission_2.to_csv('submission_2.csv', index=False)