In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score



%matplotlib inline


In [None]:
# Load the training data; the CSV's 'Id' column is used as the row index.
sale_price_train = pd.read_csv("train.csv", index_col='Id')
sale_price_train.head()


## Checking for duplicate rows

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
sale_price_train.duplicated().any()

In [None]:
# Per-column dtype and non-null count; used to spot columns with missing values.
sale_price_train.info()

## Splitting data into categorical and non-categorical (continuous) columns

In [None]:
# describe() only summarises the numeric columns, so its column labels give
# the "continuous" set; every remaining column is treated as categorical.
continuous_cols = sale_price_train.describe().columns.tolist()
categorical_cols = [
    col for col in sale_price_train.columns
    if col not in continuous_cols
]

In [None]:
# Work on explicit copies: later cells fill, drop and re-encode columns of
# these two frames, and that must neither touch sale_price_train nor trigger
# pandas' "setting on a copy of a slice" warnings.
continuous_data = sale_price_train.loc[:, continuous_cols].copy()
categorical_data = sale_price_train.loc[:, categorical_cols].copy()

## Cleaning Continuous Data

In [None]:
# Boolean flag per continuous column: does it contain at least one NaN?
cont_d = continuous_data.isnull().any()
# Keep only the names of the flagged columns.
missing_continuous = cont_d[cont_d].index.tolist()
missing_continuous

In [None]:
# Percentage of missing values in each affected continuous column.
# isna().mean() divides by the actual number of rows, so the figure stays
# correct if the dataset size differs from the previously hard-coded 1460.
for col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    print(continuous_data[col].isna().mean() * 100)



In [None]:
# The values printed above are the missing-value percentages of these three
# columns; each column's gaps are replaced with that column's median.
#
# The fill is done on the frame and the result re-assigned, instead of
# `continuous_data[col].fillna(..., inplace=True)`: the in-place form operates
# on a temporary column object, which pandas warns about and which leaves the
# frame unchanged when copy-on-write is enabled.
medians = continuous_data[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']].median()
# Passing a Series fills each column named in its index with the matching value.
continuous_data = continuous_data.fillna(medians)

In [None]:
# Separate the target (SalePrice) from the continuous features.
# The target is read from the untouched source frame and the column is dropped
# without mutating in place, so re-running this cell (before the scaling step)
# no longer fails on the already-removed column.
labels = sale_price_train['SalePrice']
continuous_data = continuous_data.drop(columns="SalePrice", errors="ignore")
continuous_cols = list(continuous_data.columns)

In [None]:
# Select matplotlib's "classic" style; plt.style.use applies to later figures as well.
plt.style.use("classic")
# One 50-bin histogram per continuous feature on a 25x25 inch figure.
continuous_data.hist(bins=50, figsize=(25,25))
plt.show()

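## Many histograms are tail-heavy and the features sit on very different scales, so we standardize each feature to zero mean and unit variance (feature scaling). Standardization rescales the values; it does not change the shape of their distribution.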

In [None]:
# Standardise every continuous feature (zero mean, unit variance).
# From here on `continuous_data` is the NumPy array returned by the scaler,
# no longer a DataFrame.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook - confirm this leakage is acceptable.
sc_x = StandardScaler().fit(continuous_data)
continuous_data = sc_x.transform(continuous_data)

In [None]:
# Inspect the scaled values (the array produced by fit_transform above).
print(continuous_data)

## Cleaning Categorical Data

In [None]:
# Boolean flag per categorical column: does it contain at least one NaN?
cat_d = categorical_data.isnull().any()
# Keep only the names of the flagged columns.
missing_categorical = cat_d[cat_d].index.tolist()
missing_categorical

In [None]:
# Percentage of missing values for every categorical column that has gaps.
# isna().mean() uses the real row count instead of a hard-coded 1460.
for i in missing_categorical:
    print(i + " = " + str(categorical_data[i].isna().mean() * 100))

In [None]:
# Columns removed from the categorical set (presumably those with the largest
# missing share in the listing above - confirm against that output).
drop_categorical = ["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"]
# Re-assign instead of dropping in place, and tolerate already-missing columns,
# so the cell can be re-run without raising KeyError.
categorical_data = categorical_data.drop(columns=drop_categorical, errors="ignore")
categorical_cols = list(categorical_data.columns)

In [None]:
# Bar chart of category frequencies for every categorical column.
# The grid is sized from the number of columns (5 charts per row) instead of a
# fixed 8x5 layout, so no column is silently skipped when there are more than
# 40 of them, and unused panels are hidden rather than drawn empty.
n_grid_cols = 5
n_charts = len(categorical_data.columns)
n_grid_rows = max(1, -(-n_charts // n_grid_cols))  # ceiling division
f, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(4 * n_grid_cols, 4.375 * n_grid_rows),  # equals (20, 35) for an 8x5 grid
    squeeze=False,
)

flat_axes = axes.ravel()
for ax, col in zip(flat_axes, categorical_data.columns):
    target = categorical_data[col].value_counts()
    ax.bar(target.index, target)
    ax.set_title(col)

# Hide the panels left over in the last row.
for ax in flat_axes[n_charts:]:
    ax.set_visible(False)

In [None]:
# Fill the remaining gaps in each categorical column with that column's most
# frequent value (the first entry of value_counts()).
most_frequent = {
    col: categorical_data[col].value_counts().index[0]
    for col in categorical_data.columns
}
categorical_data = categorical_data.fillna(value=most_frequent)

In [None]:
# Label-encode every categorical column: convert to pandas' 'category' dtype
# and keep only the integer codes.
# NOTE(review): the codes follow pandas' category ordering, not any domain
# ranking of the labels - confirm that is acceptable for quality-type columns.
categorical_data = categorical_data.apply(
    lambda column: column.astype('category').cat.codes
)
categorical_data.head()


In [None]:
# Plain-text dump of the encoded categorical frame.
print(categorical_data)

In [None]:
# Confirm what the scaling step left in `continuous_data` (the output of fit_transform).
type(continuous_data)

In [None]:
# Display the scaled continuous values.
continuous_data

In [None]:
# Wrap the scaled values in a DataFrame again, restoring the feature names as column labels.
data1 = pd.DataFrame(data=continuous_data, columns=continuous_cols)

In [None]:
# Preview the rebuilt continuous frame.
data1.head()

In [None]:
# Preview the encoded categorical frame for comparison.
categorical_data.head()

In [None]:
# Sequential 1..len(data1) values, used below as a shared 'ID' join key.
new_col = range(1, len(data1) + 1)

In [None]:
# Add the join key as the first column of the continuous frame.
# NOTE(review): insert() mutates data1 in place, so re-running this cell
# fails once 'ID' already exists.
data1.insert(loc=0, column='ID', value=new_col)

In [None]:
# Check that 'ID' is now the first column.
data1.head()

In [None]:
# Add the same positional join key to the categorical frame.
# NOTE(review): insert() mutates categorical_data in place, so re-running this
# cell fails once 'ID' already exists.
categorical_data.insert(loc=0, column='ID', value=new_col)

In [None]:
# Check that 'ID' is now the first column.
categorical_data.head()

In [None]:
# Join the continuous and categorical features row by row on the shared 'ID' key.
final_df = data1.merge(categorical_data, on="ID")

In [None]:
# Preview the merged feature frame.
final_df.head()

In [None]:
# Preview the original frame again, which still holds the SalePrice target.
sale_price_train.head()

In [None]:
# Attach the target to the merged features by POSITION.
# final_df carries the fresh integer index produced by the merge, while
# sale_price_train is indexed by 'Id'. Assigning the Series directly aligns on
# index labels and puts every price on the wrong row; that used to be patched
# with shift(-1) plus a hard-coded value for the last row. Handing over the raw
# values keeps row i of the features with row i of the prices, with no
# dataset-specific constant.
final_df['SalePrice'] = sale_price_train['SalePrice'].to_numpy()
# Show the last rows, including the one that was previously hard-coded.
final_df.tail()

## Feature Selection using Correlation matrix

In [None]:
# Pairwise correlations between all columns of the assembled frame,
# drawn as an annotated heatmap.
cor = final_df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(data=cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Absolute correlation of every column with the target.
cor_target = cor["SalePrice"].abs()
# Keep the columns whose absolute correlation exceeds 0.5.
relevant_features = cor_target.loc[cor_target.gt(0.5)]
relevant_features

In [None]:
# Candidate feature subset. Despite its name, X_train is not yet a training
# split - the train/test split happens later.
X_train = final_df[[
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
    'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
    'ExterQual', 'BsmtQual', 'KitchenQual', 'GarageFinish',
]]
X_train.head()

In [None]:
# Correlations among the selected features only, to spot pairs that are
# strongly correlated with each other.
cor = X_train.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(data=cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Final feature matrix: the selected subset without ExterQual and KitchenQual.
X = X_train.drop(['ExterQual', 'KitchenQual'], axis=1)

## Linear regression implementation


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics



In [None]:
# Target vector, then a reproducible 80/20 train/test split (seed 42).
Y = final_df.loc[:, 'SalePrice']
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Fit an ordinary least-squares model on the training split.
lr = LinearRegression().fit(train_x, train_y)
# Trailing expression keeps the fitted estimator as the cell's displayed output.
lr

In [None]:
# Predict the target for the held-out test split.
pred_y = lr.predict(test_x)

In [None]:
# Score the predictions on the held-out split: raw MSE, plus R2 and explained
# variance rounded to two decimals.
mse = metrics.mean_squared_error(test_y, pred_y)
r2 = round(metrics.r2_score(test_y, pred_y), 2)
explained_var = round(metrics.explained_variance_score(test_y, pred_y), 2)

print("Mean Squared Error {}".format(mse))
print("R2 score =", r2)
print("Explain variance score =", explained_var)
print("Explain variance score =", round(metrics.explained_variance_score(test_y, pred_y), 2)) 