# Housing Data Linear Regression

## Necessary Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

## EDA and Preliminary Data Visualization 

In [None]:
# Load the Kaggle training set, using the 'Id' column as the row index,
# then preview the first few rows.
train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv, index_col='Id')
display(df.head())

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the frame, shown in full (no row/column
# truncation) and rounded to 3 decimal places for readability.
summary_stats = df.describe()
full_display = ('display.max_rows', None,
                'display.max_columns', None,
                'display.precision', 3)
with pd.option_context(*full_display):
    display(summary_stats)


Plotting histograms for all features

In [None]:
# One histogram per column that DataFrame.hist can plot, drawn on a single
# large figure so the individual panels stay legible.
df.hist(figsize=(30, 15), bins=50)
plt.show()

Correlation Matrix to facilitate selecting features

In [None]:
# Pairwise correlation matrix of the numeric features.
# Non-numeric (object) columns are filtered out explicitly: pandas >= 2.0 no
# longer drops them silently inside DataFrame.corr() and raises on string
# data instead. select_dtypes works the same way on older pandas versions.
numeric_df = df.select_dtypes(include='number')
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(numeric_df.corr())

A closer look at the correlation matrix, with every feature pair sorted by absolute correlation:

In [None]:
# Absolute Pearson correlation for every pair of numeric features, flattened
# into a single Series and sorted from strongest to weakest.
# Only numeric columns are used: DataFrame.corr() raises on object columns
# in pandas >= 2.0 instead of silently ignoring them.
correlation = df.select_dtypes(include='number').corr(method='pearson').abs()
# unstack() turns the square matrix into a Series indexed by (row, column),
# so each pair appears twice and each feature is paired with itself (1.0).
sortedCorrelation = correlation.unstack().sort_values(ascending=False)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(sortedCorrelation)

Unwieldy as the tables above are, they reveal redundant features: not features that are mostly null or uncorrelated with the target (SalePrice), but features that are very highly correlated with one another. These pairs are:
* GarageCars     GarageArea
* YearBuilt      GarageYrBlt
* TotRmsAbvGrd   GrLivArea
* TotalBsmtSF    1stFlrSF
<br> Hence, one feature from each of the above pairs (4 features in total) will be removed.

However, before taking further steps, it's crucial to clean the data at hand and then revisit the correlation matrix.

## Data Cleaning

As seen above, some of the attributes consist mainly of null values (for instance, MiscFeature and PoolQC). Such features will hardly contribute anything to the model, so they will be removed, since more than 5% of their values are null. I will also delete the redundant columns discussed above.

In [None]:
# Columns removed before modelling.
dropColumns = [
    # Mostly-null features. The pool-quality column is spelled 'PoolQC';
    # the previous 'PoolQc' matched nothing, so the column was never dropped.
    'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
    # One member of each highly correlated feature pair.
    'TotalBsmtSF', 'TotRmsAbvGrd', 'YearBuilt', 'GarageArea',
]
# DataFrame.drop raises KeyError on an unknown label, so a misspelled column
# name fails loudly instead of being skipped silently by an isin() mask.
dfNew = df.drop(columns=dropColumns)

In [None]:
# Print the correlation of every numeric column with the target, SalePrice.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numOnly = dfNew.select_dtypes(include=numerics)
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# (column name, Series) iterator and is available in older versions as well.
for columnName, columnData in numOnly.items():
    print('Colunm Name : ', columnName)
    print(columnData.corr(dfNew['SalePrice']))

Dropping more columns due to low correlation

In [None]:
# Second round of pruning: columns judged to have low correlation with
# SalePrice. Labels that are not present in a frame are simply skipped.
dropColumns2 = [
    'YrSold', 'MoSold', 'MiscVal', '3SsnPorch', 'BsmtHalfBath',
    'LowQualFinSF', 'BsmtFinSF2', 'OverallCond', 'MSSubClass',
]
# Full frame (categorical and numerical columns).
dfNew = dfNew.drop(columns=dropColumns2, errors='ignore')
# Numerical columns only.
numOnly = numOnly.drop(columns=dropColumns2, errors='ignore')

### Handling Missing Data (Numerical Columns Only)

In [None]:
# Separate the target from the numeric features, then fill each missing
# value with the most frequent value of its column.
target_column = 'SalePrice'
y = numOnly[target_column]
X = numOnly.drop(columns=[target_column])

# NOTE(review): the imputer is fitted on all rows, i.e. before any
# train/test split, so test rows influence the fill values.
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X = imp_mode.fit_transform(X)  # X is a NumPy array from here on

# Preliminary Linear Regression with Numerical Columns Only

In [None]:
# Baseline: ordinary least squares on a 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
reg_all = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
y_pred = reg_all.predict(X_test)
# R^2 on the held-out 30% (shown as the cell output).
reg_all.score(X_test, y_test)


## What if the data was normalized?

In [None]:
from sklearn.preprocessing import StandardScaler

# Same 70/30 split as the baseline, but with standardized features.
# The split is done on the raw features first and the scaler is fitted on
# the training rows only, so no statistics of the held-out rows leak into
# the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole data set transformed with the training-set statistics; the name is
# kept so that any code relying on `scaled_X` continues to work.
scaled_X = scaler.transform(X)

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)
y_pred = reg_all.predict(X_test)
# R^2 on the held-out rows (shown as the cell output).
reg_all.score(X_test, y_test)

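Standardizing hardly changes anything in this case. That is expected: ordinary least squares is invariant to rescaling of the features, so the fitted predictions, and therefore the R² score, stay essentially the same.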

# Preliminary Cross-Validation with Numerical Columns Only 

In [None]:
# 5-fold cross-validation of plain linear regression on the numeric
# features; each fold is scored with the estimator's default score (R^2).
reg = LinearRegression()
cv_results = cross_val_score(estimator=reg, X=X, y=y, cv=5)
print(cv_results)
# Mean score across the five folds (shown as the cell output).
cv_results.mean()

It actually performed worse which is rather pitiful

# An Attempt at Regularized Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge regression over a small sweep of regularization strengths.
#
# The `normalize=True` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so it can no longer be passed to Ridge. Following the
# scikit-learn deprecation guidance, the same model is expressed as a
# pipeline with StandardScaler(with_mean=False), and alpha is multiplied by
# the number of training samples to keep the penalty comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
n_train = X_train.shape[0]
for i in range(10, 20):
    base_alpha = i / 10000000  # 1.0e-6 ... 1.9e-6, as in the original sweep
    # NOTE: `ridge` is now a Pipeline; the fitted Ridge estimator itself is
    # available as ridge[-1] / ridge.named_steps['ridge'].
    ridge = make_pipeline(
        StandardScaler(with_mean=False),
        Ridge(alpha=base_alpha * n_train),
    )
    ridge.fit(X_train, y_train)
    ridge_pred = ridge.predict(X_test)
    # R^2 on the held-out rows for this alpha.
    print(ridge.score(X_test, y_test))

# An Attempt at Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Lasso regression on the same 70/30 split.
#
# The `normalize=True` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so it can no longer be passed to Lasso. Following the
# scikit-learn deprecation guidance, the model is expressed as a pipeline
# with StandardScaler(with_mean=False), and alpha is multiplied by
# sqrt(n_samples) to keep the penalty comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
base_alpha = 0.0001
# NOTE: `lasso` is now a Pipeline; the fitted Lasso estimator itself is
# available as lasso[-1] / lasso.named_steps['lasso'].
lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=base_alpha * np.sqrt(X_train.shape[0])),
)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
# R^2 on the held-out rows (shown as the cell output).
lasso.score(X_test, y_test)