In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Hello Kagglers!
## This will be a basic walkthrough of using **Linear Regression only!** to predict sale prices
## Feedback would be much appreciated; cheer on your machine learning newbie


In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#polynomial
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Read the competition's training and test tables into DataFrames.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Show the training frame as the cell's output.
df_train

## First off, we will start by using ***continuous features*** only and drop useless features

In [None]:
# Correlation of every numeric column with SalePrice, ranked by absolute
# strength (notice that overall quality is very highly correlated).
# select_dtypes keeps only numeric columns: DataFrame.corr() raises on the
# string-valued columns in pandas >= 2.0, where numeric_only defaults to False.
corr_y = df_train.select_dtypes(include='number').corr()
# Take abs() *before* sorting so strong negative correlations are ranked by
# magnitude, and drop the target's self-correlation by label, not by position.
corr_y['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)

### Notice some highly correlated features, like overall quality and GrLivArea!

In [None]:
# Keep only the columns stored as int64; float and object columns are left out.
num_columns = [name for name, dtype in df_train.dtypes.items() if dtype == 'int64']

In [None]:
# Absolute pairwise correlations between the selected integer columns.
corr = df_train[num_columns].corr().abs()

# Boolean mask over the upper triangle (diagonal included) so each pair is
# drawn only once. The builtin `bool` replaces `np.bool`, an alias that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(15,8))

# plot heatmap
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1)
plt.show()

In [None]:
# Remove the 'Id' and 'MoSold' columns, which are treated as useless here,
# then show the remaining column names together with their count.
num_col2 = [name for name in num_columns if name not in ('Id', 'MoSold')]
num_col2, len(num_col2)

In [None]:
# Split into features and target.
# NOTE(review): the positional slices assume the target (SalePrice) is the
# last column of `train` — confirm against the column list in num_col2.
train = df_train[num_col2]
# Standardize every feature column (zero mean, unit variance).
x_train = preprocessing.scale(train.iloc[:,:-1])
# log1p-transform the target; it stays a one-column DataFrame.
y_train = np.log1p(train.iloc[:,-1:])
# histplot with a KDE overlay replaces sns.distplot, which seaborn has
# deprecated. The log-transformed sale price now looks symmetric.
sns.histplot(y_train.iloc[:, 0], kde=True, stat='density')

## Plotting different features against Prices

In [None]:
# Scatter each feature against SalePrice, five features per figure.
# The target is removed from the x-axis candidates so that no panel plots
# SalePrice against itself.
feature_names = train.columns.drop('SalePrice')
for start in range(0, len(feature_names), 5):
    sns.pairplot(data=train,
                 x_vars=feature_names[start:start + 5],
                 y_vars=['SalePrice'])

### There appear to be multiple features with a linear relation to sale price, and other useless features

## We start building a baseline model for our linear regression problem using all features
### Since we have a relatively small dataset, the model is cross-validated with K-folds = 4 


In [None]:
# Baseline: ordinary least squares on all features, evaluated with 4-fold
# cross-validation. Scores are negated RMSE, so values closer to zero are better.
lin_reg = LinearRegression()
cross_val_score(estimator=lin_reg, X=x_train, y=y_train,
                scoring='neg_root_mean_squared_error', cv=4)

### Baseline model seems to be doing well but with high variance in model, let's try adding linear regression hyperparameters


In [None]:
# Unregularized linear regression trained with stochastic gradient descent.
# tol=None switches off the convergence-based stopping criterion, so training
# always runs for max_iter epochs. It replaces tol=-np.infty: the `np.infty`
# alias was removed in NumPy 2.0, and scikit-learn >= 1.2 rejects a negative
# tol during parameter validation.
sgd_reg = SGDRegressor(max_iter=5000, tol=None, penalty=None, eta0=0.005, random_state=42)
# SGDRegressor expects a 1-D target; flatten the one-column frame so no
# DataConversionWarning is raised on every fold.
cross_val_score(sgd_reg, x_train, y_train.to_numpy().ravel(), cv=4, scoring='neg_root_mean_squared_error')

### There seems to be a pattern in predicting observation groups, so we will use shuffled k-folds

In [None]:
# Shuffled 4-fold splitter with a fixed seed; later cells reuse it as `cv`.
kf = KFold(n_splits=4, shuffle=True, random_state=1)
# Re-score the SGD model on the shuffled folds, passing the target as a flat array.
cross_val_score(sgd_reg, x_train, y_train.to_numpy().ravel(),
                scoring='neg_root_mean_squared_error', cv=kf)

## Now we try to find our most important features, which can clearly be observed from the thetas and the correlations too.

In [None]:
# Fit the linear model on the full training set, then show the intercept
# followed by the coefficients (thetas) — notice that some are very large.
lin_reg.fit(x_train, y_train)
(lin_reg.intercept_, lin_reg.coef_)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [None]:
# Forward, non-floating sequential feature selection that grows the subset
# up to 29 features, scoring each candidate with negated RMSE on the `kf` folds.
sfs1 = SFS(
    estimator=lin_reg,
    k_features=29,
    forward=True,
    floating=False,
    scoring='neg_root_mean_squared_error',
    cv=kf,
)

In [None]:
# Run the selection, report the chosen subset, and plot the cross-validated
# score for each subset size with standard-error bands.
sfs1 = sfs1.fit(x_train, y_train)
print("Selected Features :", sfs1.k_feature_names_)
print("Selected Features ID :", sfs1.k_feature_idx_)

fig = plot_sfs(sfs1.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

In [None]:
# Turn the selector's metric dictionary into a table and transpose it so each
# row is one entry of that dictionary.
df = pd.DataFrame(sfs1.get_metric_dict()).transpose()
# Show only the feature indices and the average CV score of each row.
df.loc[:, ["feature_idx", "avg_score"]]

In [None]:
# Rank the rows from the best average score to the worst.
df.sort_values(by='avg_score', ascending=False)

In [None]:
# Value in the second column of the top-ranked row.
# NOTE(review): column 1 is presumably the per-fold scores — verify against the table above.
df.sort_values(by='avg_score', ascending=False).iloc[0, 1]

In [None]:
# Take the first-column value (presumably the feature_idx tuple) of the
# fifth-ranked row and turn it into a list of positional indices.
# NOTE(review): row 4 is the fifth-best subset, not the top-scoring one —
# confirm this choice is intentional.
ranked = df.sort_values(by='avg_score', ascending=False)
selected = list(ranked.iloc[4, 0])

### Filter selected features with least error

In [None]:
# Translate the selected positional indices into column names of `train`.
selected_cols = train.columns[selected].tolist()
selected_cols

#### Found our most important continuous features using forward Sequential Feature Selection

In [None]:
# Rebuild the training matrices using only the selected columns.
train2 = df_train.loc[:, selected_cols]
# Standardize the reduced feature set.
x_train2 = preprocessing.scale(train2)
# Same log1p-transformed target as before, taken from the last column of `train`.
y_train2 = np.log1p(train.iloc[:, -1:])

## After selecting our features and testing our model, we will begin grid searching on models such as SGD, since the simple linear regression method wasn't able to solve for the testing data.

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Candidate values shared by the initial learning rate (eta0) and the
# regularization strength (alpha).
lr = np.linspace(0.001, 0.1, 20)

# Exhaustive search over eta0 x alpha x penalty, scored by negated RMSE on the
# shuffled folds. random_state pins SGD's data shuffling so that re-running
# the cell selects the same hyperparameters.
clf = GridSearchCV(estimator=SGDRegressor(random_state=42), cv=kf, param_grid=dict(eta0=lr,alpha=lr,penalty=['l1','elasticnet',None]), n_jobs=-1, scoring='neg_root_mean_squared_error')
clf.fit(x_train2, y_train2.to_numpy().ravel())

# Only the last expression of a cell is displayed, so print the search
# results explicitly instead of leaving them as bare expressions.
print("Best CV score:", clf.best_score_)
print("Best parameters:", clf.best_params_)

# Score of the refit best estimator on the very data it was trained on.
# This is an optimistic in-sample figure, not a test-set score.
clf.score(x_train2, y_train2.to_numpy().ravel())

## Managed to increase model accuracy by selecting features and tuning hyperparameters

In [None]:
# Cross-validate the grid search itself: the whole search is re-fit on the
# training part of each `kf` fold and scored on the held-out part.
cross_val_score(clf, x_train2, y_train2.to_numpy().ravel(),
                scoring='neg_root_mean_squared_error', cv=kf)

# Using our SGDRegressor with optimum parameters, we now predict on the testing data and export the final predictions

In [None]:
# Build the test matrix with statistics learned from the TRAINING data.
# The model was fit on features standardized with the training means and
# standard deviations, so the test rows must go through that same transform;
# scaling the test set with its own statistics shifts its features relative
# to what the model saw during fitting.
# Missing test values are likewise imputed with the training column means.
# fillna returns a new frame, avoiding the in-place write on a selection of
# df_test (and the SettingWithCopyWarning it can trigger).
test = df_test[selected_cols].fillna(train2.mean())
scaler = preprocessing.StandardScaler().fit(train2)
x_test = scaler.transform(test)


In [None]:
# Predict in log space, then invert the log1p transform applied to the target.
log_pred = clf.predict(x_test)
y = np.expm1(log_pred)
y

In [None]:
# Pair each test Id with its predicted sale price.
subm = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': y,
})
subm

 ### anddd submit!  

In [None]:
# Write the submission to the working directory, leaving out the row index.
subm.to_csv('SGD2.csv', index=False)