In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# First with all features

# Data Preprocessing

In [None]:
# Read the competition's training split and preview its first rows.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame as loaded.
train_df.info()

In [None]:
# First pass uses numeric columns only; every non-numeric column is dropped here.
train_df = train_df.select_dtypes(include='number')
train_df.head()

In [None]:
# Re-check dtypes and non-null counts now that only numeric columns remain.
train_df.info()

In [None]:
# Numeric columns that contain null values: LotFrontage, MasVnrArea, GarageYrBlt.
# Show their summary statistics side by side.
cols_with_nulls = ['GarageYrBlt', 'MasVnrArea', 'LotFrontage']
train_df[cols_with_nulls].describe()

In [None]:
# Preview the first rows of the three columns that contain nulls.
train_df[['GarageYrBlt','MasVnrArea','LotFrontage']].head()

In [None]:
# Kernel density estimate of the MasVnrArea distribution (trailing ';' hides the Axes repr).
train_df['MasVnrArea'].plot.kde();

In [None]:
# Median of MasVnrArea; the next cell uses column medians to fill nulls.
train_df['MasVnrArea'].median()

In [None]:
# Fill the nulls of the three affected numeric columns with each column's own median.
# The result is assigned back to the frame rather than calling fillna(inplace=True)
# on a column selection: that form is chained assignment, which emits a FutureWarning
# on pandas 2.x and, with Copy-on-Write enabled, only modifies a temporary so the
# nulls in train_df would silently remain.
for col in ['MasVnrArea', 'GarageYrBlt', 'LotFrontage']:
    train_df[col] = train_df[col].fillna(train_df[col].median())
train_df.info()

In [None]:
# Summary statistics of the same three columns after the median fill.
train_df[['GarageYrBlt','MasVnrArea','LotFrontage']].describe()

In [None]:
# Kernel density estimate of the raw SalePrice target.
train_df['SalePrice'].plot.kde()

In [None]:
# Same density plot on the natural log of SalePrice.
log_price = np.log(train_df['SalePrice'])
log_price.plot.kde();

In [None]:
# Preprocess the data: standardized numeric features (x) and log-transformed target (y).
from sklearn.preprocessing import StandardScaler as SC
sc_x = SC()
# Every remaining column except the target and the row identifier is a feature.
x = np.array(train_df.drop(columns=['SalePrice', 'Id']))
# reshape(-1, 1) infers the number of rows instead of hard-coding 1460, so the
# cell keeps working if rows are filtered or a different file is loaded.
y = np.array(train_df['SalePrice']).reshape(-1, 1)
# NOTE(review): the scaler is fit on all rows before the train/test split below,
# so held-out rows contribute to the scaling statistics.
x = sc_x.fit_transform(x)
# Target is modelled as log(1 + price); predictions must be inverted with expm1.
y = np.log1p(y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

# Linear and SGD

In [None]:
def evaluate(model):
    """Print RMSE and R^2 of a fitted model on the test split, then on the train split.

    Uses the notebook-level arrays ``x_train``, ``y_train``, ``x_test`` and ``y_test``
    and the names ``r2_score`` and ``mse`` (``sklearn.metrics.mean_squared_error``),
    all of which must exist before the first call.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(features)``.

    Returns
    -------
    None
        Results are printed as ``<label> <rmse> <r2>``.
    """
    splits = (
        ('Test data results', x_test, y_test),
        ('Train data results', x_train, y_train),
    )
    for label, features, target in splits:
        # Predict once per split and reuse the result for both metrics.
        predicted = model.predict(features)
        # RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mse(target, predicted))
        score = r2_score(target, predicted)
        print(label, rmse, score)

In [None]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as mse, r2_score

# Ordinary least squares baseline on all numeric features (fit returns the estimator).
linreg = LR().fit(x_train, y_train)

evaluate(linreg)

In [None]:
from sklearn.linear_model import SGDRegressor as sgd
# SGD shuffles the training data between epochs; a fixed seed makes the reported
# scores repeatable across runs (42 matches the seed used for the split).
sg = sgd(random_state=42)
# SGDRegressor expects a 1-D target, hence ravel() on the (n, 1) array.
sg.fit(x_train,y_train.ravel())

evaluate(sg)

# Ridge, Lasso, ElasticNet

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model with penalty strength alpha=0.5.
rid = Ridge(alpha=0.5).fit(x_train, y_train)

evaluate(rid)

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear model with penalty strength alpha=0.0005.
lasso = Lasso(alpha=0.0005).fit(x_train, y_train)

evaluate(lasso)

In [None]:
from sklearn.linear_model import ElasticNet

# Combined L1/L2 penalty; l1_ratio=0.005 makes it almost entirely L2.
elnet = ElasticNet(alpha=0.1, l1_ratio=0.005).fit(x_train, y_train)

evaluate(elnet)

# SVR

In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor; SVR needs a 1-D target, so flatten (n, 1) -> (n,).
rbf = SVR(kernel='rbf', C=7, gamma=0.002)
rbf.fit(x_train, y_train.ravel())

evaluate(rbf)

In [None]:
# Linear-kernel SVR, fitted on the flattened 1-D target.
lin = SVR(kernel='linear', C=5)
lin.fit(x_train, y_train.ravel())

evaluate(lin)

In [None]:
# Degree-5 polynomial-kernel SVR, fitted on the flattened 1-D target.
poly = SVR(kernel='poly', C=0.15, degree=5)
poly.fit(x_train, y_train.ravel())

evaluate(poly)

# With some Features

In [None]:
# Preview the numeric training frame before picking a feature subset.
train_df.head()

In [None]:
# Boolean heatmap: a cell is "on" when the absolute pairwise correlation exceeds 0.4.
plt.figure(figsize=(10, 7))
strong_corr = train_df.corr().abs() > 0.4
sns.heatmap(strong_corr, cmap="YlGnBu");

## SalePrice is highly correlated (absolute correlation > 0.4) with
('OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF',
 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea')

In [None]:
# Names of all columns currently in the numeric training frame.
train_df.columns

In [None]:
# Feature subset chosen from the correlation heatmap, followed by the target column.
columns = [
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
    '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea', 'SalePrice',
]
train_df[columns].head()

In [None]:
# Might use it for feature engineering: cube root of half the product of the
# remodel year and the build year, plotted against log(SalePrice).
year_feature = np.power(0.5 * train_df['YearRemodAdd'] * train_df['YearBuilt'], 1/3)
log_price = np.log(train_df['SalePrice'])
plt.scatter(year_feature, log_price);

In [None]:
# Engineered feature: cube root of half the product of the remodel and build years.
# Note this adds a column to train_df in place.
train_df['yearbetween'] = np.power(
    0.5 * train_df['YearRemodAdd'] * train_df['YearBuilt'],
    1/3,
)
# Same subset as before, now with the engineered column appended after the target.
columns = [
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
    '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea', 'SalePrice', 'yearbetween',
]
train_df[columns].head()

In [None]:
# Pairwise correlations between the engineered feature, its two source columns and the target.
train_df[['yearbetween','SalePrice','YearBuilt','YearRemodAdd']].corr()

In [None]:
# Standardize the selected feature subset with its own scaler and log-transform the target.
sc_new = SC()
selected_features = train_df[columns].drop(columns=['SalePrice'])
x_new = np.array(sc_new.fit_transform(selected_features))
y_new = np.log1p(train_df['SalePrice'])
x_new

In [None]:
# Same 85/15 split and seed as the all-features experiment.
xtrain_new, xtest_new, ytrain_new, ytest_new = train_test_split(
    x_new,
    y_new,
    test_size=0.15,
    random_state=42,
)

# Use RBF SVR as it had the best performance on all features

In [None]:
# RBF SVR trained on the selected-feature subset.
svr_new = SVR(kernel='rbf', C=15, gamma=0.002)
svr_new.fit(xtrain_new,ytrain_new)

# Report RMSE and R^2 on the test split first, then on the train split.
for label, features, target in (('Test data results', xtest_new, ytest_new),
                                ('Train data results', xtrain_new, ytrain_new)):
    # Predict once per split and reuse the result for both metrics.
    predicted = svr_new.predict(features)
    score = r2_score(target, predicted)
    # sqrt(MSE) replaces mse(..., squared=False): that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mse(target, predicted))
    print(label,rmse,score)

In [None]:
# Re-print the all-features RBF SVR scores for comparison with the subset model.
evaluate(rbf)

### Notes:
1. Perform better feature selection.
2. Add categorical features.
3. Try other models.

# Submit answers with svr-rbf trained on all data

In [None]:
# Final model: RBF SVR fitted on every training row (scaled features, log1p target).
# NOTE(review): C=8, gamma=0.1 differ from the RBF model evaluated earlier
# (C=7, gamma=0.002); confirm these values were validated before submitting.
svr_rbf = SVR(C=8, gamma=0.1, kernel='rbf')
svr_rbf.fit(x, y.reshape(-1))

In [None]:
# Read the competition's test split.
test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Fill numeric gaps with the medians of the matching training columns so the test
# rows are imputed the same way the training rows were. Filling with 0 (e.g. a
# GarageYrBlt of 0) lands far outside the range the scaler and model were fit on.
# train_df holds only numeric columns, so columns absent from it are left as-is;
# they are not used for the submission features.
test_df = test_df.fillna(train_df.median())
test_df.head()

In [None]:
# Use the same feature columns, in the same order, that sc_x was fitted on:
# everything in train_df except the target, the id and the engineered column.
feature_cols = train_df.columns.drop(['SalePrice', 'Id', 'yearbetween'])
x_sub = sc_x.transform(np.array(test_df[feature_cols]))
x_sub

In [None]:
# Predict in log space, then map back to prices.
h_sub = svr_rbf.predict(x_sub)
# The target was transformed with np.log1p, whose exact inverse is np.expm1;
# np.exp would leave every predicted price too high by 1.
h_sub = np.expm1(h_sub)
h_sub

In [None]:
# Submission frame: one row per test Id with its predicted SalePrice.
sub_df = pd.DataFrame({
    'Id': test_df['Id'].tolist(),
    'SalePrice': h_sub,
})
sub_df

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the CSV.
sub_df.to_csv('submission.csv',index=False)