In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor,Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_validate
from sklearn.svm import SVR


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test CSVs into two DataFrames (train first, test second).
housing, housing_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

# Excluding Categorical Variables

In [None]:
# Names of every column in the training frame whose dtype is `object`.
categorical = [col for col in housing.columns if housing[col].dtype == object]
categorical

In [None]:
# Remove the object-dtype columns collected above, then print a summary of what is left.
housing = housing.drop(labels=categorical, axis=1)
housing.info()

# Checking for Missing Values

In [None]:
# Per-column missing-value summary for the training frame:
#   Total   - number of null entries
#   Percent - null entries divided by the number of rows (a fraction, not x100)
null_mask = housing.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

### The share of missing values is low in every column, so I'll simply fill the missing entries with each column's median.

In [None]:
# Replace nulls with the median of their own column, then confirm no nulls remain.
column_medians = housing.median()
housing = housing.fillna(value=column_medians)
housing.isna().sum()

# Correlations

In [None]:
# Pairwise correlation matrix of the remaining columns, drawn as an annotated heatmap.
corr = housing.corr()
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()


## Variables with High Correlations
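#### YearBuilt and GarageYrBlt = 0.78
#### TotalBsmtSF and 1stFlrSF = 0.83
#### GarageCars and GarageArea = 0.88
#### One variable from each pair (GarageYrBlt, 1stFlrSF, GarageArea) is dropped in the next cell, since each pair carries largely the same information.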

In [None]:
# Drop one column from each highly correlated pair, then use Id as the row index.
redundant_cols = ['GarageYrBlt', '1stFlrSF', 'GarageArea']
housing = housing.drop(columns=redundant_cols)
housing = housing.set_index('Id')
housing

In [None]:
# Correlation of every column with SalePrice, strongest positive first.
price_column = housing.corr()["SalePrice"]
corr_price = price_column.sort_values(ascending=False)
corr_price

# Checking for skewed features

In [None]:
# Columns whose sample skewness exceeds 0.5 (right-skewed only; the test is one-sided).
skewed_col = [col for col in housing.columns if housing[col].skew() > 0.5]


In [None]:
# Number of columns flagged as skewed (skew > 0.5).
len(skewed_col)

## Applying a log transformation (log1p) to reduce the skewness

In [None]:
# log1p (log(1 + v)) is applied element-wise to every skewed column in one vectorised call.
housing[skewed_col] = np.log1p(housing[skewed_col])

In [None]:
# Features are every column except the last one; the last column is the target.
target_pos = housing.shape[1] - 1
x = housing.iloc[:, :target_pos]
y = housing.iloc[:, target_pos]

In [None]:
# Checking the distribution of the dependent variable.
# sns.distplot is deprecated (since seaborn 0.11); a density-scaled histplot with
# a KDE overlay is the documented replacement and draws the same kind of picture.
sns.histplot(y, kde=True, stat='density')
plt.show()

# Standardizing the Data

In [None]:
# Fit the scaler on the training features, then replace x with its standardized version.
# `sc` is kept so the same fitted scaling can be applied to the test features later.
sc = StandardScaler().fit(x)
x = sc.transform(x)

# Linear Regression

In [None]:
# 10-fold cross-validation of ordinary least squares.
# The scorer is the negated RMSE, so values closer to zero are better.
lr = LinearRegression()
cv_lr = cross_validate(
    estimator=lr,
    X=x,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
cv_lr['test_score'].mean()

# SVR 

In [None]:
# 10-fold cross-validation of a linear-kernel support vector regressor (C = 1).
# The scorer is the negated RMSE, so values closer to zero are better.
svr = SVR(C=1, kernel='linear')
cv_svr = cross_validate(
    estimator=svr,
    X=x,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
cv_svr['test_score'].mean()

# SGD Regressor

In [None]:
# 10-fold cross-validation of a linear model trained by stochastic gradient descent.
# SGDRegressor shuffles the training data between epochs by default, so without a
# fixed seed the score differs on every run; random_state pins it for reproducibility.
# The scorer is the negated RMSE, so values closer to zero are better.
sgd = SGDRegressor(random_state=42)
cv_sgd = cross_validate(sgd, x, y, cv=10, scoring='neg_root_mean_squared_error')
cv_sgd['test_score'].mean()

# Ridge

In [None]:
# 10-fold cross-validation of ridge regression with its default settings.
# The scorer is the negated RMSE, so values closer to zero are better.
ridge = Ridge()
cv_r = cross_validate(
    estimator=ridge,
    X=x,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
cv_r['test_score'].mean()

# Lasso

In [None]:
# 10-fold cross-validation of lasso regression with alpha = 0.001.
# The scorer is the negated RMSE, so values closer to zero are better.
lasso = Lasso(alpha=0.001)
cv_l = cross_validate(
    estimator=lasso,
    X=x,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
cv_l['test_score'].mean()

# ElasticNet

In [None]:
# 10-fold cross-validation of elastic-net regression with alpha = 0.001.
# The scorer is the negated RMSE, so values closer to zero are better.
en = ElasticNet(alpha=0.001)
cv_en = cross_validate(
    estimator=en,
    X=x,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
cv_en['test_score'].mean()

# Test Data

In [None]:
# Drop from the test frame the same object-dtype columns that were removed from the training frame.
housing_test = housing_test.drop(labels=categorical, axis='columns')

In [None]:
# Mirror the training preprocessing: drop the three redundant columns, then index rows by Id.
redundant_cols_test = ['GarageYrBlt', '1stFlrSF', 'GarageArea']
housing_test = housing_test.drop(columns=redundant_cols_test)
housing_test = housing_test.set_index('Id')

In [None]:
# Per-column missing-value summary for the TEST frame:
#   Total   - number of null entries
#   Percent - null entries divided by the number of test rows (a fraction, not x100)
# The denominator must come from housing_test itself. Dividing by the training
# frame's counts uses the wrong row count and, through index alignment, adds a
# NaN row for any column that exists only in the training frame.
test_null_mask = housing_test.isnull()
total_test = test_null_mask.sum().sort_values(ascending=False)
percent_test = (test_null_mask.sum() / test_null_mask.count()).sort_values(ascending=False)
missing_data_test = pd.concat([total_test, percent_test], axis=1, keys=['Total', 'Percent'])
missing_data_test.head(10)

In [None]:
# Replace nulls in the test frame with the median of their own (test) column.
test_medians = housing_test.median()
housing_test = housing_test.fillna(value=test_medians)

In [None]:
# Test-frame columns whose sample skewness exceeds 0.5, displayed for inspection.
skewed_col_test = [col for col in housing_test.columns if housing_test[col].skew() > 0.5]
skewed_col_test

In [None]:
# Apply log1p to the same columns that were transformed in the training frame, so
# both frames go through identical preprocessing. The target is excluded by name
# rather than by position (`[:-1]`): positional slicing silently drops a real
# feature whenever SalePrice is not the last entry of skewed_col.
skewed_features = [col for col in skewed_col if col != 'SalePrice']
housing_test[skewed_features] = np.log1p(housing_test[skewed_features])

In [None]:
# Standardize the test features with the scaler already fitted on the training
# features (transform only, no re-fit).
# NOTE(review): by default StandardScaler.transform returns a NumPy array, so
# after this line housing_test presumably no longer carries the Id index.
housing_test = sc.transform(housing_test)

In [None]:
# Final model: linear-kernel SVR (C = 1) trained on the full scaled training set.
final = SVR(C=1, kernel='linear')
final.fit(X=x, y=y)

In [None]:
# Predict on the prepared test features, then map the predictions back with expm1.
# NOTE(review): expm1 inverts log1p, so this assumes the target was among the
# log-transformed columns - confirm against skewed_col.
raw_pred = final.predict(housing_test)
pred = np.expm1(raw_pred)
pred

In [None]:
# Build the submission table: one Id per test row paired with its predicted price.
# NOTE(review): Ids are regenerated as a consecutive range starting at 1461 -
# this assumes the test file's Ids are consecutive from 1461 in row order; confirm.
first_test_id = 1461
n_test_rows = len(housing_test)
submission = pd.DataFrame({
    'Id': range(first_test_id, first_test_id + n_test_rows),
    'SalePrice': pred,
})

In [None]:
# Write the submission to the working directory, without the DataFrame's row index.
submission.to_csv(path_or_buf='submission.csv', index=False)