In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.float_format', '{:.3f}'.format)  # render floats with 3 decimal places

import warnings


def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Retained only so that code still referring to this name keeps working;
    it is no longer installed in place of ``warnings.warn``.
    """
    return None


# Silence noisy library warnings (sklearn, seaborn) through the supported filter
# API. Overwriting ``warnings.warn`` itself changes the stdlib function for every
# imported module and cannot be undone with ``warnings.resetwarnings()``.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's training and test tables from the input folder.
train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '../input/house-prices-advanced-regression-techniques/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# List the column names available in the training frame.
train.columns

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Summary statistics of the target column.
train['SalePrice'].describe()

In [None]:
# Distribution plot of the raw target, to eyeball its shape.
sale_price = train['SalePrice']
sns.distplot(sale_price)
plt.show()

As we can see, our target variable, SalePrice, is somewhat skewed to the right. Let's measure exactly how large the skewness is.

In [None]:
# Quantify the asymmetry (skewness) and tail weight (kurtosis) of the target.
target = train['SalePrice']
print("Skewness: %f" % target.skew())
print("Kurtosis: %f" % target.kurt())

- Values for asymmetry (skewness) and kurtosis between -2 and +2 are considered acceptable in order to prove a normal univariate distribution. A more lenient rule of thumb considers data to be normal if skewness is between -2 and +2 and kurtosis is between -7 and +7.

Now let's see how SalePrice correlates with the most important features.

###### Outliers

In [None]:
# SalePrice against total basement area, to look for outliers.
sns.scatterplot(data=train, x='TotalBsmtSF', y='SalePrice')
plt.show()

In [None]:
# SalePrice against above-ground living area, to look for outliers.
sns.scatterplot(data=train, x='GrLivArea', y='SalePrice')
plt.show()

It's obvious that these two features are strongly correlated with the sale price, but there are some outliers: one in TotalBsmtSF at around 6000, and some in GrLivArea between roughly 4000 and 5000. It would be better to remove them.

In [None]:
# Remove the outliers visible in the scatterplots: very large houses that sold
# for comparatively little.
# Both rules now share one price ceiling. The basement rule originally compared
# against 30000, an order of magnitude below the living-area rule.
# NOTE(review): treated as a typo for 300000 - confirm.
PRICE_CEILING = 300000
oversized_basement = (train['TotalBsmtSF'] > 5000) & (train['SalePrice'] < PRICE_CEILING)
oversized_living = (train['GrLivArea'] > 4000) & (train['SalePrice'] < PRICE_CEILING)
# A single combined mask makes the cell safe to re-run: already-removed rows
# simply no longer match.
train = train.drop(train[oversized_basement | oversized_living].index)

Now let's check again.

In [None]:
# Same basement-area scatterplot, redrawn after the outlier rows were dropped.
sns.scatterplot(data=train, x='TotalBsmtSF', y='SalePrice')
plt.show()

In [None]:
# Same living-area scatterplot, redrawn after the outlier rows were dropped.
sns.scatterplot(data=train, x='GrLivArea', y='SalePrice')
plt.show()

In [None]:
# SalePrice against construction year.
sns.scatterplot(data=train, x='YearBuilt', y='SalePrice')
plt.show()

Although this plot doesn't tell us very much, it suggests that newer houses tend to have higher prices.

###### Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns.
# The frame still contains text columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so select the numeric columns
# explicitly (works on old and new pandas alike).
correlation = train.select_dtypes(include=[np.number]).corr()
fig, axes = plt.subplots(figsize=(15, 12))
# vmax caps the colour scale so moderately strong correlations stay visible.
sns.heatmap(correlation, vmax=.8, ax=axes);

OK, some columns are highly correlated with our target value, so we will make a zoomed-in heatmap for more detail.

In [None]:
k = 10  # number of variables for heatmap
# Columns with the k largest correlations to SalePrice.
cols = correlation['SalePrice'].nlargest(k).index
# Correlation matrix restricted to those columns (columns are the variables).
cm = np.corrcoef(train[cols].values, rowvar=False)
sns.set(font_scale=1.2)
hm = sns.heatmap(
    cm,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    square=True,
    cbar=True,
    xticklabels=cols.values,
    yticklabels=cols.values,
)
plt.show()

Now let's draw some scatterplots between these features.

In [None]:
# Pairwise scatterplots for the target and a hand-picked set of strongly related features.
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was deprecated and later removed from seaborn.
sns.pairplot(train[cols], height = 2.5)

### Missing values and Feature Engineering

In [None]:
# Total number of missing cells across the whole training frame.
train.isnull().sum().sum()

In [None]:
# pivot table for the missing values
# Per-column missing-value summary, largest first.
# 'Percentage' holds the fraction of rows that are missing (0-1), not a value out of 100.
null_mask = train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percentage = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percentage], axis=1, keys=['Total', 'Percentage'])
missing_data.head(20)

OK, there are a lot of missing values, so it is better to remove every column that has more than one missing value: most of these features are categorical, so it would be meaningless to replace their missing values with the mean. I will keep Electrical, because it has just one missing value (that single row is dropped instead) and it has a relatively high correlation.

In [None]:
# Drop every column that has more than one missing value.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated and then removed in pandas 2.0.
cols_to_drop = missing_data[missing_data['Total'] > 1].index
train = train.drop(cols_to_drop, axis=1)
# Electrical is kept as a column; only the rows where it is missing are removed.
train = train.drop(train.loc[train['Electrical'].isnull()].index)

In [None]:
# Re-count the missing cells after the drops.
train.isnull().sum().sum()

In [None]:
from scipy.stats import norm
from scipy import stats

In [None]:
# Distribution of SalePrice with a fitted normal curve overlaid (fit=norm) for comparison.
sns.distplot(train['SalePrice'], fit=norm);

In [None]:
# Probability plot of SalePrice against a normal distribution.
fig = plt.figure()
prob = stats.probplot(x=train['SalePrice'], dist='norm', plot=plt)
plt.show()

The SalePrice column is right-skewed, and linear models work best with normally distributed data, so we will transform it to be closer to a normal distribution.

In [None]:
# Replace SalePrice with log(1 + SalePrice) to reduce the right skew, then re-plot
# the distribution against a fitted normal curve.
# NOTE: the column is overwritten in place, so re-running this cell applies the
# transform again; anything fitted on this target afterwards is in log1p space.
train['SalePrice'] = np.log1p(train['SalePrice'])
sns.distplot(train['SalePrice'], fit=norm);

In [None]:
# Probability plot of the (now log-transformed) SalePrice against a normal distribution.
fig = plt.figure()
prob = stats.probplot(x=train['SalePrice'], dist='norm', plot=plt)
plt.show()

In [None]:
# One-hot encode the categorical columns of the training frame.
# NOTE(review): only `train` is encoded here; `test` would need the same set of
# columns before it can be passed to a model fitted on this frame.
train = pd.get_dummies(train)

In [None]:
# Display the encoded training frame.
train

## Machine Learning Models

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
from math import sqrt

In [None]:
scaler = StandardScaler()  # NOTE(review): created but not used in the visible cells
# Feature matrix: everything except the target. 'Id' is also excluded because a
# row identifier carries no information about the price and should not be
# fitted as a feature (errors='ignore' keeps this working if 'Id' is absent).
X = train.drop('SalePrice', axis=1).drop('Id', axis=1, errors='ignore')
# Kept as a one-column DataFrame, so downstream predictions are 2-D.
y = train[['SalePrice']]
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

### 1- Normal Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training split.
l_r = LinearRegression()
l_r.fit(X_train, y_train)
y_train_pred = l_r.predict(X_train)
y_test_pred = l_r.predict(X_test)
# Negated MSE scorer (sklearn maximises scores); the sign is flipped back below.
scorer = make_scorer(mean_squared_error, greater_is_better = False)
# 10-fold cross-validated RMSE on the training split.
rmse_train = np.sqrt(-cross_val_score(l_r, X_train, y_train, scoring = scorer, cv=10))
# Held-out RMSE of the model fitted above. The previous code cross-validated on
# the test split, which refits fresh models on test rows and never evaluates
# the model that was actually trained.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print ('Mean RMSE for training set is',rmse_train.mean())
print ('RMSE for the test set is',rmse_test)

In [None]:
# Residuals (predicted minus actual) against the predicted values for both splits.
for predicted, actual, colour, tag in (
    (y_train_pred, y_train, "blue", "Training data"),
    (y_test_pred, y_test, "lightgreen", "Validation data"),
):
    plt.scatter(predicted, predicted - actual, c=colour, marker="s", label=tag)
plt.title("Linear regresion")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")  # zero-error reference line
plt.show()

In [None]:
# Actual values against predicted values for both splits.
for predicted, actual, colour, tag in (
    (y_train_pred, y_train, "blue", "Training data"),
    (y_test_pred, y_test, "lightgreen", "Validation data"),
):
    plt.scatter(predicted, actual, c=colour, marker="s", label=tag)
plt.title("Linear regresion")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c="red")  # y = x line: perfect predictions
plt.show()

Root Mean Square Error (RMSE) is the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are; RMSE is a measure of how spread out these residuals are. In other words, it tells you how concentrated the data is around the line of best fit.

### 2- Ridge Regression

In [None]:
# Ridge regression; RidgeCV picks the regularisation strength from this grid.
regr_cv = RidgeCV(alphas=[0.1, 1.0, 8 ,9 ,10.0 ,11 ,12 ,15, 20, 25, 30, 35, 40, 50])
model_cv = regr_cv.fit(X_train, y_train)
print ('Best Alpha is', model_cv.alpha_)
y_train_pred = model_cv.predict(X_train)
y_test_pred = model_cv.predict(X_test)
# 10-fold cross-validated RMSE on the training split. The built-in scoring name
# is used so this cell does not rely on a `scorer` object from an earlier cell.
rmse_train = np.sqrt(-cross_val_score(model_cv, X_train, y_train, scoring = 'neg_mean_squared_error', cv=10))
# Held-out RMSE of the model fitted above. The previous code cross-validated on
# the test split, which refits fresh models on test rows and never evaluates
# the model that was actually trained.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print ('Mean RMSE for training set is',rmse_train.mean())
print ('RMSE for the test set is',rmse_test)

In [None]:
# Residuals (predicted minus actual) against the predicted values for both splits.
plt.scatter(y_train_pred, y_train_pred - y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_test_pred, y_test_pred - y_test, c = "lightgreen", marker = "s", label = "Validation data")
# This cell sits in the Ridge section; the title was copied from the
# linear-regression plot and mislabelled the figure.
plt.title("Ridge regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")  # zero-error reference line
plt.show()

In [None]:
# Actual values against predicted values for both splits.
plt.scatter(y_train_pred, y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_test_pred, y_test, c = "lightgreen", marker = "s", label = "Validation data")
# This cell sits in the Ridge section; the title was copied from the
# linear-regression plot and mislabelled the figure.
plt.title("Ridge regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")  # y = x line: perfect predictions
plt.show()

### Predicting on the test set and submitting

In [None]:
# Display the test frame (still in its raw, unprocessed form).
test

In [None]:
# Submission code, left disabled.
# NOTE(review): before enabling it, `test` needs the same preprocessing as
# `train` (the same dropped columns and the same dummy columns), and the
# predictions need np.expm1 because the model was fitted on log1p(SalePrice).
# test = pd.get_dummies(test)
# test.head()
# predicted_prices = model_cv.predict(test)
# my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# my_submission.to_csv('submission.csv', index=False)