## 1.1 Loading Data

In [1]:
import numpy as np
import math as math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.figure_factory import create_distplot
import statsmodels.formula.api as smf
from statsmodels.graphics.gofplots import ProbPlot

%matplotlib inline
#plotly.offline.init_notebook_mode(connected=True)

In [2]:
housingdf = pd.read_csv('./Datasets/train.csv')
housingdf = housingdf.drop(['Id'], axis = 1)

In [3]:
testdf = pd.read_csv('./Datasets/test.csv')
testdf = testdf.drop(['Id'], axis = 1)

In [4]:
print(housingdf.shape)
print(testdf.shape)

(1460, 80)
(1459, 79)


In [5]:
housingdf.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
5,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,307000
7,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,129900
9,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,118000


In [6]:
testdf.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
5,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,4,2010,WD,Normal
6,20,RL,,7980,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,5,2010,WD,Normal
8,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2010,WD,Normal
9,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [7]:
totaldf = pd.concat([housingdf, testdf], axis = 0, ignore_index = False)


In [8]:
#making a copy
totalhousing = totaldf.copy()

In [9]:
#drop the response variable
housingtrainy = housingdf[['SalePrice']]
totalhousing = totalhousing.drop(['SalePrice'], axis = 1)

In [10]:
totalhousing.shape

(2919, 79)

In [11]:
plt.style.use('seaborn')
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
plt.subplot(1, 2, 1)
sns.distplot(housingdf['SalePrice'])


NameError: name 'matplotlib' is not defined

In [None]:
hist_data = [np.array(np.log1p(housingdf['SalePrice']))]
group_labels = ['Log Sales Price Distribution']
fig = ff.create_distplot(hist_data, group_labels, bin_size=.1)
py.offline.iplot(fig, filename='Basic Distplot')

## 2.1 Imputation & Visualization

In [None]:
import missingno as msno
%matplotlib inline

In [None]:
#overall missing data visualization
msno.matrix(totaldf, color=(0.3, 0.1, 0.7), labels = True)

In [None]:
missingcolumns = totalhousing.isnull().sum().sort_values(ascending = False)
missingpercentage = (totalhousing.isnull().sum()/len(totaldf)).sort_values(ascending = False)
missingdf = pd.DataFrame({'missing_column': missingcolumns.index.tolist(), 'missing_numbers': missingcolumns.values.tolist(), 'missing_percentage': missingpercentage.values.tolist()})
missingdf = missingdf[missingdf['missing_numbers']>0]
missingdf



In [None]:
#missing data with barplots

missing_counts = go.Bar(x=missingdf['missing_column'].tolist(),
                  y=missingdf['missing_numbers'].tolist(),
                  marker=dict(color='rgb(166,206,227)'))


data = [missing_counts]

layout = go.Layout(title="Missing Data Counts",
                xaxis=dict(title='Missing Columns',tickangle=30),
                yaxis=dict(title='Counts'))

fig = go.Figure(data=data, layout=layout)

py.offline.iplot(fig)





In [None]:
#missing data with barplots

missing_percentage = go.Bar(x=missingdf['missing_column'].tolist(),
                  y=missingdf['missing_percentage'].tolist(),
                  marker=dict(color='rgb(166,206,227)'))


data = [missing_percentage]

layout = go.Layout(title="Missing Data Percentage",
                xaxis=dict(title='Missing Columns',tickangle=30),
                yaxis=dict(title='Missing Percentage'))

fig = go.Figure(data=data, layout=layout)

py.offline.iplot(fig)




In [None]:
#correlation between missing data
msno.heatmap(totaldf, cmap = 'PuRd')

In [None]:
totaldf.columns

In [None]:
totalmissingdf = totaldf[missingdf['missing_column'].tolist()]

## 2.2 Garage Year Imputation

In [None]:
# totalhousing.GarageYrBlt = np.where(totalhousing.GarageYrBlt != totalhousing.GarageYrBlt, totalhousing.YearBuilt, totalhousing.GarageYrBlt)
# Example
# a = np.array([2,2,0,6,8])
# b = np.array([np.nan,3,np.nan,4,5])
# c = np.array([0,0,0,0,0])
# d = np.array([1,1,1,1,1])
# np.where(np.isnan(b), a, b)



In [None]:
totalhousing['HasGarage'] = np.where(np.isnan(totalhousing['GarageYrBlt']), 0, 1)



In [None]:
#some of garage year built are earlier than year built, causing negative numbers, should come back and impute
totalhousing['GarageYearDiff'] = totalhousing['GarageYrBlt'] - totalhousing['YearBuilt']
# totalhousing['GarageYearDiff'] = [0 if math.isnan(i) else i for i in totalhousing['GarageYearDiff']]
totalhousing['GarageYearDiff'] = np.where(np.isnan(totalhousing['GarageYearDiff']), 0, totalhousing['GarageYearDiff'])
totalhousing['GarageYearDiff'] = np.where(totalhousing['GarageYearDiff'] < 0, 0, totalhousing['GarageYearDiff'])
totalhousing['GarageYearDiff'] = np.where(totalhousing['GarageYearDiff'] == 201, 1, totalhousing['GarageYearDiff'])
pd.unique(totalhousing['GarageYearDiff'])



In [None]:
#Visualize garage year imputation
plt.scatter(totalhousing['YearBuilt'], totalhousing['GarageYearDiff'])
plt.xlabel('Year Built')
plt.ylabel('Garage Year Built Difference')
matplotlib.rcParams['figure.figsize'] = (10, 10)



## 2.3 Year Remodeling Imputation

In [None]:
#create a column to check if the remodeled year is equal to year built
totalhousing['IsRemod'] = np.where(totalhousing['YearRemodAdd'] == totalhousing['YearBuilt'], 0, 1)


In [None]:
#create a column to store the year difference between every remodelling year and year built
totalhousing['RemodYearDiff'] = totalhousing['YearRemodAdd'] - totalhousing['YearBuilt']
totalhousing['RemodYearDiff'] = np.where(totalhousing['RemodYearDiff'] < 0, 0, totalhousing['RemodYearDiff'])
pd.unique(totalhousing['RemodYearDiff'])

In [None]:
#Visualize year remodeling imputation
plt.scatter(totalhousing['YearBuilt'], totalhousing['RemodYearDiff'])
plt.xlabel('Year Built')
plt.ylabel('Remodelling Year Difference')
matplotlib.rcParams['figure.figsize'] = (10, 10)

## 2.4 Ordinal Variable Imputation

In [None]:
ordinal1 = ["FireplaceQu", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "Fence", 'PoolQC']

In [None]:
totalhousing[ordinal1] = totalhousing[ordinal1] = totalhousing[ordinal1].replace(np.nan, 'None')

In [None]:
ordinal2 = ["LotShape","LandContour", "LandSlope", "BldgType", "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", 
           "BsmtFinType2", "HeatingQC", "CentralAir", "KitchenQual", "Functional"]


In [None]:
for i in ordinal2:
    totalhousing[i] = totalhousing[i].fillna(totalhousing[i].mode()[0])

In [None]:
totalhousing[ordinal2].isnull().sum()

## 2.5 Nominal Variable Imputation

In [None]:
nominal1 = ['MiscFeature', 'Alley', 'GarageType', 'MasVnrType']


In [None]:
totalhousing[nominal] = totalhousing[nominal].replace(np.nan, 'None')

In [None]:
nominal2 = ['MSZoning', 'Utilities', 'SaleType', 'Exterior1st', 'Exterior2nd', 'Electrical']

In [None]:
for i in nominal2:
    totalhousing[i] = totalhousing[i].fillna(totalhousing[i].mode()[0])

In [None]:
totalhousing[nominal2].isnull().sum()

## 2.6 Continuous Variable Imputation

In [None]:
continuous1 = ['LotFrontage', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1', 'TotalBsmtSF', 'GarageArea', 'GarageCars'] 

In [None]:
totalhousing[continuous] = totalhousing[continuous].fillna(totalhousing[continuous].median())

## 2.7 Zero Variance and Near-Zero Variance Imputation

In [None]:
zerovariance = ['Street', 'Utilities', 'Condition2', 'RoofMatl', 'LowQualFinSF', '3SsnPorch', 'PoolArea', 'PoolQC', 'MiscVal']


In [None]:
totalhousing.columns

In [None]:
# drop all zero variance and near-zero variance columns
totalhousing = totalhousing.drop(zerovariance, axis = 1)


In [None]:
#drop yearremodadd and garageyrblt
totalhousing = totalhousing.drop(['YearRemodAdd', 'GarageYrBlt'], axis = 1)

In [None]:
totalhousing.shape

In [None]:
totalhousing.isnull().sum().sort_values(ascending = False)

## 2.1 Kaggle Train Test Split

In [None]:
housingtrainx = totalhousing.iloc[0:1460, :]

In [None]:
housingtrain = pd.concat([housingtrainx, housingtrainy], axis = 1)

In [None]:
housingtest = totalhousing.iloc[1460:2919, :].copy()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
housingtrain.to_csv('./Datasets/train_Wenchang.csv')

In [None]:
housingtest.to_csv('./Datasets/test_Wenchang.csv')

## 2.2 Statistical Analysis & Visualization

In [None]:
continuous2 = ["LotFrontage", "LotArea", "YearBuilt", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
               "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "ScreenPorch", 'MoSold', 'YrSold', 'SalePrice']


In [None]:
#create a correlation matrix with all numeric columns: Pearson

sns.set(style="ticks")

# take the numerical columns
data1 = pd.DataFrame(data= housingtrain[continuous2],
                 columns = housingtrain[continuous2].columns.tolist())

# Compute the correlation matrix
corr1 = data1.corr(method = 'pearson')

# Generate a mask for the upper triangle
mask = np.zeros_like(corr1, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(50, 100, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr1, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})




In [None]:
#top10 correlated continuous variables
correlationchart = corr1['SalePrice'].sort_values(ascending = False)
correlationchart.plot.barh()



In [None]:
# ordinal2 = ["LotShape", "LandContour", "LandSlope", "BldgType", 
#             "OverallQual", "OverallCond", "ExterQual", "ExterCond", 
#            "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", 
#            "BsmtFinType2", "HeatingQC", "CentralAir", "KitchenQual", 
#            "Functional", "FireplaceQu", "GarageFinish", "GarageQual", 
#            "GarageCond", "PavedDrive", "Fence"]

In [None]:
# ordinaldf = housingtrain[ordinal2].copy()


In [None]:
# #Ordinal Variables Correlation Plot: Kendall
# sns.set(style="ticks")

# # take the numerical columns
# data2 = pd.DataFrame(data= housingtrain[ordinal2],
#                  columns=housingtrain[ordinal2].columns.tolist())

# # Compute the correlation matrix
# corr2 = data2.corr(method = 'kendall')

# # Generate a mask for the upper triangle
# mask = np.zeros_like(corr2, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(11, 9))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(75, 10, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr2, mask=mask, cmap=cmap, vmax=1, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})



In [None]:
# corr = data.corr(method = 'kendall')
# corr

In [None]:
# #Correlation between Ordinal Variables and Sales Prices: Spearman
# sns.set(style="ticks")

# # take the numerical columns
# data = pd.DataFrame(data= numericdf,
#                  columns=numericdf.columns.tolist())

# # Compute the correlation matrix
# corr = data.corr(method = 'spearman')

# # Generate a mask for the upper triangle
# mask = np.zeros_like(corr, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(11, 9))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(100, 125, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})



In [None]:
housingtrain.describe()

In [None]:
#Boxplot with Median Price and Neighborhoods



N = len(pd.unique(housingtrain.Neighborhood))     # Number of boxes

# generate an array of rainbow colors by fixing the saturation and lightness of the HSL representation of colour 
# and marching around the hue. 

c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

#loading the data with filters
data = [{
    'y': housingtrain.SalePrice[housingtrain.Neighborhood == j],
    'name': j,
    'type':'box',
    'marker':{'color': c[i]}
    } for i, j in enumerate(pd.unique(housingtrain.Neighborhood))]


# format the layout
layout = go.Layout(title="Housing Sale Price by Neighborhoods",
                xaxis=dict(title='Neighborhood'),
                yaxis=dict(title='Housing Sale Price'))


fig = go.Figure(data=data, layout=layout)

py.offline.iplot(fig)


In [None]:
#Boxplot with Price and Overall Quality

N = len(pd.unique(housingtrain.OverallQual))     # Number of boxes

# generate an array of rainbow colors by fixing the saturation and lightness of the HSL representation of colour 
# and marching around the hue. 

c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

#loading the data with filters
data = [{
    'y': housingtrain.SalePrice[housingtrain.OverallQual == j],
    'name': j,
    'type':'box',
    'marker':{'color': c[i]}
    } for i, j in enumerate(pd.unique(housingtrain.OverallQual))]


# format the layout
layout = go.Layout(title="Housing Sale Price by Overall Quality",
                xaxis=dict(title='Overall Quality'),
                yaxis=dict(title='Housing Sale Price'))


fig = go.Figure(data=data, layout=layout)

py.offline.iplot(fig)


In [None]:
from datetime import datetime
saletime = housingtrain.copy()
saletime['Saletime'] = saletime['YrSold'].map(str) + '/' + saletime['MoSold'].map(str) 
saletime['Saletime'] = pd.to_datetime(saletime['Saletime'], format = '%Y/%m')
saletimedf = pd.concat([saletime['Saletime'], saletime['SalePrice']], axis = 1)
saletimedf = saletimedf.sort_values(by = 'Saletime')
saletimedf.Saletime = saletimedf.Saletime.map(lambda x: x.strftime('%Y/%m'))
pd.unique(saletimedf.Saletime)

In [None]:
#Boxplot with Price and Time

N = len(pd.unique(saletimedf.Saletime))     # Number of boxes
enumeratelist = pd.unique(saletimedf.Saletime)


# generate an array of rainbow colors by fixing the saturation and lightness of the HSL representation of colour 
# and marching around the hue. 

c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

#loading the data with filters
data = [{
    'y': saletimedf.SalePrice[saletimedf.Saletime == j],
    'name': j,
    'type':'box',
    'marker':{'color': c[i]}
    } for i, j in enumerate(enumeratelist)]


# format the layout
layout = go.Layout(title="Housing Sale Price by Year",
                xaxis=dict(title='Time'),
                yaxis=dict(title='Housing Sale Price'))


fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)




In [None]:
# #Median Price with Time
# saletimegroup = saletimedf.groupby('Saletime').median()
# saletimegroup.sort(by = 'Saletime')
# sorted(pd.unique(saletime.dt.date))

## 2.3 Feature Engineering



In [None]:
housingtrain.HalfBath = np.where(housingtrain.HalfBath == 0, np.median(housingtrain.HalfBath), housingtrain.HalfBath)
housingtrain.FullBath = np.where(housingtrain.FullBath == 0, np.median(housingtrain.FullBath), housingtrain.FullBath)
housingtrain.BedroomAbvGr = np.where(housingtrain.BedroomAbvGr == 0, np.median(housingtrain.BedroomAbvGr), housingtrain.BedroomAbvGr)



In [None]:
# bathroomratio = bath/room
housingtrain['Bath_Capacitance'] = (housingtrain['FullBath']+housingtrain['HalfBath']) /housingtrain['BedroomAbvGr']


In [None]:
#garageroomratio = garage/room
housingtrain['Parking_Capacitance']=housingtrain.GarageCars/housingtrain.BedroomAbvGr


In [None]:
housingtrain.columns

In [None]:
#landscape = lotshape * land contour
# housingtrain['landscape']


In [None]:
#saletype score



In [None]:
#neighborhood score



## Preprocessing

## 3.1 Encoding

In [None]:
#we create a private train set and a private test set within the kaggle train set in order to
#find best regularization parameter only based on private train set. 
privateset = housingtrain.copy()

In [None]:
privateset = pd.get_dummies(privateset, drop_first=True, dummy_na=True)

privateset.head(10)


## 3.2 Private Train Test Split

In [None]:
import sklearn.model_selection as ms

x_train, x_test, y_train, y_test = ms.train_test_split(privateset.loc[:, privateset.columns != 'SalePrice'], privateset['SalePrice'], 
                                                       test_size=1/5, random_state=0)

print('Original: {}, {}'.format(privateset.shape, privateset.shape))
print('Training: {}, {}'.format(x_train.shape, y_train.shape))
print('Test: {}, {}'.format(x_test.shape, y_test.shape))




## 3.2 Normalizing and Scaling

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline

In [None]:
# x_train = preprocessing.RobustScaler().fit(x_train)
# x_test = preprocessing.RobustScaler().fit(x_test)
# x_train = preprocessing.RobustScaler(x_train)
# x_test = preprocessing.RobustScaler(x_test)

In [None]:
#log transform the sale prices
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [None]:
scaler = preprocessing.StandardScaler()

In [None]:
#standardize and normalize the features
# x_train = pd.DataFrame(data = scaler.fit_transform(x_train), columns = x_train.columns)
# x_test = pd.DataFrame(data = scaler.fit_transform(x_test), columns = x_test.columns)

In [None]:
from scipy.stats import norm, skew
numeric_feats = house_train_test_log.dtypes[house_train_test_log.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = house_train_test_log[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
skewness = skewness[abs(skewness) > 0.75]
skewed_features = skewness.index
house_train_test_log[skewed_features] = np.log1p(house_train_test_log[skewed_features]



In [None]:
# kaggleset = privateset.copy()

In [None]:
# public_x = kaggleset.loc[:, kaggleset.columns != 'SalePrice']
# public_y = kaggleset.SalePrice

In [None]:
# public_x = pd.DataFrame(data = scaler.fit_transform(public_x), columns = public_train.columns)
# public_y = np.log1p(public_y)

## Modelling

## 4.1 Regularized Linear Regression: Ridge 

In [None]:
def rmse_cv(model):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(x_train)
    rmse= np.sqrt(-cross_val_score(model, x_train, y_train, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

def rmse(residuals):
    return np.sqrt(np.sum(residuals**2)/(len(residuals)))

In [None]:
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV
ridge = linear_model.Ridge()

In [None]:
# alpha_100 = np.logspace(-2, 5, 100)
# ridge_grid_search = GridSearchCV(estimator = ridge, param_grid = {'alpha': alpha_100}, cv = 5, verbose = 2, scoring = 'neg_mean_squared_error')
# ridge_fit = ridge_grid_search.fit(x_train, y_train)
# best_param = ridge_grid_search.best_params_
# score = np.sqrt(-ridge_grid_search.best_score_)
# ran = ridge_grid_search.best_estimator_
# error = rmse(ran.predict(x_test)-y_test)

In [None]:
alpha_100 = np.logspace(-2, 5, 100)
coef = []

for i in alpha_100:
    ridge.set_params(alpha = i)
    ridge.fit(x_train, y_train)
    coef.append(ridge.coef_)

columns = x_train.columns
df_coef = pd.DataFrame(coef, index=alpha_100, columns=columns)
title = 'Ridge coefficients as a function of the regularization'
df_coef.plot(logx=True, title=title, legend = False)

plt.xlabel('alpha')
plt.ylabel('coefficients')
plt.show()

In [None]:
cv_Ridge = [rmse_cv(Ridge(alpha = Alpha)).mean() for Alpha in alpha_100]

In [None]:
cv_Ridge = pd.Series(cv_Ridge, index = alpha_100)

fig, ax = plt.subplots(figsize=(12,12));
ax.plot(alpha_100,cv_Ridge);
ax.set_xlabel('alpha')
#ax.set_ylabel('RMSE')
alpha0 = cv_Ridge[cv_Ridge == cv_Ridge.min()].index[0];
rmse0 = cv_Ridge.min();
s = 'Minumum RMSE {:.4f} \nachieved at alpha = {:.4f}'.format(rmse0,alpha0)
ax.annotate(s, xy=(alpha0, rmse0),
               xycoords='data',
               xytext=(alpha_100.mean(), cv_Ridge.mean()),
               #textcoords='offset points',
               bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3),
               arrowprops=dict(arrowstyle='->',color='red'))

print('*'*50)
print('Ridge CV:')
print(s)
print('*'*50)

In [None]:
# ridgemodel = make_pipeline(preprocessing.StandardScaler(), Ridge(alpha =0.0005, random_state=1))

In [None]:
# score = rmse_cv(ridgemodel)
# print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
model_Ridge = Ridge(alpha0).fit(x_train, y_train);

coeffs_Ridge = pd.Series(model_Ridge.coef_, index = public_x.columns);
vip_coeffs_Ridge = pd.concat([coeffs_Ridge.sort_values().head(10),
                              coeffs_Ridge.sort_values().tail(10)])

fig, ax = plt.subplots(figsize=(8,8));
vip_coeffs_Ridge.plot(kind = "barh");
plt.title("VIP Coefficients (Ridge)");

In [None]:
predictions_Ridge = pd.DataFrame({"Predicted":model_Ridge.predict(x_test), 
                                  "Actual":y_test});
predictions_Ridge["Residual"] = predictions_Ridge.Actual - predictions_Ridge.Predicted;


print('*'*50)
print('Ridge Performace: R^2 = {:.4f}'.format(model_Ridge.score(x_test, y_test)))
print('*'*50)
print('Lasso Performace: RMSE = {:.4f}'.format(rmse(predictions_Ridge.Residual)))
print('*'*50)


fig, ax = plt.subplots(figsize=(8,8));
sns.regplot(predictions_Ridge.Actual, predictions_Ridge.Predicted);
#ax.set_title('')

## 4.2 Regularized Linear Regression: Lasso

In [None]:
from sklearn.linear_model import Lasso
lasso = Lasso()

In [None]:
alpha_100 = np.logspace(-4, 1, 100)
coef_lasso = []


for i in alpha_100:
    lasso.set_params(alpha = i)
    lasso.fit(x_train, y_train)
    coef_lasso.append(lasso.coef_)


columns = x_train.columns
df_coef = pd.DataFrame(coef_lasso, index=alpha_100, columns=columns)
title = 'Lasso coefficients as a function of the regularization'
df_coef.plot(logx=True, title=title, legend = False)
plt.xlabel('alpha')
plt.ylabel('coefficients')
plt.show()



In [None]:
cv_Lasso = [rmse_cv(linear_model.Lasso(alpha = Alpha)).mean() for Alpha in alpha_100]

In [None]:
cv_Lasso = pd.Series(cv_Lasso, index = alpha_100)

fig, ax = plt.subplots(figsize=(8,8));
ax.plot(alpha_100,cv_Lasso);

alpha0 = cv_Lasso[cv_Lasso == cv_Lasso.min()].index[0];
rmse0 = cv_Lasso.min();
s = 'Minumum RMSE {:.4f} \nachieved at alpha = {}'.format(rmse0,alpha0)
ax.annotate(s, xy=(alpha0, rmse0),
               xycoords='data',
               xytext=(alpha_100.mean()/2, cv_Lasso.mean()/2),
               #textcoords='offset points',
               bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3),
               arrowprops=dict(arrowstyle='->',color='red'))


print('*'*50)
print('Lasso CV:')
print(s)
print('*'*50)

In [None]:
# ridgemodel = make_pipeline(preprocessing.StandardScaler(), Ridge(alpha =0.0005, random_state=1))

In [None]:
# score = rmse_cv(ridgemodel)
# print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
model_Lasso = Lasso(alpha0).fit(x_train, y_train);


coeffs_Lasso = pd.Series(model_Lasso.coef_, index = public_x.columns);
vip_coeffs_Lasso = pd.concat([coeffs_Lasso.sort_values().head(10),
                              coeffs_Lasso.sort_values().tail(10)])

fig, ax = plt.subplots(figsize=(8,8));
vip_coeffs_Lasso.plot(kind = "barh");
plt.title("VIP Coefficients (Lasso)");

In [None]:
predictions_Lasso = pd.DataFrame({"Predicted":model_Lasso.predict(x_test), 
                                  "Actual":y_test});
predictions_Lasso["Residual"] = predictions_Lasso.Actual - predictions_Lasso.Predicted;


print('*'*50)
print('Lasso Performace: R^2 = {:.4f}'.format(model_Lasso.score(x_test, y_test)))
print('*'*50)
print('Lasso Performace: RMSE = {:.4f}'.format(rmse(predictions_Lasso.Residual)))
print('*'*50)


fig, ax = plt.subplots(figsize=(8,8));
sns.regplot(predictions_Lasso.Actual, predictions_Lasso.Predicted);
#ax.set_title('')

## 4.3 Regularized Linear Modeling: Elastic Net

In [None]:
from sklearn import linear_model
elastic = linear_model.ElasticNet(alpha = 1)

In [None]:
alpha_100 = np.logspace(-2, 5, 100)
coef = []

for i in alpha_100:
    ridge.set_params(alpha = i)
    elastic.fit(x_train, y_train)
    coef.append(elastic.coef_)

columns = x_train.columns
df_coef = pd.DataFrame(coef, index=alpha_100, columns=columns)
title = 'Ridge coefficients as a function of the regularization'
df_coef.plot(logx=True, title=title, legend = False)

plt.xlabel('alpha')
plt.ylabel('coefficients')
plt.show()

In [None]:
cv_Ridge = [rmse_cv(Ridge(alpha = Alpha)).mean() for Alpha in alpha_100]

In [None]:
cv_Ridge = pd.Series(cv_Ridge, index = alpha_100)

fig, ax = plt.subplots(figsize=(12,12));
ax.plot(alpha_100,cv_Ridge);
ax.set_xlabel('alpha')
#ax.set_ylabel('RMSE')
alpha0 = cv_Ridge[cv_Ridge == cv_Ridge.min()].index[0];
rmse0 = cv_Ridge.min();
s = 'Minumum RMSE {:.4f} \nachieved at alpha = {:.4f}'.format(rmse0,alpha0)
ax.annotate(s, xy=(alpha0, rmse0),
               xycoords='data',
               xytext=(alpha_100.mean(), cv_Ridge.mean()),
               #textcoords='offset points',
               bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3),
               arrowprops=dict(arrowstyle='->',color='red'))

print('*'*50)
print('Ridge CV:')
print(s)
print('*'*50)

In [None]:
# ridgemodel = make_pipeline(preprocessing.StandardScaler(), Ridge(alpha =0.0005, random_state=1))

In [None]:
# score = rmse_cv(ridgemodel)
# print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
model_Ridge = Ridge(alpha0).fit(public_x, public_y);

coeffs_Ridge = pd.Series(model_Ridge.coef_, index = public_x.columns);
vip_coeffs_Ridge = pd.concat([coeffs_Ridge.sort_values().head(10),
                              coeffs_Ridge.sort_values().tail(10)])

fig, ax = plt.subplots(figsize=(8,8));
vip_coeffs_Ridge.plot(kind = "barh");
plt.title("VIP Coefficients (Ridge)");

In [None]:
predictions_Ridge = pd.DataFrame({"Predicted":model_Ridge.predict(public_x), 
                                  "Actual":public_y});
predictions_Ridge["Residual"] = predictions_Ridge.Actual - predictions_Ridge.Predicted;


print('*'*50)
print('Ridge Performace: R^2 = {:.4f}'.format(model_Ridge.score(public_x, public_y)))
print('*'*50)
print('Lasso Performace: RMSE = {:.4f}'.format(rmse(predictions_Ridge.Residual)))
print('*'*50)


fig, ax = plt.subplots(figsize=(8,8));
sns.regplot(predictions_Ridge.Actual, predictions_Ridge.Predicted);
#ax.set_title('')

## “Why did you choose your methodology?”

## “How was this implemented?”

## “Why does this work?”

## “How did you verify results?”

## “Why is this method superior to what is used in the industry?”