###Formalities

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#Housing prices data with few columns for convinience
import pandas as pd
dataset = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/housing_trim.csv")
dataset.dropna(inplace = True)
dataset.shape

(1201, 8)

##Data Manipulation

In [0]:
dataset.dtypes

MSZoning        object
LotFrontage    float64
LotArea          int64
HouseStyle      object
OverallQual      int64
OverallCond      int64
YearBuilt        int64
SalePrice        int64
dtype: object

In [0]:
#To convert into object (Categorical)
dataset['OverallQual'] = dataset['OverallQual'].astype(object)
dataset['OverallCond'] = dataset['OverallCond'].astype(object)


In [0]:
X = dataset.drop(['SalePrice'],axis=1)
Y = dataset['SalePrice']

In [0]:
#Updating date fields
X['Age'] = 2010 - X['YearBuilt']
X.drop(columns = ['YearBuilt'], inplace = True)

In [0]:
# # label encoding snippet 
# # label encoding will assign numerical values to each object value based on alphabetical order
# # eg. A will be encoded as 0, B - 1, C- 2
# # Helpful for High Cardinality Data - LightGBM can handle integer categorical fields

# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# le.fit(X['HouseStyle'])
# X['HouseStyle_LE'] = le.transform(X['HouseStyle'])

In [0]:
# # one shot one hot encoding ---- drop_first = False will do onehotencoding, true will do dummy
# pd.get_dummies(x['OverallCond'], prefix='OverallCond', drop_first=True)

In [0]:
X = pd.get_dummies(X, drop_first=True)

In [0]:
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
# y = np.log1p(y)

In [0]:
#Train test
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error


X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.20, random_state=0)

##Linear Regression

In [0]:
lm = LinearRegression()
lm.fit(X_train, Y_train)

# print intercept and coefficients
print('Intercept: ', lm.intercept_)
print('Coef: ', lm.coef_)

Intercept:  175535.5267632979
Coef:  [ 1.89056259e+02  1.22083694e+00 -4.98827868e+02  1.39429389e+04
  2.51118631e+04  3.16682577e+04  1.59117196e+04 -2.83715702e+04
 -1.24845163e+04  4.44851712e+04 -6.71037752e+03  5.29044086e+03
 -2.04842268e+04 -1.10352398e+04 -1.03405961e+05 -9.23140063e+04
 -7.88787273e+04 -6.55917791e+04 -4.88842705e+04 -1.03720011e+04
  5.86701870e+04  1.46347197e+05  1.94429361e+05 -1.86067537e+04
 -2.63377083e+03 -3.75634702e+03  9.40726748e+03  1.51424333e+04
  1.92504741e+04  1.70380598e+04]


In [0]:
from sklearn.metrics import mean_absolute_error
predictions = lm.predict(X_test)

mae = mean_absolute_error(Y_test, predictions)
print('MAE: ', mae)

MAE:  22891.21031341776


In [0]:
# R-squared
from sklearn.metrics import r2_score
r2_score(Y_train, lm.predict(X_train))

0.7534974381697815

In [0]:
import statsmodels.api as sm

x_statsmodel = sm.add_constant(X_train)
results = sm.OLS(Y_train, x_statsmodel).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.746
Method:                 Least Squares   F-statistic:                     98.03
Date:                Sat, 04 Jan 2020   Prob (F-statistic):          1.90e-259
Time:                        06:18:41   Log-Likelihood:                -11596.
No. Observations:                 960   AIC:                         2.325e+04
Df Residuals:                     930   BIC:                         2.340e+04
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               1.58e+05   2.79e+0

  return ptp(axis=axis, out=out, **kwargs)


##Lasso, Ridge and Elastic Net

In [0]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV

In [0]:
from sklearn import preprocessing

# Standardize data - center to the mean and component wise scale to unit variance.
# careful - Normalization usually means to scale a variable to have a values between 0 and 1, 
                # while standardization transforms data to have a mean of zero and a standard deviation of 1

# x_train_scaled = preprocessing.scale(X_train)
# x_test_scaled = preprocessing.scale(X_test)

# apply same scaling as of train to test - best practice
x_train_scaled = X_train
scaler = preprocessing.StandardScaler().fit(x_train_scaled[['LotFrontage','LotArea','Age']])
x_train_scaled[['LotFrontage','LotArea','Age']] = scaler.transform(x_train_scaled[['LotFrontage','LotArea','Age']])

x_test_scaled = X_test
x_test_scaled[['LotFrontage','LotArea','Age']] = scaler.transform(X_test[['LotFrontage','LotArea','Age']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

In [0]:
# normalization code
# pd.DataFrame(preprocessing.normalize(X_train, norm='l2'))

# min max scaling
# pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X_train)).describe()

In [0]:
from sklearn.metrics import r2_score
lassocv = LassoCV(cv=10)
lasso = lassocv.fit(x_train_scaled, Y_train)

print('R sq:', r2_score(Y_train, lasso.predict(x_train_scaled)))
print('Optimal Alpha:', lassocv.alpha_)
print(x_train_scaled.columns)
print(lasso.coef_)

R sq: 0.7528280293111718
Optimal Alpha: 45.70249120584148
Index(['LotFrontage', 'LotArea', 'Age', 'MSZoning_FV', 'MSZoning_RH',
       'MSZoning_RL', 'MSZoning_RM', 'HouseStyle_1.5Unf', 'HouseStyle_1Story',
       'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'HouseStyle_2Story',
       'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'OverallQual_2',
       'OverallQual_3', 'OverallQual_4', 'OverallQual_5', 'OverallQual_6',
       'OverallQual_7', 'OverallQual_8', 'OverallQual_9', 'OverallQual_10',
       'OverallCond_3', 'OverallCond_4', 'OverallCond_5', 'OverallCond_6',
       'OverallCond_7', 'OverallCond_8', 'OverallCond_9'],
      dtype='object')
[  4910.37690828  10471.52625506 -15904.45126997      0.
   6209.31742575  17760.35894034   1953.07239749 -21705.37565517
 -11201.23426992  39843.72188388     -0.           6684.18671106
 -16856.14035793  -8572.06405794 -40265.70007874 -40111.58888886
 -30182.6400423  -16359.30810229      0.          37881.27701881
 106639.89146834 193053.63520978 23

In [0]:
from sklearn.metrics import mean_absolute_error
predictions = lasso.predict(x_test_scaled)

mae = mean_absolute_error(Y_test, predictions)
print('MAE: ', mae)

MAE:  22535.783617292018


In [0]:
ridgecv = RidgeCV(cv=10)
ridge = ridgecv.fit(x_train_scaled, Y_train)

print('R sq:', r2_score(Y_train, ridge.predict(x_train_scaled)))
print('Optimal Alpha:', ridgecv.alpha_)
print(ridge.coef_)

R sq: 0.7534809213226819
Optimal Alpha: 0.1
[  4844.0186096   10463.32675402 -15930.73103068  13583.85101246
  24518.77850642  31302.72194601  15603.74783162 -27949.0790405
 -12583.21550042  44137.10264532  -6133.5617329    5269.28968976
 -20533.54775033 -11139.80366616 -98638.46734925 -91651.75848768
 -79054.19825727 -65907.67631001 -49279.25244662 -10936.17606002
  57988.1554553  145240.29188727 192239.08156817 -20103.5839691
  -3966.01742964  -4889.51498383   8247.2812797   13994.96732651
  18178.78484411  16248.56223979]


In [0]:
from sklearn.metrics import mean_absolute_error
predictions = ridge.predict(x_test_scaled)

mae = mean_absolute_error(Y_test, predictions)
print('MAE: ', mae)

MAE:  22858.333482957572


In [0]:
elasticcv = ElasticNetCV(cv=10, l1_ratio = [.1, .5, .7, .9, .95, .99, 1])
elastic = elasticcv.fit(x_train_scaled, Y_train)

print('R sq:', r2_score(Y_test, elastic.predict(x_test_scaled)))
print('Optimal Alpha:', elasticcv.alpha_)
print(elastic.coef_)

R sq: 0.8112591931265845
Optimal Alpha: 45.70249120584148
[  4910.37690828  10471.52625506 -15904.45126997      0.
   6209.31742575  17760.35894034   1953.07239749 -21705.37565517
 -11201.23426992  39843.72188388     -0.           6684.18671106
 -16856.14035793  -8572.06405794 -40265.70007874 -40111.58888886
 -30182.6400423  -16359.30810229      0.          37881.27701881
 106639.89146834 193053.63520978 239677.94621486 -26177.54243042
 -11837.21865889 -12422.93645001      0.           5584.779749
   9607.78539852   6253.12383998]


In [0]:
from sklearn.metrics import mean_absolute_error
predictions = elastic.predict(x_test_scaled)

mae = mean_absolute_error(Y_test, predictions)
print('MAE: ', mae)

MAE:  22535.783617292018


##kNN Regressor

##Decision Tree

##Random Forest

##XGBoost

##LightGBM

##CatBoost