In [4]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from itertools import combinations
from scipy import linalg as la
from statsmodels.regression.linear_model import OLS
from scipy.optimize import minimize

In [2]:
df = pd.read_csv("wages.csv")
df.head()

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,northcen,...,trcommpu,trade,services,profserv,profocc,clerocc,servocc,lwage,expersq,tenursq
0,3.1,11,2,0,0,1,0,2,1,0,...,0,0,0,0,0,0,0,1.131402,4,0
1,3.24,12,22,2,0,1,1,3,1,0,...,0,0,1,0,0,0,1,1.175573,484,4
2,3.0,11,2,0,0,0,0,2,0,0,...,0,1,0,0,0,0,0,1.098612,4,0
3,6.0,8,44,28,0,0,1,0,1,0,...,0,0,0,0,0,1,0,1.791759,1936,784
4,5.3,12,7,2,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1.667707,49,4


In [3]:
def ridge(lmbda, X, Y):
    """Perform an OLS regression with L^2 regularization 
    (the ridge method)
    
    Parameters:
        lmbda (float): regularization parameter
        X ((n,d) ndarray): feature data
        Y ((n,1) ndarray): dependent var. data
    
    Returns:
        Bhat ((d,1) ndarray): minimizer of the regularized regression
    """
    U, S, Vh = la.svd(X, full_matrices=False)
    
    for i, s in enumerate(S):
        if s < 1e-5:
            S[i] = 0
        else:
            S[i] = 1/s
    
    return Vh.T @ np.diag(S) @ U.T @ Y

In [8]:
def myfunc(beta,x,y,lam):
    return np.linalg.norm(y-x@beta)**2 + lam*np.linalg.norm(beta)**2
def solution(x,y,lam):
    return minimize(myfunc,ridge(lam,x,y),args=(x,y,lam))['x']

X = df[['educ','tenure','female','nonwhite','numdep']]
Y = df['wage']
print(ridge(7,np.array(X),np.array(Y)))
solution(np.array(X),np.array(Y),7000)

[ 0.47588912  0.15785767 -1.92333271 -0.25481063  0.09551804]


array([ 0.37679127,  0.16958451, -0.02410083,  0.00056444,  0.02483014])

In [9]:
df['female*married'] = df['female']*df['married']
df['intercept']=1

In [11]:
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite', 'intercept']]
Y = df["wage"]

for k in range(-5, 6):
    print("k =",k)
    lam = 10**k
    #(i)
    Bhat1 = solution(np.array(X), np.array(Y), lam)
    model = sm.OLS(Y, X)
    Bhat2 = OLS.fit_regularized(model, lam, L1_wt=0).params
    Bhat3 = linear_model.Ridge(lam, fit_intercept=False).fit(X,Y).coef_
    print("Bhat1:", Bhat1)
    print("Bhat2:", Bhat2)
    print("Bhat3:", Bhat3)

k = -5
Bhat1: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat2: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat3: [-0.34413616  0.56308895  0.02091294  0.12976129  1.7355772  -2.35781434
  0.08909384 -0.21423432 -2.56638269]
k = -4
Bhat1: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat2: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat3: [-0.34414422  0.56308805  0.02091285  0.12976135  1.73556986 -2.35780296
  0.08909341 -0.2142345  -2.5663638 ]
k = -3
Bhat1: [-0.34422472  0.56307891  0.02091189  0.12976205  1.73549677 -2.35768949
  0.08908894 -0.2142365  -2.56617238]
Bhat2: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat3: [-0.34422474  0.56307907  0.02091

My solution is identical to the statsmodel package, but overtime diverges from ours. Ours then looks close to the scikitlearn solution

In [12]:
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite']]
Y = df["wage"]
print("""
Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']
""")
print("""
Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']
""")
for k in range(-5, 6):
    print("k =", k)
    lam = 10**k
    model = sm.OLS(Y, X)
    B1 = OLS.fit_regularized(model,alpha=lam, L1_wt=1).params
    print(B1[B1!=0].index)


Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']


Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']

k = -5
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -4
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -3
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -2
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -1
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married'], dtype='object')
k = 0
Index(['educ', 'exper', 'tenure'], dtype='object')
k = 1
Index(['educ', 'exper'], dtype='object')
k = 2
Index(['exper'], dtype='object')
k = 3
Index([], dtype='object')
k = 4
Index([], dtype='obje

In [13]:
print("""
Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']
""")
print("""
Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']
""")
features = np.array(['female','educ','exper','tenure','married','female*married','numdep','nonwhite'])
for k in range(-5, 6):
    print("k =", k)
    lmbda = 10**k
    model = sm.OLS(Y, X)
    Beta = linear_model.Lasso(lmbda).fit(X,Y).coef_
    print(features[Beta!=0])


Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']


Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']

k = -5
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -4
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -3
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -2
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -1
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep']
k = 0
['educ' 'exper' 'tenure']
k = 1
[]
k = 2
[]
k = 3
[]
k = 4
[]
k = 5
[]


In [15]:
Y = df["wage"]
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite']]
model = linear_model.LinearRegression()
print(np.mean(cross_val_score(model, X,Y,cv=7)))

0.34555318816486785


In [16]:
X = df[['educ','tenure','married','female*married']]
model = linear_model.LinearRegression()
print(np.mean(cross_val_score(model, X,Y,cv=7)))

0.3536916310757054


In [17]:
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite']]
for k in range(-5, 6):
    print("k =", k)
    lmbda = 10**k
    model1 = linear_model.Ridge(lmbda)
    model2 = linear_model.Lasso(lmbda)
    print("Ridge:", np.mean(cross_val_score(model1, X,Y,cv=7)))
    print("Lasso:", np.mean(cross_val_score(model2, X,Y,cv=7)))

k = -5
Ridge: 0.34555319365442216
Lasso: 0.3455542285146271
k = -4
Ridge: 0.3455532430594165
Lasso: 0.3455635511259626
k = -3
Ridge: 0.345553737010782
Lasso: 0.3456537479432414
k = -2
Ridge: 0.3455586666782383
Lasso: 0.34632375845219227
k = -1
Ridge: 0.34560699001806483
Lasso: 0.3324959767115911
k = 0
Ridge: 0.346003143144572
Lasso: 0.2618491822364122
k = 1
Ridge: 0.34622037502546477
Lasso: -0.03370662447048863
k = 2
Ridge: 0.3318565137245385
Lasso: -0.03370662447048863
k = 3
Ridge: 0.28345247109438054
Lasso: -0.03370662447048863
k = 4
Ridge: 0.1631456855278
Lasso: -0.03370662447048863
k = 5
Ridge: 0.022365325829704186
Lasso: -0.03370662447048863


In [44]:
df = pd.read_csv("allRealEstateData.csv",sep='\t')
df = df.drop('Unnamed: 0',axis=1)
df=df.dropna(how='any')
df.describe()

Unnamed: 0,Bedrooms,Bathrooms,YearBuilt,DaysOnMarket,Zipcode,SoldPrice,ElementarySchoolRating
count,29781.0,29781.0,29781.0,29781.0,29781.0,29781.0,29781.0
mean,3.456768,12.950069,1985.543098,431.430274,82200.715691,438181.7,5.414257
std,0.999052,23.263666,27.386535,388.885778,2778.781899,401411.4,2.002721
min,1.0,1.0,1868.0,2.0,58645.0,3000.0,1.0
25%,3.0,2.0,1972.0,141.0,80221.0,235000.0,4.0
50%,3.0,3.0,1994.0,285.0,80550.0,355000.0,5.0
75%,4.0,5.0,2005.0,601.0,85173.0,510000.0,7.0
max,20.0,75.0,2018.0,2086.0,89415.0,7500000.0,10.0


In [45]:
Y = df["SoldPrice"]
X = df[['Bedrooms','Bathrooms','YearBuilt','DaysOnMarket','Zipcode','ElementarySchoolRating']]
model = linear_model.LinearRegression()
print(np.mean(cross_val_score(model, X,Y,cv=7)))

0.2267713884062917


In [46]:
X = df[['YearBuilt','DaysOnMarket','Zipcode','ElementarySchoolRating']]
model = linear_model.LinearRegression()
print(np.mean(cross_val_score(model, X,Y,cv=7)))

0.13679804853618724


In [47]:
X = df[['Bedrooms','Bathrooms','YearBuilt','DaysOnMarket','Zipcode','ElementarySchoolRating']]
for k in range(-5, 6):
    print("k =", k)
    lmbda = 10**k
    model1 = linear_model.Ridge(lmbda)
    model2 = linear_model.Lasso(lmbda)
    print("Ridge:", np.mean(cross_val_score(model1, X,Y,cv=7)))
    print("Lasso:", np.mean(cross_val_score(model2, X,Y,cv=7)))

k = -5
Ridge: 0.22677138842834574
Lasso: 0.22677138841092367
k = -4
Ridge: 0.22677138862683513
Lasso: 0.22677138845244565
k = -3
Ridge: 0.22677139061172635
Lasso: 0.2267713888671254
k = -2
Ridge: 0.2267714104606094
Lasso: 0.2267713930248624
k = -1
Ridge: 0.22677160894640563
Lasso: 0.22677143460441504
k = 0
Ridge: 0.2267735935009336
Lasso: 0.22677185038312037
k = 1
Ridge: 0.2267934087259465
Lasso: 0.2267760214725582
k = 2
Ridge: 0.22698855197569792
Lasso: 0.22681740072597104
k = 3
Ridge: 0.22866070700142535
Lasso: 0.22722305889207378
k = 4
Ridge: 0.23034508418716187
Lasso: 0.23036890542266544
k = 5
Ridge: 0.16656424082268892
Lasso: 0.17077464942331222
