In [224]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection as ms
import sklearn.feature_selection as fs
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [179]:
# Load the training and test CSV files; both paths are relative to the
# notebook's working directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_selected_features.csv',
        '../data/test_selected_features.csv',
    )
)

In [52]:
# Remember the dtype of the FireplaceQu column (the recorded output shows the
# object dtype); it serves as the reference dtype for picking columns to encode.
factor_type = data_train['FireplaceQu'].dtype
factor_type

dtype('O')

In [180]:
# Collect every training column whose dtype matches the reference dtype
# captured in `factor_type`; these are the columns to dummy-encode.
cols_to_dummy = [
    name for name in data_train.columns
    if data_train[name].dtype == factor_type
]

In [130]:
def dummify_frame(df, columns, one_hot=True):
    """Replace the given columns of ``df`` with their dummy-variable encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; the caller's frame is not modified.
    columns : list
        Labels of the columns to encode and then remove from the result.
    one_hot : bool, default True
        Forwarded to ``pd.get_dummies`` as ``drop_first``: when True, the first
        level of each encoded column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the dummy columns appended on the right and the
        source ``columns`` removed.
    """
    encoded = pd.get_dummies(df[columns], drop_first=one_hot)
    # Append first, then drop, so the untouched columns keep their order.
    widened = pd.concat([df, encoded], axis=1)
    return widened.drop(columns, axis=1)

In [181]:
# Swap the selected training columns for their dummy encoding. The result
# overwrites `data_train`, so re-running this cell fails: the listed columns
# no longer exist in the encoded frame.
data_train = dummify_frame(df=data_train, columns=cols_to_dummy)

In [182]:
# Apply the same encoding to the test frame, using the column list derived
# from the training data.
# NOTE(review): train and test are encoded independently, so their dummy
# columns can differ if a category level is missing from one file - confirm
# the two frames line up before predicting on `data_test`.
data_test = dummify_frame(df=data_test, columns=cols_to_dummy)

In [203]:
# Target: the SalePrice column as a NumPy array (2-D with one column here,
# because it is selected with a list of labels).
y = np.array(data_train[['SalePrice']])
# Predictors: all remaining columns except the target and the Id column.
x = data_train.drop(columns=['SalePrice', 'Id'])

In [206]:
# Flatten the target to a 1-D vector with one entry per row.
y = np.reshape(y, (len(y),))

In [144]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = ms.train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [150]:
# Candidate regularisation strengths: 1000 log-spaced values from 1e-2 to 1e4.
alphas_elastic = np.logspace(start=-2, stop=4, num=1000)
# Filled by the sweep loop: maps each alpha to the mean squared error recorded
# for it (despite the name, it does not hold model coefficients).
coef_elastic = dict()

In [154]:
# Sweep the alpha grid with a fixed l1_ratio of 0.5 and record, for each
# alpha, the mean squared error on the held-out split.
# NOTE(review): alpha is being chosen by its error on x_test/y_test, so that
# split no longer gives an unbiased estimate of generalisation error -
# consider selecting on a separate validation fold instead.
for i in alphas_elastic:
    elastic = ElasticNet(alpha=i, l1_ratio=0.5)
    elastic.fit(x_train, y_train)
    pred = elastic.predict(x_test)
    # Arguments follow the documented (y_true, y_pred) order.
    coef_elastic[i] = mean_squared_error(y_test, pred)

In [158]:
# Alpha value(s) whose recorded MSE equals the smallest one in the sweep.
# The minimum is computed once up front instead of once per dictionary entry;
# `default=None` keeps the result an empty list when nothing was recorded.
lowest_mse = min(coef_elastic.values(), default=None)
[key for key, value in coef_elastic.items() if value == lowest_mse]

[0.01]

In [242]:
# Refit at alpha=0.01 with l1_ratio=0.1 and score the model on the held-out
# split.
# NOTE(review): the `normalize` option was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell only runs on older releases. Moving to an
# explicit scaling step would change the fitted model and the parameter names
# used by the grid search that reuses `elastic`, so it is left as-is here.
elastic = ElasticNet().set_params(alpha=0.01, l1_ratio=0.1, normalize=True)
elastic.fit(x_train, y_train)
pred = elastic.predict(x_test)
# Arguments follow the documented (y_true, y_pred) order.
mean_squared_error(y_test, pred)

0.06424659387663208

In [235]:
# 5-fold cross-validated search over 1000 log-spaced alphas (1e-2 to 1e4),
# using the `elastic` estimator currently in scope as the template model.
# No `scoring` is passed, so the estimator's own `score` method ranks candidates.
grid_param = [{'alpha': np.logspace(-2, 4, 1000)}]

para_search = GridSearchCV(
    estimator=elastic,
    param_grid=grid_param,
    cv=5,
    return_train_score=True,
).fit(x_train, y_train)

In [243]:
# Show the best cross-validated score, then the parameters that produced it,
# one per line.
print(para_search.best_score_, para_search.best_params_, sep='\n')

0.22796135813270518
{'alpha': 0.01}


In [244]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [245]:
# Fit a 5-fold cross-validated Lasso on the full predictor set and wrap it in
# a selector that keeps features whose importance reaches the 0.25 threshold.
clf = LassoCV(cv=5)
sfm = SelectFromModel(estimator=clf, threshold=0.25).fit(x, y)
# Number of columns that survive the selection at the initial threshold.
n_features = sfm.transform(x).shape[1]



In [None]:
# Tighten the selection until at most 30 features remain.
# Raising the threshold can only remove features, so the loop must run while
# there are MORE than 30; a `<` test would never become false once entered.
# The threshold attribute is adjusted on the already-fitted selector, so the
# underlying model is not refitted on each step.
# X_transform is computed before the loop so it is defined even when the
# initial selection already satisfies the limit.
X_transform = sfm.transform(x)
n_features = X_transform.shape[1]
while n_features > 30:
    sfm.threshold += 0.1
    X_transform = sfm.transform(x)
    n_features = X_transform.shape[1]

