In [None]:
import random
# Seed Python's built-in RNG so that any use of the `random` module is repeatable.
# Note: this does not seed NumPy's generator or any other library's own RNG.
random.seed(42)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy
# for sqrt 
import math
from feature_selector import FeatureSelector

from sklearn.metrics import mean_squared_error as mse
import copy


#load datasets
data = pd.read_csv('Ames_data.csv')

# Hold out every third row (positions 0, 3, 6, ...) as the test set; all the
# remaining rows form the training set. The positions are derived from the
# actual number of rows instead of a hard-coded count, and the training
# positions are sorted so the split never depends on set iteration order.
n_rows = len(data)
test_id = list(np.arange(0, n_rows, 3))
train_id = sorted(set(np.arange(0, n_rows)) - set(test_id))

train = data.iloc[train_id,:]
test = data.iloc[test_id,:]

#extract train-test target variable (natural log of the sale price)
train_target = pd.DataFrame(np.log(train['Sale_Price']))
test_target = pd.DataFrame(np.log(test['Sale_Price']))

#drop the target variable and the excluded predictors from train-test
# A single list is shared by both frames so the two splits cannot drift apart.
dropped_columns = ['Sale_Price','MS_Zoning','Street','Utilities','Land_Slope','Condition_2',
                   'Roof_Matl', 'Heating', 'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF',
                   'Three_season_porch', 'Pool_Area', 'Misc_Val', 'Longitude','Latitude',
                   'Alley','Bsmt_Cond','Central_Air','Electrical','Functional','Garage_Qual']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)



In [None]:
# Interaction features: every new column is the element-wise product of two
# existing columns. The definitions are listed once and applied, in the same
# order, first to the training frame and then to the test frame.
interaction_terms = [
    ('int_liv_lot_area', 'Gr_Liv_Area', 'Lot_Area'),
    ('int_liv_total_bsmt', 'Gr_Liv_Area', 'Total_Bsmt_SF'),
    ('int_live_garage_liv', 'Gr_Liv_Area', 'Garage_Yr_Blt'),
    ('int_first_area_garage', 'First_Flr_SF', 'Garage_Area'),
]

for split in (train, test):
    for new_column, left_column, right_column in interaction_terms:
        split[new_column] = split[left_column] * split[right_column]

In [None]:
#dummy coding process
# The list of object-dtype columns is taken from the training frame and the
# same list is then used to one-hot encode both frames.
is_object_column = (train.dtypes == 'object').values
categorical_features = list(train.columns[is_object_column])

train = pd.get_dummies(train, columns=categorical_features)
test = pd.get_dummies(test, columns=categorical_features)

#make sure train-test has same columns
# The inner join on the column axis keeps only the columns present in both frames.
train_features, test_features = train.align(test, join='inner', axis=1)

In [None]:
# Settings forwarded to FeatureSelector.identify_all, named once so they are
# easy to inspect and tune.
selection_params = {
    'missing_threshold': 0.8,
    'correlation_threshold': 0.8,
    'task': 'regression',
    'eval_metric': 'l2',
    'cumulative_importance': 0.95,
}

fs = FeatureSelector(data=train_features, labels=train_target)
fs.identify_all(selection_params=selection_params)


In [None]:
# list(train.columns)

In [None]:
# train['Lot_Area'].head(3)

In [None]:
# Plot the feature importances held by the selector, passing a threshold of 0.9.
# NOTE(review): identify_all was called with cumulative_importance=0.95 —
# confirm the two values are meant to differ.
fs.plot_feature_importances(threshold = 0.9)


In [None]:
# train_features = fs.remove('all')
# train_features,test_features =train_features.align(test_features,join = 'inner',axis=1)

In [None]:
# Display the (rows, columns) shape of the column-aligned test feature frame.
test_features.shape

In [None]:
def rmse(true,predicted):
    """Return the root mean squared error between ``true`` and ``predicted``.

    The value is the square root of ``mse(true, predicted)``, where ``mse`` is
    the mean-squared-error function imported at the top of the notebook.
    """
    mean_squared = mse(true, predicted)
    return math.sqrt(mean_squared)

In [None]:
#impute the data
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
try:
    # Current scikit-learn ships the imputer as sklearn.impute.SimpleImputer;
    # the old sklearn.preprocessing.Imputer class has been removed. The alias
    # keeps the name ``Imputer`` available to the rest of the notebook.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Fallback for old scikit-learn releases that only provide the legacy class.
    from sklearn.preprocessing import Imputer

# Median imputation: the medians are learned from the training features only
# and then applied to both splits.
im = Imputer(strategy = 'median')
im.fit(train_features)

train_features_np = im.transform(train_features)
test_features_np = im.transform(test_features)

# Sanity check: print the positions of any values that are still NaN or infinite.
print(np.where(~np.isfinite(train_features_np)))
print(np.where(~np.isfinite(test_features_np)))

#scale the data
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(train_features_np)
# Apply transform to both the training set and the test set.
train_features_np = scaler.transform(train_features_np)
test_features_np = scaler.transform(test_features_np)

# poly = PolynomialFeatures(2)
# poly.fit(train_features_np)
# # Apply transform to both the training set and the test set.
# train_features_np = poly.transform(train_features_np)
# test_features_np = poly.transform(test_features_np)


In [None]:

# Turn the training target into a single-column 2-D array before scaling.
train_target = np.array(train_target).reshape(-1, 1)

# Standardise the target. The scaler is fitted on the training target only and
# kept in ``scalery`` so predictions can be mapped back to the original scale.
scalery = StandardScaler().fit(train_target)
y_compute = scalery.transform(train_target)

In [None]:
# Flatten the scaled target to a 1-D vector, the form passed to mylasso below.
y_compute = np.array(y_compute).reshape((-1, ))

In [None]:
# Display the shape of the flattened target vector.
y_compute.shape

In [None]:
def one_step_lasso(r, x, lam):
    """Solve the one-dimensional lasso problem for a single coefficient.

    Finds the scalar ``b`` minimising ``sum((r - x * b) ** 2) + lam * abs(b)``
    by soft-thresholding: ``b = sign(x.r) * max(|x.r| - lam / 2, 0) / (x.x)``.

    Parameters
    ----------
    r : 1-D array
        Current partial residual vector.
    x : 1-D array
        One column of the design matrix, same length as ``r``.
    lam : float
        Lasso penalty weight.

    Returns
    -------
    float
        The updated coefficient. It is 0.0 when the penalty outweighs the
        correlation between ``x`` and ``r``, and also when ``x`` is entirely
        zero (such a column cannot explain any of the residual).
    """
    xx = np.sum(np.square(x))
    xr = np.dot(r, x)

    # An all-zero column would otherwise cause a division by zero below.
    if xx == 0:
        return 0.0

    # Amount by which |x.r| exceeds the threshold; nothing survives if it is
    # not strictly positive (this also maps a NaN correlation to 0).
    excess = np.abs(xr) - lam / 2
    if not excess > 0:
        return 0.0

    b = excess / xx
    return float(np.sign(xr) * b)

In [None]:
def sign(arg):
    """Return 1 if ``arg`` is positive, 0 if ``arg == 0`` and -1 in every other case."""
    if arg > 0:
        return 1
    return 0 if arg == 0 else -1
    
def ifelse(arg,a,b):
    """Return ``a`` when ``arg == True`` holds, otherwise return ``b``.

    The test is an equality comparison with ``True``, not a truthiness check,
    so a value such as ``2`` selects ``b``.
    """
    return a if arg == True else b
    

def mylasso(X,y,lam,n_iter = 50):
    """
    Fit lasso coefficients by cyclic coordinate descent.

    X: n-by-p design matrix, make sure it's scaled and centered
    y: response vector of length n (a 1-D array or an n-by-1 column), make
       sure it's centered
    lam: lambda value (penalty weight passed to one_step_lasso)
    n_iter: number of full passes over the p coordinates

    Returns a length-p array of coefficients. No intercept is fitted, and
    neither X nor y is modified.
    """
    # Work only with the arguments: the design matrix is X, never a notebook global.
    X = np.asarray(X, dtype=float)
    n_features = X.shape[1]

    b = np.zeros(n_features)
    # Private 1-D copy of the response; it holds the running residual y - X @ b.
    r = np.array(y, dtype=float).ravel()

    for _ in range(n_iter):
        for j in range(n_features):
            x_j = X[:, j]

            # add coordinate j's current contribution back to get the partial residual
            r = r + x_j * b[j]

            # apply one step lasso (once per coordinate), then remove the new contribution
            b[j] = one_step_lasso(r, x_j, lam)
            r = r - x_j * b[j]
    return b

In [None]:
# Fit the lasso on the standardised training features and target with lambda = 17.
coefficient = mylasso(train_features_np,y_compute,17)

In [None]:
# Predictions for the test set: a linear combination of the scaled test
# features with the fitted coefficients (no intercept term is added).
predicited = np.dot(test_features_np,coefficient)

In [None]:
# Undo the target standardisation before scoring against the log sale prices.
# inverse_transform is given a single-column 2-D array, because current
# scikit-learn releases reject 1-D input, and the result is flattened again.
predicted_log_price = scalery.inverse_transform(np.asarray(predicited).reshape(-1, 1)).ravel()
# Arguments follow rmse's (true, predicted) signature.
rmse(np.array(test_target).reshape((-1, )), predicted_log_price)

In [None]:
# 100 candidate lambda values, logarithmically spaced from 1e-10 to 1e2.
lambda_list = np.logspace(-10,2,100)

In [None]:
# Display the shape of the lambda grid.
lambda_list.shape

In [None]:
# Empty results table: one row per candidate lambda will hold its RMSE.
lambda_optimal = pd.DataFrame(columns=['lambda','RMSE'])

In [None]:
# Fill the 'lambda' column with the candidate grid.
lambda_optimal['lambda'] = lambda_list

In [None]:
# Grid search: fit the lasso for every candidate lambda and record the RMSE of
# its test-set predictions on the log-price scale.
# NOTE(review): lambda is selected on the same test set that is later used to
# report the error, so the reported RMSE is optimistic; a validation split or
# cross-validation on the training data would avoid this.
test_target_log = np.array(test_target).reshape((-1, ))
rmse_values = []
for lam in lambda_list:
    coefficient = mylasso(train_features_np,y_compute,lam)
    predicited = np.dot(test_features_np,coefficient)
    # inverse_transform needs a single-column 2-D array on current scikit-learn.
    predicted_log_price = scalery.inverse_transform(predicited.reshape(-1, 1)).ravel()
    result = rmse(test_target_log, predicted_log_price)
    rmse_values.append(result)

# Assign the whole column in one step. Chained assignment such as
# lambda_optimal['RMSE'][i] = ... may write to a temporary copy and leave the
# frame unchanged.
lambda_optimal['RMSE'] = rmse_values

In [None]:
# Rank the candidate lambdas from the lowest to the highest RMSE.
random_results = lambda_optimal.sort_values(by='RMSE')


In [None]:
# Show the three candidates with the lowest RMSE.
random_results.head(3)

In [None]:
# Refit with the lambda that achieved the lowest RMSE in the grid search.
# It is read from the sorted results instead of being typed in by hand; the
# previous literal 0.869749 equals lambda_list[82] rounded to six decimals.
best_lambda = float(random_results['lambda'].iloc[0])
coefficient1 = mylasso(train_features_np,y_compute,best_lambda)
predicited1 = test_features_np.dot(coefficient1)
# inverse_transform needs a single-column 2-D array on current scikit-learn;
# the result is flattened and scored with rmse's (true, predicted) order.
predicted_log_price1 = scalery.inverse_transform(predicited1.reshape(-1, 1)).ravel()
result1 = rmse(np.array(test_target).reshape((-1, )), predicted_log_price1)

In [None]:
# Display the predictions, which are still on the standardised target scale.
# NOTE(review): this prints the whole array; a slice would be easier to read.
test_features_np.dot(coefficient1)

In [None]:
# Display the shape of the scaled test feature matrix.
test_features_np.shape

In [None]:
# Display the shape of the fitted coefficient vector.
coefficient1.shape

In [None]:
# NOTE(review): this repeats the expression from an earlier cell and could be removed.
test_features_np.dot(coefficient1)

In [2]:
# Scratch check: exp(11), i.e. the value whose natural log is 11.
# presumably used to relate the log-scale target to a sale price — confirm or remove.
np.exp(11)

59874.14171519782