Hobson, this here summarizes the issue I've been getting with the 'hyper_table'. You'll see that the output dictionaries, hyper_params0 and hyper_params1 contains all the necessary information when printed out immediately after each operation. However, when I writeout the final appended hyper_table, I find that each model's output is **overwritten** by the last run.

In [1]:
import numpy as np
import pandas as pd

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.preprocessing import StandardScaler

In [74]:
df = pd.read_csv('test_batch4_rs_1k.csv')

In [76]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [78]:
# list of 'Blocks' of features used in testing:

# Features excluded from model tests:
excluded_features = ['ID', 'Source', 'TMC', 'End_Lat', 'End_Lng', 'Street', 'City', 'Country','Description',
                     'Number','End_Time', 'County','Zipcode','Timezone', 'Airport_Code', 
                     'Civil_Twilight', 'Astronomical_Twilight', 'Nautical_Twilight', 'Weather_Timestamp',
                    'Start_Time']

# Basic numeric features:
basic_features = ['Severity','Start_Lat', 'Start_Lng', 'Distance(mi)', 'Precipitation_filled',
                 'Wind_Speed_filled_median', 'Humidity_filled_mean', 'Temperature_filled_mean',
                 'Visibility_filled_mean', 'Pressure_filled_mean', 'Wind_Chill_predicted_filled']


# Features replaced by *filled_mean or median:
replaced_features = ['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
                    'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
                    'Wind_Speed_filled_mean', 'Temperature_filled_median']

# Traffic points of interest:
features_all_points_of_interest_bool = ['Traffic_Signal', 'Junction', 'Amenity','Bump',
                                    'Crossing','Give_Way','No_Exit', 'Railway','Roundabout',
                                    'Stop','Station','Traffic_Calming','Turning_Loop']

# wasnull features:
features_weather_wasnull = ['Precipitation_wasnull','Wind_Chill_wasnull','Wind_Speed_wasnull',
                            'Visibility_wasnull','Humidity_wasnull','Temperature_wasnull',
                            'Pressure_wasnull']


In [81]:
# Split data into features and target data:
X = test_batch4_rs_1k.iloc[:,1:]
y = test_batch4_rs_1k.iloc[:,0]

In [82]:
hyper_table = []

In [83]:
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [84]:
hyper_params0 = {}

In [85]:
normalizer = [True, False]
alphas =[0.001, 0.01]
for norm in normalizer:
    for alpha in alphas:
        #run Lasso using True / False
        lasso = Lasso(alpha=alpha, normalize=norm, max_iter=100, selection='cyclic', 
                      fit_intercept=True, precompute=False, tol=0.0001)
        hyper_params0['model'] = lasso.__class__.__name__
        hyper_params0['alpha'] = lasso.alpha
        hyper_params0['normalize'] = lasso.normalize
        hyper_params0['fit_intercept'] = lasso.fit_intercept

        # fit
        lasso.fit(X_train,y_train)

        #predict
        lasso_pred = lasso.predict(X_test)

        #Score
        test_score = lasso.score(X_test,y_test)
        train_score = lasso.score(X_train, y_train)
        hyper_params0['test_score']= test_score
        hyper_params0['train_score']= train_score

        #RMSE
        test_rmse = sqrt(mean_squared_error(y_test, lasso_pred))
        hyper_params0['test_rmse'] = test_rmse

        hyper_table.append(hyper_params0)
        
        print(hyper_params0)
        

{'model': 'Lasso', 'alpha': 0.001, 'normalize': True, 'fit_intercept': True, 'test_score': 0.06335098931239613, 'train_score': 0.12404332653880057, 'test_rmse': 0.5221372476442108}
{'model': 'Lasso', 'alpha': 0.01, 'normalize': True, 'fit_intercept': True, 'test_score': -0.00011218413155478224, 'train_score': 0.0, 'test_rmse': 0.5395362079859805}
{'model': 'Lasso', 'alpha': 0.001, 'normalize': False, 'fit_intercept': True, 'test_score': 0.04787016080968198, 'train_score': 0.17979727473057894, 'test_rmse': 0.5264344769551053}
{'model': 'Lasso', 'alpha': 0.01, 'normalize': False, 'fit_intercept': True, 'test_score': 0.05847942014007612, 'train_score': 0.08852823476088688, 'test_rmse': 0.52349332066216}




hyper_params0, above has the expected information.

In [86]:
hyper_params1 = {}

In [87]:
# Scale/normalize the data:
scaler = StandardScaler()
scaler.fit(X_train)
Xn_train = scaler.transform(X_train)
Xn_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """


In [88]:

alphas =[0.001, 0.01]

for alpha in alphas:
    #run Lasso using True / False
    sgdregres = SGDRegressor(alpha=alpha, max_iter=10, loss='squared_loss',
                            fit_intercept=True, random_state=12, tol=0.001)
    hyper_params1['model'] = sgdregres.__class__.__name__
    hyper_params1['alpha'] = sgdregres.alpha
       
    hyper_params1['fit_intercept'] = sgdregres.fit_intercept

    # fit
    sgdregres.fit(Xn_train,y_train)

    #predict
    sgdregres_pred = lasso.predict(Xn_test)

    #Score
    test_score = sgdregres.score(Xn_test,y_test)
    train_score = sgdregres.score(Xn_train, y_train)
    hyper_params1['test_score']= test_score
    hyper_params1['train_score']= train_score
        
    #RMSE
    test_rmse = sqrt(mean_squared_error(y_test, sgdregres_pred))
    hyper_params1['test_rmse'] = test_rmse

    #print hyper_params1 dictionary
    print(hyper_params1)
    
    #append hyper_params1 to main hyper_table:
    hyper_table.append(hyper_params1)

{'model': 'SGDRegressor', 'alpha': 0.001, 'fit_intercept': True, 'test_score': -0.02922184366503533, 'train_score': 0.19840493336426968, 'test_rmse': 0.5397865955950206}
{'model': 'SGDRegressor', 'alpha': 0.01, 'fit_intercept': True, 'test_score': -0.026197459493702894, 'train_score': 0.1983894152809421, 'test_rmse': 0.5397865955950206}




hyper_params1 above has the expected information.

In [89]:
pd.DataFrame(hyper_table)


Unnamed: 0,alpha,fit_intercept,model,normalize,test_rmse,test_score,train_score
0,0.01,True,Lasso,False,0.523493,0.058479,0.088528
1,0.01,True,Lasso,False,0.523493,0.058479,0.088528
2,0.01,True,Lasso,False,0.523493,0.058479,0.088528
3,0.01,True,Lasso,False,0.523493,0.058479,0.088528
4,0.01,True,SGDRegressor,,0.539787,-0.026197,0.198389
5,0.01,True,SGDRegressor,,0.539787,-0.026197,0.198389


final hyper_table has the correct number of runs, but only the information of the last run in each algorithm is stored.