In [1]:
from __future__ import print_function
from time import time
from itertools import combinations
from model_scoring_func import gen_terms_key, gen_X
from datahandling import access_db
from gen_model_inputs import get_all_lin_model_inp
from pandas import DataFrame
from tinydb import Query
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from numpy import mean
# Shared TinyDB query builder; the cells below use it to build search conditions.
Q = Query()

from tinydb import TinyDB

In [2]:
# Open the database that holds the raw measurement records (queried in the next cell).
# NOTE(review): the meaning of the arguments (0, True) is defined by
# datahandling.access_db and is not visible here - confirm before changing them.
sv_db = access_db(0, True)

In [3]:
# Fetch every record that carries both an equipment name and a data type.
records = sv_db.search(Q.equipment_name.exists() & Q.data_type.exists())
measurements = DataFrame(records)

# Label each record as '<equipment_name> <data_type>'.
measurements['name'] = measurements['equipment_name'] + ' ' + measurements['data_type']

# One row per sample, one column per label. pivot_table aggregates with the
# mean by default, so repeated measurements of a sample are averaged.
measurements = measurements.pivot_table(values='value',
                                        index='sample_number',
                                        columns='name')

# Columns that are dropped before modelling.
excluded_columns = [
    u'tensile E_t_MPa_mean',
    u'tensile epsilon_break_%_mean',
    u'tensile epsilon_max_%_mean',
    u'tensile sigma_break_MPa_mean',
    u'tensile sigma_max_MPa_mean',
    u'thermomat int_of_abs_err',
    u'ConeCal C-factor',
]
measurements = measurements.drop(excluded_columns, axis=1)

# Rescale each column linearly onto [-1, 1]: shift the column minimum to 0,
# divide by the resulting maximum, then stretch and recentre.
shifted = measurements - measurements.min()
Ys = shifted / shifted.max() * 2 - 1

In [4]:
# Inputs for the linear models; handed to gen_X below whenever X is built for a model.
# NOTE(review): the structure of this object is defined by
# gen_model_inputs.get_all_lin_model_inp and is not visible here - confirm.
all_full_input = get_all_lin_model_inp()
# Linear regression without an intercept term; the same estimator object is
# reused for every cross-validated fit.
model_obj = LinearRegression(fit_intercept=False)

In [5]:
# Column labels of Ys ('<equipment_name> <data_type>'); each one is modelled separately.
all_names = Ys.columns

In [6]:
# Open the database that receives the best model per response and the
# 'n_terms_done' progress markers written by the search loop.
# NOTE(review): the meaning of the arguments (2, True) is defined by
# datahandling.access_db and is not visible here - confirm before changing them.
db = access_db(2, True)

In [7]:
# Exhaustive best-subset search.
#
# For every model size (1..N_TERMS terms) each admissible combination of terms
# is scored by 3-fold shuffle-split cross-validation against every column of
# Ys. The best-scoring model per (equipment_name, data_type, n_terms) is stored
# in `db`, followed by an {'n_terms_done': n} marker so that a finished model
# size is skipped when the cell is run again.
#
# The best score per column is tracked in memory and written to `db` once per
# model size, instead of querying/updating the database for every model and
# column. The final database contents are the same as with per-model updates;
# the only difference is that an interrupted model size leaves no partial
# entries behind (it is recomputed from scratch on the next run either way).

N_TERMS = 28               # total number of candidate model terms
FIRST_DEPENDENT_TERM = 7   # terms with index >= 7 require two other terms (see below)


def _is_valid_model(model, terms_key):
    """Return True if every dependent term in `model` has both its parents.

    A term with index >= FIRST_DEPENDENT_TERM is only admissible when the two
    terms named by terms_key[term][0] and terms_key[term][1] are also part of
    the model.

    model     -- tuple of term indices
    terms_key -- lookup returned by gen_terms_key()
    """
    return all(terms_key[term][0] in model and terms_key[term][1] in model
               for term in model if term >= FIRST_DEPENDENT_TERM)


def _print_elapsed(label, start):
    """Print the time elapsed since `start` (a time() stamp) as minutes and seconds."""
    minutes, seconds = divmod(time() - start, 60)
    print(label, round(minutes), 'min', round(seconds), 's')


t = time()

# Loop-invariant inputs, computed once instead of once per model.
# NOTE(review): assumes gen_terms_key() returns the same lookup on every call.
terms_key = gen_terms_key()

targets = []
for column in all_names:
    observed = Ys[column].dropna()
    # Column labels are '<equipment_name> <data_type>'; like the original
    # code this raises ValueError if a label holds more than one space.
    equip, d_type = column.split(' ')
    targets.append({
        'column': column,
        'equip': equip,
        'd_type': d_type,
        'Y': observed.values,
        'sample_numbers': observed.index,
        # A fixed integer seed makes the splitter yield the same splits each
        # time it is iterated, so one instance per column can be reused.
        'cv': ShuffleSplit(len(observed), n_iter=3, test_size=0.333, random_state=0),
    })

for number_of_terms in range(1, N_TERMS + 1):
    t_split = time()

    if db.contains(Q.n_terms_done == number_of_terms):
        print('Data with', number_of_terms, 'terms already done')
        print('---------------')
        continue

    # column label -> (score, model) of the best model seen so far.
    best = {}

    # Generate all possible models
    for model in combinations(list(range(N_TERMS)), number_of_terms):
        if not _is_valid_model(model, terms_key):
            continue

        for target in targets:
            # Generate X for this model and the samples that have a Y value.
            X = gen_X(target['sample_numbers'], all_full_input, model)
            scores = cross_val_score(model_obj, X, target['Y'], cv=target['cv'])
            score = mean(scores)

            # The first model is always recorded; later ones only replace it
            # when strictly better, so ties keep the earliest model.
            name = target['column']
            if name not in best or score > best[name][0]:
                best[name] = (score, model)

    # Persist the winners. Entries left behind by an earlier, interrupted run
    # are only overwritten by a strictly better score.
    for target in targets:
        if target['column'] not in best:
            continue

        score, model = best[target['column']]
        my_Q = ((Q.equipment_name == target['equip']) &
                (Q.data_type == target['d_type']) &
                (Q.n_terms == number_of_terms))

        if not db.contains(my_Q):
            db.insert({'equipment_name': target['equip'],
                       'data_type': target['d_type'],
                       'n_terms': number_of_terms,
                       'top_score': score,
                       'top_mcode': list(model)})
        elif score > db.search(my_Q)[0]['top_score']:
            db.update({'top_score': score, 'top_mcode': list(model)}, my_Q)

    # Mark this model size as finished so a re-run can skip it.
    db.insert({'n_terms_done': number_of_terms})

    print('n_terms', number_of_terms)
    _print_elapsed('Split time:', t_split)
    _print_elapsed('Total time so far:', t)
    print('---------------')

n_terms 1
Split time: 0.0 min 1.0 s
Total time so far: 0.0 min 1.0 s
---------------
n_terms 2
Split time: 0.0 min 5.0 s
Total time so far: 0.0 min 6.0 s
---------------
n_terms 3
Split time: 0.0 min 15.0 s
Total time so far: 0.0 min 21.0 s
---------------
n_terms 4
Split time: 0.0 min 43.0 s
Total time so far: 1.0 min 4.0 s
---------------
n_terms 5
Split time: 1.0 min 57.0 s
Total time so far: 3.0 min 1.0 s
---------------
n_terms 6
Split time: 5.0 min 2.0 s
Total time so far: 8.0 min 3.0 s
---------------
n_terms 7
Split time: 12.0 min 38.0 s
Total time so far: 20.0 min 41.0 s
---------------
n_terms 8
Split time: 30.0 min 22.0 s
Total time so far: 51.0 min 3.0 s
---------------
n_terms 9
Split time: 69.0 min 3.0 s
Total time so far: 120.0 min 5.0 s
---------------
n_terms 10
Split time: 153.0 min 38.0 s
Total time so far: 273.0 min 43.0 s
---------------
n_terms 11
Split time: 385.0 min 56.0 s
Total time so far: 659.0 min 39.0 s
---------------


KeyboardInterrupt: 