In [62]:
import pandas as pd
import numpy as np
import math
from tools import seq_to_num, acc_score


def create_system(sequence, start_index_a):
    '''
    Build the 6x6 linear system ``a @ c = b`` for the second-order quadratic recurrence

        x[n+2] = c0*x[n]**2 + c1*x[n+1]**2 + c2*x[n]*x[n+1] + c3*x[n] + c4*x[n+1] + c5

    Equation k uses sequence[start + k] and sequence[start + k + 1] as inputs and
    sequence[start + k + 2] as target, so six equations consume eight consecutive terms.

    :param sequence: indexable sequence of numbers
    :param start_index_a: int, index of the first term used to build the equations;
                          if fewer than eight terms are available from this index,
                          the system is built from index 0 instead
    :return: a, b (ax=b) where a has shape (6, 6) and b has shape (6,);
             the pair ('-100', '-100') when the sequence has fewer than eight terms
    '''
    order = 6  # number of unknown coefficients == number of equations
    terms_needed = order + 2  # last target read is sequence[start + order + 1]

    if len(sequence) < start_index_a + terms_needed:
        # Not enough terms after the requested offset: fall back to the beginning.
        start_index_a = 0
    if len(sequence) < start_index_a + terms_needed:
        print("Impossible create system")
        return '-100', '-100'

    # One row of monomials per equation (same terms as create_nonlinear_polynom_equation).
    rows = []
    for i in range(start_index_a, start_index_a + order):
        x0, x1 = sequence[i], sequence[i + 1]
        rows.append([x0 ** 2, x1 ** 2, x0 * x1, x0, x1])

    # The trailing column of ones multiplies the free coefficient c5.
    a = np.append(np.array(rows), np.ones((order, 1)), axis=1)
    b = np.array([sequence[i] for i in range(start_index_a + 2, start_index_a + 2 + order)])
    return a, b


def create_nonlinear_polynom_equation(x1, x2):
    '''
    Return the non-constant monomials of a quadratic form in two variables.

    :param x1: first value
    :param x2: second value
    :return: list [x1^2, x2^2, x1*x2, x1, x2]
    '''
    squares = [x1 ** 2, x2 ** 2]
    cross_term = x1 * x2
    return squares + [cross_term, x1, x2]


def predict_1(sequence, start_index_a):
    '''
    Fit the quadratic recurrence on `sequence` and predict the term that follows it.

    :param sequence: indexable sequence of numbers
    :param start_index_a: int, index from which the system is built and the fit is checked
    :return: predicted next value (numeric) on success;
             '000' when no system could be built or it could not be solved;
             '0' when the fitted coefficients fail check_solution
    '''
    a, b = create_system(sequence, start_index_a)
    if isinstance(a, str):
        # create_system returns a pair of strings when the sequence is too short.
        # Handle that explicitly instead of relying on solve() to reject it.
        return '000'
    try:
        solution = np.linalg.solve(a, b)
    except np.linalg.LinAlgError:
        # Singular system: the six equations do not determine the coefficients.
        return '000'
    if not check_solution(sequence, solution, start_index_a):
        return '0'
    # The recurrence is second order, so the last two known terms give the next one.
    return calculate_nonlinear_polynom([sequence[-2], sequence[-1]], solution)


def calculate_nonlinear_polynom(x, solution):
    '''
    Evaluate the quadratic form in two variables with the given six coefficients.

    :param x: indexable with at least two values (x[0], x[1])
    :param solution: indexable with at least six coefficients, ordered as
                     [x0^2, x1^2, x0*x1, x0, x1, constant]
    :return: value of the polynomial
    '''
    x_first, x_second = x[0], x[1]
    c_sq_first, c_sq_second, c_cross, c_first, c_second, c_free = (
        solution[k] for k in range(6)
    )
    terms = (
        x_first ** 2 * c_sq_first,
        x_second ** 2 * c_sq_second,
        x_first * x_second * c_cross,
        x_first * c_first,
        x_second * c_second,
        c_free,
    )
    return sum(terms)


def make_prediction(data, start_index=3, maxlen=15, minlen=10, slice=15, verbose=False):
    '''
    Run predict_1 on every sequence in `data` and collect the successful predictions.

    :param data: pandas Series, numpy array or list of sequences
    :param start_index: int, passed to predict_1 as the start index
    :param maxlen: int, only the last `maxlen` terms of each sequence are used (-1 = whole sequence)
    :param minlen: int, sequences shorter than this are skipped
    :param slice: unused, kept for backward compatibility
    :param verbose: unused, kept for backward compatibility
    :return: (predicted_values, indices); indices holds the Series label of each predicted
             sequence for Series input and its position for any other input
    '''
    # Only a pandas Series carries labels; a numpy array has no `.index` attribute,
    # so arrays and lists are labelled by position.
    if isinstance(data, pd.Series):
        labels = data.index
    else:
        labels = range(len(data))

    predicted_values = []
    indices = []
    for ind, seq in zip(labels, data):
        if len(seq) < minlen:
            continue
        sequence = seq[-maxlen:] if maxlen != -1 else seq
        pred_val = predict_1(sequence, start_index)
        # predict_1 reports failure with the string sentinels '000' / '0'.
        if isinstance(pred_val, str):
            continue
        predicted_values.append(np.round(pred_val))
        indices.append(ind)
    return predicted_values, indices


def check_solution(sequence, solution, start_index_a=3):
    '''
    Check that the fitted coefficients reproduce the sequence.

    For every position i from `start_index_a` up to len(sequence) - 3, the terms
    sequence[i] and sequence[i + 1] are fed to the polynomial, and the rounded result
    must match sequence[i + 2] (the last term of the sequence is the final target checked).

    :param sequence: indexable sequence of numbers
    :param solution: six polynomial coefficients
    :param start_index_a: int, first position to verify
    :return: True if every checked term is reproduced, False at the first mismatch
    '''
    def reproduces_term(pos):
        pair = np.array([sequence[pos], sequence[pos + 1]])
        guess = round(calculate_nonlinear_polynom(pair, solution))
        return not math.fabs(guess - sequence[pos + 2]) > 0.1

    return all(reproduces_term(pos) for pos in range(start_index_a, len(sequence) - 2))




In [63]:
import time
# Wall-clock start; `t` stays in the notebook namespace and is read again by the
# hyper-parameter sweep cell, so its timing is measured from this point.
t=time.time()

# Load the training table; the first CSV column becomes the index.
df_train = pd.read_csv('train.csv', index_col=0)
# Disabled experiment: restrict the run to a precomputed list of indexes.
# x = pd.read_csv('Good_indexes.csv')
# x = list(x['Good_indexes'])

# seq_to_num comes from the local `tools` module; presumably it parses each sequence
# and splits off the final term as the target — TODO confirm against tools.py.
train_X, train_y = seq_to_num(df_train.Sequence, pad=False)
print(train_X.shape)
# Baseline run: sequences with at least 12 terms, fitted on their last 15 terms.
pred, ind = make_prediction(train_X, start_index=3, minlen=12, maxlen=15, verbose=False)
# Number of sequences for which a prediction was produced.
print(len(ind))
# Score of the predictions against the targets at the predicted indices (acc_score is from `tools`).
print(acc_score(pred, train_y[ind]))
# Elapsed seconds for loading plus prediction.
print(time.time()-t)

(113845,)
4322
0.8787598334104582
29.293150663375854


In [90]:
import itertools

# Candidate sequence lengths for the hyper-parameter sweep.
stuff = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

# Every pair (low, high) of candidate lengths with low < high:
# low is used as the minimum sequence length, high as the slice length.
length_pairs = [pair for pair in itertools.combinations(stuff, 2) if pair[0] < pair[1]]
min_length = [low for low, _high in length_pairs]
max_length = [high for _low, high in length_pairs]

# Accumulators for the sweep results.
scores, counts = [], []
print(len(min_length))
print(len(max_length))

45
45


In [91]:
# Sweep every (minimum length, slice length) pair and record, for each one, the score
# and the number of predictions within 0.1 of the target.

# Start from empty result lists so that re-running this cell does not append to the
# results of a previous run (which would break the equal-length columns below).
scores = []
counts = []
# Loop names deliberately avoid `max` / `min`, which would overwrite the builtins
# for the rest of the kernel session.
for max_len, min_len in zip(max_length, min_length):
    pred, ind = make_prediction(train_X, start_index=3, minlen=min_len, maxlen=max_len, verbose=False)
    score = acc_score(pred, train_y[ind])
    train_yy = list(train_y[ind])
    # Number of predictions that hit the target.
    count = sum(1 for actual, guess in zip(train_yy, pred) if math.fabs(actual - guess) < 0.1)
    print(score)
    print(count)
    scores.append(round(score, 3))
    counts.append(count)
# Sanity check: all four columns must have the same length.
print(len(scores))
print(len(counts))
print(len(max_length))
print(len(min_length))
d = {'Accuracy': scores, 'Count': counts,'Min_lenght_seq': min_length, 'Max_lenght_seq_slice': max_length}
data = pd.DataFrame(data=d)
data.to_csv('Hyper_parametres_11_20_non_linear_model.csv')
# `t` is the timer started in the data-loading cell, so this is the time elapsed since then.
print(time.time() - t)


0.7051266513149921
5818
0.7481359976140769
5017
0.7609498680738787
4326
0.7572873289708507
3819
0.7657697397441553
3472
0.7643664342389997
3179
0.7601126472094214
2969
0.7547322877230935
2791
0.7602172049156902
2660
0.8347535505430242
4996
0.8672441579371475
4305
0.8787598334104582
3798
0.9050616312614739
3451
0.9185573007562536
3158
0.9255886970172684
2948
0.9304669129996641
2770
0.9499640028797696
2639
0.8682562133764397
4297
0.8799628511725098
3790
0.9065297525013165
3443
0.9202453987730062
3150
0.9274447949526814
2940
0.9324780553679946
2762
0.9522258414766558
2631
0.8802604045570798
3786
0.9069092827004219
3439
0.9206906643254317
3146
0.9279393173198482
2936
0.9330175913396481
2758
0.9528472977874501
2627
0.907118285260651
3428
0.9209753231492362
3135
0.9282767375436369
2925
0.9334012911994564
2747
0.9533527696793003
2616
0.9213780918727915
3129
0.9287305122494433
2919
0.933901192504259
2741
0.9539473684210527
2610
0.9288222151292691
2910
0.934017094017094
2732
0.9541452677916361
