In [1]:
# GROUP: Graphkrone
# Members: Marcello Negri     19-945-450
#          Riccardo Uslenghi  19-954-262

import numpy as np 
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt 
from tqdm import tqdm

from sklearn import cluster, datasets, mixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

df_train = pd.read_csv('train.csv',header=0,converters={'Sequence':str,'Active':int})
df_test = pd.read_csv('test.csv',header=0,converters={'Sequence':str})

data_train = df_train.to_numpy()
data_test = df_test.to_numpy()

trainY = list(data_train[:,1])
trainY = np.array(trainY)
trainX = data_train[:,0]

testX = np.ravel(data_test)
    
def preprocessing(data):
    letters = []
    for r in range(len(data)):
        for c in range(len(data[0])):
            letters.append(data[r][c])
    letters = list(set(letters))
    letters.sort()
    M = np.zeros((len(data),len(letters)*4))

    for r in range(len(data)):
        for c in range(len(data[0])):
            let_pos = letters.index(data[r][c])
            M[r,len(letters)*c+let_pos] = 1
    return M     

trainX = preprocessing(trainX)
testX = preprocessing(testX)

In [3]:
classifier = MLPClassifier(hidden_layer_sizes=(150, ),verbose=1, alpha=0.01)
classifier.fit(trainX, trainY)
output = classifier.predict(testX)
df_predictions = pd.DataFrame(output)
df_predictions.to_csv('prova.csv', index=False,header=False, float_format='%i')

Iteration 1, loss = 0.10928589
Iteration 2, loss = 0.05867617
Iteration 3, loss = 0.04816193
Iteration 4, loss = 0.04202379
Iteration 5, loss = 0.03850578
Iteration 6, loss = 0.03574497
Iteration 7, loss = 0.03370792
Iteration 8, loss = 0.03202049
Iteration 9, loss = 0.03100093
Iteration 10, loss = 0.03010861
Iteration 11, loss = 0.02922252
Iteration 12, loss = 0.02855093
Iteration 13, loss = 0.02801446
Iteration 14, loss = 0.02739912
Iteration 15, loss = 0.02699339
Iteration 16, loss = 0.02655498
Iteration 17, loss = 0.02617265
Iteration 18, loss = 0.02612270
Iteration 19, loss = 0.02562925
Iteration 20, loss = 0.02554028
Iteration 21, loss = 0.02530533
Iteration 22, loss = 0.02502807
Iteration 23, loss = 0.02488443
Iteration 24, loss = 0.02472261
Iteration 25, loss = 0.02472530
Iteration 26, loss = 0.02437151
Iteration 27, loss = 0.02445276
Iteration 28, loss = 0.02420219
Iteration 29, loss = 0.02413488
Iteration 30, loss = 0.02391509
Iteration 31, loss = 0.02402275
Iteration 32, los

In [None]:
"""hidden_layer_size=[(1,), (5, ), (50, ), (100, ), (1000, ),
                                   (5, 5, ), (50, 50, ), (100, 100), 
                                   (50, 50, 50), (100, 100, 100)],"""

In [10]:
import random

kf = KFold(n_splits=3)
count = 0

for train_index, test_index in kf.split(trainX):
    count += 1
    
    X_train, X_test = trainX[train_index], trainX[test_index]
    Y_train, Y_test = trainY[train_index], trainY[test_index]
    
    classifier = MLPClassifier(hidden_layer_sizes=(70, ),verbose=1, alpha=0.01)
    classifier.fit(X_train, Y_train)
    
    print(f1_score(Y_test, classifier.predict(X_test)))
    if count > 0:
        break

Iteration 1, loss = 0.16934825
Iteration 2, loss = 0.07410524
Iteration 3, loss = 0.06367433
Iteration 4, loss = 0.05739500
Iteration 5, loss = 0.05185753
Iteration 6, loss = 0.04768992
Iteration 7, loss = 0.04444686
Iteration 8, loss = 0.04187418
Iteration 9, loss = 0.03991947
Iteration 10, loss = 0.03814631
Iteration 11, loss = 0.03683469
Iteration 12, loss = 0.03556932
Iteration 13, loss = 0.03450341
Iteration 14, loss = 0.03360656
Iteration 15, loss = 0.03280961
Iteration 16, loss = 0.03216099
Iteration 17, loss = 0.03162576
Iteration 18, loss = 0.03092473
Iteration 19, loss = 0.03029933
Iteration 20, loss = 0.02996255
Iteration 21, loss = 0.02943371
Iteration 22, loss = 0.02897688
Iteration 23, loss = 0.02862087
Iteration 24, loss = 0.02828063
Iteration 25, loss = 0.02785631
Iteration 26, loss = 0.02765467
Iteration 27, loss = 0.02729413
Iteration 28, loss = 0.02714161
Iteration 29, loss = 0.02693644
Iteration 30, loss = 0.02663314
Iteration 31, loss = 0.02638958
Iteration 32, los

In [None]:
150: 0.8959390862944162, 0.8846391116594694, 0.8859447004608294, 0.9009324009324009, 0.8864734299516909
150: 0.8809438684304612, 0.8813559322033898, 0.8750923872875092
160: 0.8933753943217666, 0.8852459016393444, 0.887182516243355, 0.8878449794480329, 0.8860135551447936
80: 0.8906832298136647, 0.8902665840049596, 0.8844339622641509, 0.8948004836759372, 0.8792569659442724
80: 0.882800608828006, 0.8735716918540362, 0.8841107871720116

In [15]:
from sklearn.model_selection import GridSearchCV
para_prova = {'hidden_layer_sizes':[(10,),(20,),(30,),(40,),(50,),(60,),(70,),(80,),(90,),(100,),(110,),(120,),(130,),(140,),(150,)], 'verbose':[0], 'alpha':[0.01]}
classifier = MLPClassifier()
gscv = GridSearchCV(classifier, para_prova, scoring='f1')
gscv.fit(trainX, trainY)
results = sorted(gscv.cv_results_.keys())

In [16]:
results = gscv.cv_results_
#print(f1_score(trainY, gscv.predict(trainX)))
print(results)

{'mean_fit_time': array([ 28.02545171,  43.8755414 ,  43.14573073,  51.82781878,
        52.19233904,  63.87303081,  63.36420608,  72.57059875,
        75.6119781 ,  65.703058  ,  75.27948222,  75.36077561,
        84.42967792,  74.27325039, 427.37494116]), 'std_fit_time': array([  6.20787567,   6.98076283,   4.9862179 ,   5.02727091,
         3.76434641,   7.34056419,   9.50203306,  12.85735288,
         6.62086386,   7.82804782,   6.16873202,   4.6030764 ,
        10.01267594,  10.39593655, 695.00096787]), 'mean_score_time': array([0.02500134, 0.03760247, 0.04480596, 0.05520453, 0.06320276,
       0.07400899, 0.0825223 , 0.09008794, 0.10161042, 0.10717158,
       0.11890469, 0.1114109 , 0.13122296, 0.14004993, 0.13243589]), 'std_score_time': array([0.00389874, 0.00445473, 0.00336754, 0.00172037, 0.00397304,
       0.00726451, 0.00362919, 0.00482531, 0.00450431, 0.00596548,
       0.00781748, 0.00752338, 0.0071271 , 0.01919354, 0.01549132]), 'param_alpha': masked_array(data=[0.01, 0.0

In [None]:
{'mean_fit_time': array([ 28.02545171,  43.8755414 ,  43.14573073,  51.82781878,
        52.19233904,  63.87303081,  63.36420608,  72.57059875,
        75.6119781 ,  65.703058  ,  75.27948222,  75.36077561,
        84.42967792,  74.27325039, 427.37494116]), 'std_fit_time': array([  6.20787567,   6.98076283,   4.9862179 ,   5.02727091,
         3.76434641,   7.34056419,   9.50203306,  12.85735288,
         6.62086386,   7.82804782,   6.16873202,   4.6030764 ,
        10.01267594,  10.39593655, 695.00096787]), 'mean_score_time': array([0.02500134, 0.03760247, 0.04480596, 0.05520453, 0.06320276,
       0.07400899, 0.0825223 , 0.09008794, 0.10161042, 0.10717158,
       0.11890469, 0.1114109 , 0.13122296, 0.14004993, 0.13243589]), 'std_score_time': array([0.00389874, 0.00445473, 0.00336754, 0.00172037, 0.00397304,
       0.00726451, 0.00362919, 0.00482531, 0.00450431, 0.00596548,
       0.00781748, 0.00752338, 0.0071271 , 0.01919354, 0.01549132]), 'param_alpha': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
                   0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_hidden_layer_sizes': masked_array(data=[(10,), (20,), (30,), (40,), (50,), (60,), (70,), (80,),
                   (90,), (100,), (110,), (120,), (130,), (140,), (150,)],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_verbose': masked_array(data=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01, 'hidden_layer_sizes': (10,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (20,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (30,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (40,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (50,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (60,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (70,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (80,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (90,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (100,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (110,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (120,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (130,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (140,), 'verbose': 0}, {'alpha': 0.01, 'hidden_layer_sizes': (150,), 'verbose': 0}], 'split0_test_score': array([0.8580209 , 0.87789988, 0.88340531, 0.88795686, 0.88861386,
       0.89473684, 0.89776165, 0.89982217, 0.89073634, 0.88522589,
       0.89596083, 0.8897014 , 0.88648649, 0.89667674, 0.90070093]), 'split1_test_score': array([0.83528686, 0.86399041, 0.86544536, 0.86557789, 0.88461538,
       0.88916563, 0.88369231, 0.8952381 , 0.87686567, 0.88902821,
       0.88618381, 0.87890137, 0.89185905, 0.88326127, 0.89212121]), 'split2_test_score': array([0.81607795, 0.85626157, 0.87843616, 0.87463214, 0.88363851,
       0.85909373, 0.88428745, 0.87322202, 0.88310115, 0.88125377,
       0.88496652, 0.87243311, 0.87594937, 0.88578372, 0.89580838]), 'split3_test_score': array([0.85006196, 0.85968254, 0.87150838, 0.88604509, 0.88755981,
       0.88983051, 0.88428745, 0.88238879, 0.88380952, 0.89078707,
       0.88848921, 0.88668731, 0.90657022, 0.88206388, 0.90394089]), 'split4_test_score': array([0.8296837 , 0.85732403, 0.863581  , 0.87407407, 0.87945879,
       0.88058788, 0.88357843, 0.87306502, 0.87017099, 0.87816945,
       0.88942308, 0.8830234 , 0.88598575, 0.88109394, 0.8858006 ]), 'mean_test_score': array([0.83782627, 0.86303169, 0.87247524, 0.87765721, 0.88477727,
       0.88268292, 0.88672146, 0.88474722, 0.88093674, 0.88489288,
       0.88900469, 0.88214932, 0.88937018, 0.88577591, 0.8956744 ]), 'std_test_score': array([0.01485446, 0.00789444, 0.00754269, 0.00829867, 0.00322711,
       0.01264318, 0.00552791, 0.01106507, 0.00694902, 0.00469896,
       0.00382342, 0.0060555 , 0.01001922, 0.00567189, 0.00638335]), 'rank_test_score': array([15, 14, 13, 12,  7,  9,  4,  8, 11,  6,  3, 10,  2,  5,  1])}