In [1]:
import fasttext
from fasttext import FastText

from cipher_data import *
from utils import *

In [2]:
# Load the train and dev splits through the project's CipherTxtData loader
# (brought into scope by the star imports above).
# NOTE(review): `split=False` presumably keeps every sample as a single
# string instead of a token list -- confirm against CipherTxtData.
traindata = CipherTxtData(mode="train", split=False)
devdata = CipherTxtData(mode="dev", split=False)

In [3]:
# Shorthand for the training inputs; these are written to train.txt below.
X_train = traindata.X

In [4]:
def save_to_txt(text, labels=None, file_name="test.txt"):
    """Write sequences to ``file_name``, one example per line.

    When labels are supplied, every line has the form
    ``__label__<label> <sequence>``; otherwise every line is just the
    sequence itself.

    Args:
        text: Iterable of sequences to write.
        labels: Optional iterable of labels aligned with ``text``. ``None``
            or an empty sized container selects the unlabeled format.
        file_name: Path of the file to create or overwrite.
    """
    # Decide the format without relying on truthiness: `if labels:` raises
    # ValueError for multi-element array-like containers (e.g. NumPy arrays).
    if labels is None:
        has_labels = False
    elif hasattr(labels, "__len__"):
        has_labels = len(labels) > 0
    else:
        # Un-sized iterables (generators, ...) are treated as "labels given".
        has_labels = True

    # Explicit UTF-8 so the file content does not depend on the platform's
    # default encoding.
    with open(file_name, "w", encoding="utf-8") as f:
        if has_labels:
            f.writelines(
                f"__label__{label} {seq}\n" for seq, label in zip(text, labels)
            )
        else:
            # Terminate each sequence with a newline, as the labeled branch
            # does; without it all examples collapse into one line.
            f.writelines(f"{seq}\n" for seq in text)

In [5]:
def predict_fasttext(model, data):
    """Call ``model.predict`` on every item of ``data``.

    Returns a list holding each call's result, in the same order as
    ``data``.
    """
    return [model.predict(sample) for sample in data]

In [6]:
# Dump the training split to train.txt with one `__label__<y> <sequence>`
# line per example; this file is the training input for fastText below.
save_to_txt(X_train, traindata.y, "train.txt")

In [7]:
# Same format for the dev split; dev.txt is later used as the autotune
# validation file and as the input to model.test.
save_to_txt(devdata.X, devdata.y, "dev.txt")

In [8]:
# Train with fastText's hyperparameter autotuning: candidate settings are
# scored against dev.txt, then the model is retrained with the best ones.
model = fasttext.train_supervised(input='train.txt', autotuneValidationFile='dev.txt')

Progress: 100.0% Trials:   74 Best score:  0.903305 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  20861
Number of labels: 2
Progress: 100.0% words/sec/thread: 1849054 lr:  0.000000 avg.loss:  0.043786 ETA:   0h 0m 0s


In [10]:
# Accuracy of the current `model` on the dev split.
# NOTE: score_fasttext is defined in the cell labelled In [9], which sits at
# the very end of this file -- that cell has to be run before this one.
dev_predictions = predict_fasttext(model, devdata.X)
dev_score = score_fasttext(dev_predictions, devdata.y)

In [11]:
# Display the dev accuracy computed in the previous cell.
dev_score

0.903305377405032

In [16]:
# Sweep fastText's minCount over the odd values 1..49 with epoch=20 and
# wordNgrams=2 held fixed, reporting train and dev accuracy for each setting.
# NOTE: every iteration rebinds the global `model`, replacing the autotuned
# model trained earlier; once the loop ends, `model` is the min_count=49 one.
for min_count in range(1, 50, 2):
    model = FastText.train_supervised("train.txt", epoch=20, wordNgrams=2, minCount=min_count)

    # Held-out accuracy for this setting.
    dev_predictions = predict_fasttext(model, devdata.X)
    dev_score = score_fasttext(dev_predictions, devdata.y)

    # Training accuracy, printed next to the dev score for comparison.
    train_predictions = predict_fasttext(model, traindata.X)
    train_score = score_fasttext(train_predictions, traindata.y)
    
    print(f"min count={min_count}")
    print(f"\ttrain score={train_score}")
    print(f"\tdev score={dev_score}")

Read 0M words
Number of words:  20861
Number of labels: 2
Progress: 100.0% words/sec/thread: 1951502 lr:  0.000000 avg.loss:  0.144839 ETA:   0h 0m 0s


min count=1
	train score=0.9909987669543773
	dev score=0.9013320177602367


Read 0M words
Number of words:  9253
Number of labels: 2
Progress: 100.0% words/sec/thread: 1991824 lr:  0.000000 avg.loss:  0.148923 ETA:   0h 0m 0s


min count=3
	train score=0.9908138101109741
	dev score=0.8998519980266404


Read 0M words
Number of words:  6081
Number of labels: 2
Progress: 100.0% words/sec/thread: 1990129 lr:  0.000000 avg.loss:  0.143150 ETA:   0h 0m 0s


min count=5
	train score=0.9909371146732429
	dev score=0.903305377405032


Read 0M words
Number of words:  4532
Number of labels: 2
Progress: 100.0% words/sec/thread: 1581663 lr:  0.000000 avg.loss:  0.159171 ETA:   0h 0m 0s


min count=7
	train score=0.9910604192355117
	dev score=0.9023186975826344


Read 0M words
Number of words:  3607
Number of labels: 2
Progress: 100.0% words/sec/thread: 1963461 lr:  0.000000 avg.loss:  0.139206 ETA:   0h 0m 0s


min count=9
	train score=0.9908754623921086
	dev score=0.8983719782930439


Read 0M words
Number of words:  2974
Number of labels: 2
Progress: 100.0% words/sec/thread: 1970724 lr:  0.000000 avg.loss:  0.146994 ETA:   0h 0m 0s


min count=11
	train score=0.9909371146732429
	dev score=0.8983719782930439


Read 0M words
Number of words:  2533
Number of labels: 2
Progress: 100.0% words/sec/thread: 1971116 lr:  0.000000 avg.loss:  0.132879 ETA:   0h 0m 0s
Read 0M words
Number of words:  2206
Number of labels: 2


min count=13
	train score=0.9906905055487053
	dev score=0.8944252590034534


Progress: 100.0% words/sec/thread: 1966292 lr:  0.000000 avg.loss:  0.147001 ETA:   0h 0m 0s


min count=15
	train score=0.9910604192355117
	dev score=0.895411938825851


Read 0M words
Number of words:  1968
Number of labels: 2
Progress: 100.0% words/sec/thread: 1975949 lr:  0.000000 avg.loss:  0.147597 ETA:   0h 0m 0s
Read 0M words
Number of words:  1766
Number of labels: 2


min count=17
	train score=0.9908754623921086
	dev score=0.8949185989146522


Progress: 100.0% words/sec/thread: 1975764 lr:  0.000000 avg.loss:  0.156106 ETA:   0h 0m 0s


min count=19
	train score=0.9909371146732429
	dev score=0.8944252590034534


Read 0M words
Number of words:  1574
Number of labels: 2
Progress: 100.0% words/sec/thread: 1953247 lr:  0.000000 avg.loss:  0.139685 ETA:   0h 0m 0s100.0% words/sec/thread: 1953681 lr: -0.000003 avg.loss:  0.139685 ETA:   0h 0m 0s
Read 0M words
Number of words:  1443
Number of labels: 2


min count=21
	train score=0.9909371146732429
	dev score=0.8934385791810557


Progress: 100.0% words/sec/thread: 1969137 lr:  0.000000 avg.loss:  0.146275 ETA:   0h 0m 0s


min count=23
	train score=0.9911220715166461
	dev score=0.8944252590034534


Read 0M words
Number of words:  1312
Number of labels: 2
Progress: 100.0% words/sec/thread: 1978384 lr:  0.000000 avg.loss:  0.141431 ETA:   0h 0m 0s
Read 0M words
Number of words:  1223
Number of labels: 2


min count=25
	train score=0.9910604192355117
	dev score=0.895411938825851


Progress: 100.0% words/sec/thread: 2003542 lr:  0.000000 avg.loss:  0.140429 ETA:   0h 0m 0s0.140429 ETA:   0h 0m 0s
Read 0M words
Number of words:  1140
Number of labels: 2


min count=27
	train score=0.9908138101109741
	dev score=0.8973852984706463


Progress: 100.0% words/sec/thread: 1970233 lr:  0.000000 avg.loss:  0.146761 ETA:   0h 0m 0s
Read 0M words
Number of words:  1060
Number of labels: 2


min count=29
	train score=0.9910604192355117
	dev score=0.895411938825851


Progress: 100.0% words/sec/thread: 1990411 lr:  0.000000 avg.loss:  0.154355 ETA:   0h 0m 0s
Read 0M words
Number of words:  981
Number of labels: 2


min count=31
	train score=0.9909371146732429
	dev score=0.8939319190922546


Progress: 100.0% words/sec/thread: 1958063 lr:  0.000000 avg.loss:  0.151011 ETA:   0h 0m 0s


min count=33
	train score=0.9911220715166461
	dev score=0.8889985199802664


Read 0M words
Number of words:  921
Number of labels: 2
Progress: 100.0% words/sec/thread: 1424639 lr:  0.000000 avg.loss:  0.161234 ETA:   0h 0m 0s


min count=35
	train score=0.9909987669543773
	dev score=0.8919585594474593


Read 0M words
Number of words:  866
Number of labels: 2
Progress: 100.0% words/sec/thread: 1977472 lr:  0.000000 avg.loss:  0.145436 ETA:   0h 0m 0s


min count=37
	train score=0.9909371146732429
	dev score=0.8944252590034534


Read 0M words
Number of words:  832
Number of labels: 2
Progress: 100.0% words/sec/thread: 1979677 lr:  0.000000 avg.loss:  0.144156 ETA:   0h 0m 0s
Read 0M words
Number of words:  785
Number of labels: 2


min count=39
	train score=0.9906288532675709
	dev score=0.8934385791810557


Progress: 100.0% words/sec/thread: 1987128 lr:  0.000000 avg.loss:  0.151947 ETA:   0h 0m 0s


min count=41
	train score=0.9909987669543773
	dev score=0.8919585594474593


Read 0M words
Number of words:  751
Number of labels: 2
Progress: 100.0% words/sec/thread: 1981121 lr:  0.000000 avg.loss:  0.153435 ETA:   0h 0m 0s


min count=43
	train score=0.9909371146732429
	dev score=0.8919585594474593


Read 0M words
Number of words:  711
Number of labels: 2
Progress: 100.0% words/sec/thread: 1994700 lr:  0.000000 avg.loss:  0.143566 ETA:   0h 0m 0s


min count=45
	train score=0.9909371146732429
	dev score=0.8919585594474593


Read 0M words
Number of words:  676
Number of labels: 2
Progress: 100.0% words/sec/thread: 2012976 lr:  0.000000 avg.loss:  0.146265 ETA:   0h 0m 0s


min count=47
	train score=0.9911220715166461
	dev score=0.8904785397138628


Read 0M words
Number of words:  640
Number of labels: 2
Progress: 100.0% words/sec/thread: 1986234 lr:  0.000000 avg.loss:  0.146159 ETA:   0h 0m 0s


min count=49
	train score=0.9908138101109741
	dev score=0.8894918598914652


In [109]:
# fastText's built-in evaluation on dev.txt; per the fastText documentation
# the returned tuple is (number of examples, precision@1, recall@1).
model.test("dev.txt")

(2027, 0.9018253576714356, 0.9018253576714356)

In [95]:
# Training-split accuracy of the current `model`. Uses the shared
# predict_fasttext helper rather than repeating its prediction loop inline.
predictions = predict_fasttext(model, traindata.X)
score_fasttext(predictions, traindata.y)

0.9910604192355117

In [9]:
def score_fasttext(predictions, truth):
    """Return the fraction of ``truth`` entries matched by the predictions.

    Each prediction is indexed as ``pred[0][0]`` to obtain its label string.
    A pair counts as correct when that string is ``"__label__1"`` and the
    true value equals 1, or ``"__label__0"`` and the true value equals 0.

    Args:
        predictions: Iterable of prediction results, aligned with ``truth``.
        truth: Sized collection of true labels (0 or 1).

    Returns:
        Number of correct pairs divided by ``len(truth)``; ``0.0`` when
        ``truth`` is empty (instead of raising ZeroDivisionError).
    """
    total = len(truth)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = sum(
        1
        for pred, true in zip(predictions, truth)
        if (pred[0][0] == "__label__1" and true == 1)
        or (pred[0][0] == "__label__0" and true == 0)
    )
    # The denominator is len(truth), so examples left unpaired by zip
    # (predictions shorter than truth) count as wrong.
    return correct / total