In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [107]:
from collections import Counter

import numpy as np
import gensim
from gensim.corpora import Dictionary
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from cipher_data import *
from utils import *

In [126]:
# Load the training split; X holds one sequence of words per sentence.
traindata = CipherTxtData(mode="train", split=True)
X = traindata.X

# Flatten the sentences into a single list of words, preserving order.
X_flattened = []
for sentence in X:
    X_flattened.extend(sentence)

In [127]:
# Load the dev split with the same `split` option used for the training data.
devdata = CipherTxtData(mode="dev", split=True)

In [128]:
# Corpus size: number of training sentences and total number of words.
n_sentences, n_words = len(X), len(X_flattened)

print(f"n_sentences = {n_sentences}")
print(f"n_words = {n_words}")

n_sentences = 16220
n_words = 328882


In [129]:
# Number of words in each training sentence.
lengths = np.array([len(sentence) for sentence in X])

# Summary statistics; the empty string produces the blank separator line.
print(
    f"lengths mean = {lengths.mean():0.3f}",
    f"lengths std = {lengths.std():0.3f}",
    "",
    f"max length = {lengths.max()}",
    f"min length = {lengths.min()}",
    sep="\n",
)

lengths mean = 20.276
lengths std = 9.365

max length = 56
min length = 1


In [130]:
# Frequency of every word in the training data.
word_counts = Counter(X_flattened)

# Peek at ranks 11-15 of the frequency ranking (most frequent first).
ranked_words = word_counts.most_common()
ranked_words[10:15]

[('lkjl', 3746), ('jc', 2685), ('ütlk', 2310), ('zc', 2269), ('Úol', 2258)]

In [131]:
# Number of distinct words in the training data.
len(word_counts)

20860

In [132]:
# Let label = 1 be positive and label = 0 be negative
positive = []
negative = []

for x, y in zip(X, traindata.y):
    if y == 1:
        positive.append(x)
    else:
        negative.append(x)

In [133]:
# NOTE: disabled exploration, kept for reference only. It collects the words
# that occur exclusively in positive or exclusively in negative sentences,
# together with their corpus-wide counts taken from `word_counts`.
#positive_words = set(word for seq in positive for word in seq)
#negative_words = set(word for seq in negative for word in seq)

#only_positive_words = positive_words - negative_words
#only_negative_words = negative_words - positive_words

#positive_word_counts = {word: count 
#                        for word, count in word_counts.items() if word in only_positive_words}
#positive_word_counts = Counter(positive_word_counts)

#negative_word_counts = {word: count 
#                        for word, count in word_counts.items() if word in only_negative_words}
#negative_word_counts = Counter(negative_word_counts)

#words = only_negative_words.union(only_positive_words)

In [134]:
# Number of words taken from each end of the frequency ranking.
N_STOP_WORDS = 20

# Rank the vocabulary once (most frequent first) instead of re-sorting the
# whole Counter for every slice.
ranked_words = [word for word, _count in word_counts.most_common()]

most_frequent_words = ranked_words[:N_STOP_WORDS]

# Slice from an explicit start index: `ranked_words[-N:]` would return the
# entire vocabulary if N were ever set to 0.
tail_start = max(len(ranked_words) - N_STOP_WORDS, 0)
least_frequent_words = ranked_words[tail_start:]
# NOTE(review): if more than N_STOP_WORDS words tie at the lowest count, the
# tail is just a tie-ordered subset of them -- confirm this is intended.

# Stop words = the most frequent words plus the least frequent words.
stop_words = set(most_frequent_words) | set(least_frequent_words)

In [135]:
# Drop every stop word from each training sentence, keeping word order.
X_filtered = [
    [token for token in sentence if token not in stop_words]
    for sentence in X
]

In [136]:
# Sentence lengths (in words) after stop-word removal.
filtered_lengths = np.array([len(sentence) for sentence in X_filtered])

In [137]:
# Mean sentence length after stop-word removal.
filtered_lengths.mean()

12.914673242909988

In [138]:
# Standard deviation of sentence length after stop-word removal.
filtered_lengths.std()

6.372151126062358

In [139]:
# Training inputs are the stop-word-filtered sentences built earlier
# (same list object, not a copy).
X_train = X_filtered

# Apply the same stop-word filter to the dev sentences.
X_dev = [
    [word for word in seq if word not in stop_words]
    for seq in devdata.X
]

# Labels are taken unchanged from the two datasets.
y_train, y_dev = traindata.y, devdata.y

In [140]:
# Join each filtered word list into one space-separated string, which is the
# input format the text vectorizer consumes.
#
# Both corpora are rebuilt from their word-level sources (X_filtered and
# devdata.X) rather than from X_train / X_dev themselves. The result is the
# same on a top-to-bottom run, but the cell no longer feeds on its own output,
# so it can be re-run safely even after X_train / X_dev have been rebound to
# vectorized matrices further down.
X_train = [' '.join(seq) for seq in X_filtered]

X_dev = [
    ' '.join(word for word in seq if word not in stop_words)
    for seq in devdata.X
]

In [141]:
# TF-IDF features over word n-grams of length 1 to 6. Case is preserved
# (lowercase=False) and binary=True is passed through to the vectorizer.
# NOTE(review): no token_pattern is given, so scikit-learn's default applies;
# per its documentation that keeps only tokens of 2+ word characters, which
# would drop single-character words and punctuation -- confirm this is intended.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 6),
    binary=True,
    lowercase=False,
)

# Fit the vocabulary/IDF weights on the training strings only, then reuse the
# fitted vectorizer for the dev strings.
# NOTE: X_train / X_dev are rebound from strings to feature matrices here, so
# this cell expects the string versions as input each time it runs.
X_train = vectorizer.fit_transform(X_train)
X_dev = vectorizer.transform(X_dev)

In [142]:
# Baseline: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_dev)

# Score the dev predictions with the project's helper (presumably plain
# accuracy -- defined in utils, not shown here); shown as the cell output.
accuracy_score_scalers(y_dev, y_pred)

0.874198322644302

In [143]:
from sklearn.naive_bayes import MultinomialNB

In [144]:
def fit(X, y, smoothing=1):
    """Train a multinomial naive Bayes classifier.

    Args:
        X: Training features, passed unchanged to ``MultinomialNB.fit``.
        y: Training labels, passed unchanged to ``MultinomialNB.fit``.
        smoothing: Value used for the classifier's ``alpha`` parameter.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # scikit-learn estimators return themselves from fit(), so the
    # construction and training steps can be chained.
    return MultinomialNB(alpha=smoothing).fit(X, y)

def predict(nb, X):
    """Return the predictions of a fitted classifier for ``X``.

    Args:
        nb: Any object exposing a ``predict(X)`` method (here, the model
            returned by ``fit``).
        X: Inputs to predict on, passed unchanged to ``nb.predict``.

    Returns:
        Whatever ``nb.predict(X)`` returns.
    """
    predictions = nb.predict(X)
    return predictions

In [147]:
# Train multinomial naive Bayes on the training features, then label the dev
# set. Note: this rebinds `model` and `y_pred` from the logistic-regression run.
model = fit(X=X_train, y=y_train, smoothing=1)
y_pred = predict(nb=model, X=X_dev)

In [148]:
# Score the naive Bayes dev predictions with the same helper used for the
# logistic-regression baseline.
accuracy_score_scalers(y_dev, y_pred)

0.8904785397138628