In [54]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from cipher_data import CipherTxtData
from utils import accuracy_score_scalers, save_results

In [56]:
def fit(X, y, smoothing=1):
    """Train a multinomial naive Bayes classifier.

    Args:
        X: Training features, handed unchanged to ``MultinomialNB.fit``.
        y: Training labels, handed unchanged to ``MultinomialNB.fit``.
        smoothing: Value forwarded as the classifier's ``alpha`` parameter.

    Returns:
        The ``MultinomialNB`` instance after ``fit`` has been called on it.
    """
    classifier = MultinomialNB(alpha=smoothing)
    classifier.fit(X, y)
    return classifier

def predict(nb, X):
    """Return the predictions of a fitted model for the given features.

    Args:
        nb: Any object exposing a ``predict`` method (here, the model
            returned by ``fit``).
        X: Features to pass to ``nb.predict``.

    Returns:
        Whatever ``nb.predict(X)`` returns.
    """
    predictions = nb.predict(X)
    return predictions

In [79]:
def tfid_transform(train_data, dev_data, lo, hi):
    """Vectorise two document collections with one TF-IDF vectoriser.

    The vectoriser is fitted on ``train_data`` only and then applied to
    ``dev_data``.

    Args:
        train_data: Documents used to fit the vectoriser and then transformed.
        dev_data: Documents transformed with the already-fitted vectoriser.
        lo: Lower bound of the word n-gram range.
        hi: Upper bound of the word n-gram range.

    Returns:
        A ``(train_features, dev_features)`` tuple as produced by
        ``TfidfVectorizer.fit_transform`` and ``TfidfVectorizer.transform``.
    """
    tfidf = TfidfVectorizer(
        analyzer='word',
        ngram_range=(lo, hi),
        binary=True,
        lowercase=False,
    )

    train_features = tfidf.fit_transform(train_data)
    dev_features = tfidf.transform(dev_data)
    return train_features, dev_features

In [58]:
# Load the "train" and "dev" portions through CipherTxtData with split=False.
train_data, dev_data = (
    CipherTxtData(mode=mode, split=False) for mode in ("train", "dev")
)

In [59]:
## Pool train and dev so the combined set can be re-split for cross-validation.
# Build new lists instead of calling .extend() on train_data.X / train_data.y:
# extending in place would grow the loader's own lists and append the dev
# samples a second time whenever this cell is re-run.
X = [*train_data.X, *dev_data.X]
y = [*train_data.y, *dev_data.y]

assert len(y) == len(X)

In [60]:
# Hold out 15% of the pooled data; the fixed seed makes the split repeatable
# across kernel restarts.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.15, random_state=42)

In [83]:
# Sweep the upper n-gram bound over five random train/dev splits and report
# the dev accuracy of a naive Bayes model for each setting.
for i in range(5):
    print(f"Random split {i}")
    # Seed each split with its index: the five splits still differ from one
    # another, but a re-run reproduces exactly the same scores.
    # Loop-local names are used so this cell does not overwrite the
    # X_train / X_dev / y_train / y_dev produced by the earlier split cell.
    docs_train, docs_dev, labels_train, labels_dev = train_test_split(
        X, y, test_size=0.11, random_state=i
    )
    print("Performance on dev dataset:")

    best = 0
    for lo in range(1, 2):  # only lo == 1 is explored
        for hi in range(lo + 1, 10):
            feats_train, feats_dev = tfid_transform(docs_train, docs_dev, lo, hi)

            model = fit(feats_train, labels_train, smoothing=1)
            y_pred = predict(model, feats_dev)
            score = accuracy_score_scalers(labels_dev, y_pred)
            if score > best:
                # Printed *before* the score line it refers to. The comparison
                # uses the unrounded score, so the marker can appear even when
                # the 3-decimal value printed below looks unchanged.
                print("\tbest so far")
                best = score
            print(f"\t (lo,hi): {lo,hi} \tscore: {score:0.3f}")

Random split 0
Performance on dev dataset:
	best so far
	 (lo,hi): (1, 2) 	score: 0.885
	best so far
	 (lo,hi): (1, 3) 	score: 0.889
	best so far
	 (lo,hi): (1, 4) 	score: 0.893
	 (lo,hi): (1, 5) 	score: 0.892
	 (lo,hi): (1, 6) 	score: 0.893
	best so far
	 (lo,hi): (1, 7) 	score: 0.893
	 (lo,hi): (1, 8) 	score: 0.893
	 (lo,hi): (1, 9) 	score: 0.893
Random split 1
Performance on dev dataset:
	best so far
	 (lo,hi): (1, 2) 	score: 0.872
	best so far
	 (lo,hi): (1, 3) 	score: 0.878
	 (lo,hi): (1, 4) 	score: 0.876
	 (lo,hi): (1, 5) 	score: 0.876
	 (lo,hi): (1, 6) 	score: 0.876
	 (lo,hi): (1, 7) 	score: 0.876
	 (lo,hi): (1, 8) 	score: 0.876
	 (lo,hi): (1, 9) 	score: 0.876
Random split 2
Performance on dev dataset:
	best so far
	 (lo,hi): (1, 2) 	score: 0.887
	best so far
	 (lo,hi): (1, 3) 	score: 0.893
	best so far
	 (lo,hi): (1, 4) 	score: 0.895
	best so far
	 (lo,hi): (1, 5) 	score: 0.896
	best so far
	 (lo,hi): (1, 6) 	score: 0.896
	 (lo,hi): (1, 7) 	score: 0.896
	 (lo,hi): (1, 8) 	score

In [84]:
# Load the "test" portion through CipherTxtData with split=False.
# (The stray `X, y` expression that used to precede this line was removed: on
# a non-final line of a cell it is evaluated and discarded, displaying nothing.)
test_data = CipherTxtData(mode="test", split=False)

In [85]:
# Word n-gram range used for the final model: lower and upper bound.
lo = 1
hi = 5

In [86]:
# Fit the vectoriser on the pooled documents X and apply it to the test documents.
X_train, X_test = tfid_transform(train_data=X, dev_data=test_data.X, lo=lo, hi=hi)

In [87]:
# Train on the pooled features, then predict labels for the test features.
model = fit(X=X_train, y=y, smoothing=1)
# Despite its name, y_test holds the model's predictions for the test set.
y_test = predict(nb=model, X=X_test)

In [88]:
# Pass the test-set predictions to the project's save_results helper
# (presumably persists them — see utils.save_results to confirm the format).
save_results(y_test)