In [2]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel (presumably for the local
# cipher_data / utils modules imported in the next cell).
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from cipher_data import CipherTxtData
from utils import accuracy_score_scalers, save_results

In [4]:
def fit(X, y, smoothing=1):
    """Train a multinomial naive Bayes classifier.

    Args:
        X: Feature matrix handed unchanged to ``MultinomialNB.fit``.
        y: Target labels for ``X``.
        smoothing: Value forwarded as the estimator's ``alpha`` parameter.

    Returns:
        The ``MultinomialNB`` instance after ``fit`` has been called on it.
    """
    classifier = MultinomialNB(alpha=smoothing)
    classifier.fit(X, y)
    return classifier

def predict(nb, X):
    """Return whatever ``nb.predict`` produces for ``X``.

    Args:
        nb: Any object exposing a ``predict`` method (here, a fitted model).
        X: Input passed unchanged to ``nb.predict``.

    Returns:
        The value returned by ``nb.predict(X)``.
    """
    predicted = nb.predict(X)
    return predicted

In [5]:
def tfid_transform(train_data, dev_data, lo, hi):
    """Vectorise two text collections with one shared TF-IDF vectoriser.

    The vectoriser is fitted on ``train_data`` only and then applied to
    ``dev_data``, so the second matrix is expressed in the vocabulary learned
    from the first.

    Args:
        train_data: Texts used to fit the vectoriser.
        dev_data: Texts transformed with the already-fitted vectoriser.
        lo: Lower bound of the word n-gram range.
        hi: Upper bound of the word n-gram range.

    Returns:
        A ``(train_matrix, dev_matrix)`` tuple.
    """
    tfidf = TfidfVectorizer(
        analyzer='word',
        ngram_range=(lo, hi),
        lowercase=False,  # keep the original casing of the tokens
        binary=True,
    )
    train_matrix = tfidf.fit_transform(train_data)
    dev_matrix = tfidf.transform(dev_data)
    return train_matrix, dev_matrix

In [6]:
# Build the "train" loader first, then the "dev" loader, both with split=False.
train_data, dev_data = (
    CipherTxtData(mode=split_name, split=False)
    for split_name in ("train", "dev")
)

In [7]:
# Pool the train and dev examples so that later cells can draw their own
# random train/dev partitions from the combined data.
#
# Fresh lists are built instead of calling ``extend`` on ``train_data.X`` /
# ``train_data.y``: extending would mutate the loader's own lists in place, so
# re-running this cell would append the dev examples a second time.
X = list(train_data.X) + list(dev_data.X)
y = list(train_data.y) + list(dev_data.y)

# Every example must still have exactly one label after pooling.
assert len(y) == len(X)

In [13]:
# Hold out 15% of the pooled examples as a dev set. A fixed random_state makes
# the partition, and every score derived from it, repeatable across runs.
X_train_raw, X_dev_raw, y_train, y_dev = train_test_split(
    X, y, test_size=0.15, random_state=42
)
# Vectorise with word 1- to 5-grams. The raw partitions keep their own names
# so X_train / X_dev only ever refer to the feature matrices.
X_train, X_dev = tfid_transform(X_train_raw, X_dev_raw, 1, 5)

In [14]:
from sklearn.cluster import KMeans

In [15]:
# Two clusters; the seed fixes the centroid initialisation so the clustering
# (and the score computed from it) is the same on every run.
model = KMeans(n_clusters=2, random_state=42)

In [16]:
# Fit the model on X_train alone; no labels are passed to this call.
model.fit(X_train)

KMeans(n_clusters=2)

In [23]:
# Predictions for X_dev from `model` (presumably the KMeans instance fitted in
# the preceding cell — TODO confirm, `model` is rebound later in the notebook).
pred = model.predict(X_dev)

In [24]:
# NOTE(review): score_kmeans is defined in a later cell, so this call raises
# NameError under Restart & Run All unless that cell has been executed first.
# NOTE(review): the score compares raw predicted ids with the labels in y_dev;
# cluster numbering is presumably arbitrary — confirm the ids line up.
score_kmeans(pred, y_dev)

0.545653761869978

In [21]:
def score_kmeans(predictions, truth):
    """Return the fraction of positions where prediction and label are equal.

    Args:
        predictions: Iterable of predicted values.
        truth: Sized iterable of reference labels; its length is the
            denominator of the returned fraction.

    Returns:
        Number of equal pairs divided by ``len(truth)``, or ``0.0`` when
        ``truth`` is empty.

    Note:
        Pairs are formed with ``zip``, so if the inputs differ in length the
        surplus items of the longer one are not compared.
    """
    total = len(truth)
    if total == 0:
        # Nothing to score; avoid a ZeroDivisionError on empty input.
        return 0.0

    matches = sum(1 for pred, true in zip(predictions, truth) if pred == true)
    return matches / total

In [1]:
# 80/20 split of the pooled data. The raw partitions keep a trailing underscore
# so X_train / X_dev stay free for the vectorised matrices; the seed makes the
# partition repeatable across runs.
X_train_, X_dev_, y_train, y_dev = train_test_split(
    X, y, test_size=0.2, random_state=42
)

NameError: name 'train_test_split' is not defined

In [18]:
# Number of examples in the raw training partition.
len(X_train_)

14597

In [15]:
# Vectorise the raw partitions with word n-grams of length 1 to 5.
X_train, X_dev = tfid_transform(X_train_, X_dev_, 1, 5)

In [16]:
# Shape of the training matrix returned by tfid_transform.
X_train.shape

(14597, 591060)

In [None]:
# Repeat the n-gram range search on five random splits of the pooled data and
# report, per split, the dev score of every (lo, hi) pair and the winner.
for i in range(5):
    print(f"Random split {i}")
    # Seed each split with its index: the five partitions differ from one
    # another but are identical on every re-run of this cell.
    X_train_, X_dev_, y_train, y_dev = train_test_split(
        X, y, test_size=0.11, random_state=i
    )
    print("Performance on dev dataset:")

    best = 0
    best_range = None  # (lo, hi) that produced `best`; stays None if no score beats 0
    for lo in range(1, 2):  # only lo == 1 is explored
        for hi in range(lo + 1, 10):
            # The vectoriser is refitted for every range on the training part only.
            X_train, X_dev = tfid_transform(X_train_, X_dev_, lo, hi)

            model = fit(X_train, y_train, smoothing=1)
            y_pred = predict(model, X_dev)
            score = accuracy_score_scalers(y_dev, y_pred)
            if score > best:
                print("\tbest so far")
                best = score
                best_range = (lo, hi)
            print(f"\t (lo,hi): {lo,hi} \tscore: {score:0.3f}")

    # Summarise the winner so it does not have to be read off the log above.
    print(f"\tbest (lo,hi) for split {i}: {best_range} \tscore: {best:0.3f}")

In [84]:
# Load the split selected by mode="test". The pooled X and y built earlier are
# reused unchanged in the following cells as the full training set.
test_data = CipherTxtData(mode="test", split=False)

In [85]:
# Word n-gram range used for the final model (passed to tfid_transform next).
lo, hi = (1, 5)

In [86]:
# Fit the vectoriser on the pooled X and apply it to the test texts.
X_train, X_test = tfid_transform(X, test_data.X, lo, hi)

In [87]:
# Train on the pooled data and predict labels for the test matrix.
# NOTE: despite its name, y_test holds the model's predictions, not ground truth.
model = fit(X_train, y, smoothing=1)
y_test = predict(model, X_test)

In [88]:
# Hand the test predictions to save_results (imported from utils; what it
# writes and where is not visible in this notebook).
save_results(y_test)