In [9]:
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from time import time

def read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
        yield (1 if label == "+" else -1, words.split())

def build_vocabulary(trainfile, min_count=2):
    word_counts = Counter()
    for _, words in read_from(trainfile):
        word_counts.update(words)
    return set(word for word, count in word_counts.items() if count >= min_count)

def convert_to_matrix(trainfile, vocabulary):
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    data = []
    labels = []
    for label, words in read_from(trainfile):
        data.append(' '.join(words))
        labels.append(label)
    X = vectorizer.fit_transform(data)
    y = np.array(labels)
    return X, y

def train_with_sklearn(trainfile, devfile, epochs=10):
    vocabulary = build_vocabulary(trainfile)
    X_train, y_train = convert_to_matrix(trainfile, vocabulary)
    X_dev, y_dev = convert_to_matrix(devfile, vocabulary)
    clf = SGDClassifier(loss='hinge')
    for epoch in range(epochs):
        start_time = time()
        clf.partial_fit(X_train, y_train, classes=np.unique(y_train))
        y_pred_dev = clf.predict(X_dev)
        end_time = time()
        dev_error_rate = 1 - accuracy_score(y_dev, y_pred_dev)
        running_time = end_time - start_time
        print(f"Epoch {epoch+1}/{epochs}, Development Error Rate: {dev_error_rate * 100:.2f}%, Running Time: {running_time:.2f} seconds")
    return clf

trainfile = 'train.txt'
devfile = 'dev.txt'
classifier = train_with_sklearn(trainfile, devfile)


Epoch 1/10, Development Error Rate: 35.70%, Running Time: 0.01 seconds
Epoch 2/10, Development Error Rate: 30.80%, Running Time: 0.00 seconds
Epoch 3/10, Development Error Rate: 28.70%, Running Time: 0.00 seconds
Epoch 4/10, Development Error Rate: 30.40%, Running Time: 0.00 seconds
Epoch 5/10, Development Error Rate: 27.10%, Running Time: 0.00 seconds
Epoch 6/10, Development Error Rate: 28.10%, Running Time: 0.00 seconds
Epoch 7/10, Development Error Rate: 27.30%, Running Time: 0.00 seconds
Epoch 8/10, Development Error Rate: 27.40%, Running Time: 0.00 seconds
Epoch 9/10, Development Error Rate: 27.40%, Running Time: 0.00 seconds
Epoch 10/10, Development Error Rate: 27.70%, Running Time: 0.00 seconds
