In [13]:
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from time import time

def read_from(textfile):
    """Yield ``(label, words)`` pairs parsed from a labelled text file.

    Each non-blank line is expected to have the form ``label<TAB>text``.
    A label of ``"+"`` is mapped to ``1`` and any other label to ``-1``;
    the text is split on whitespace into a list of words.

    Args:
        textfile: Path of the file to read.

    Yields:
        Tuples ``(label, words)`` where ``label`` is ``1`` or ``-1`` and
        ``words`` is a list of strings.

    Raises:
        ValueError: If a non-blank line (after stripping surrounding
            whitespace) contains no tab separator.
    """
    # The context manager closes the file once the generator is exhausted
    # or discarded. The encoding is pinned so parsing does not depend on the
    # platform's default locale.
    with open(textfile, encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split on the first tab only: any further tabs belong to the
            # text and are treated as ordinary whitespace by ``split()``.
            label, text = line.split("\t", 1)
            yield (1 if label == "+" else -1, text.split())

def build_vocabulary(trainfile):
    """Return the set of words that occur more than once in ``trainfile``.

    Word occurrences are tallied over every example produced by
    ``read_from(trainfile)``; words seen only once are left out.

    Args:
        trainfile: Path of the training file, passed to ``read_from``.

    Returns:
        A ``set`` of words whose total count is at least two.
    """
    frequencies = Counter()
    for _label, tokens in read_from(trainfile):
        frequencies.update(tokens)
    return {token for token, seen in frequencies.items() if seen >= 2}

def get_data_and_labels(file, vocabulary):
    """Load examples from ``file`` keeping only in-vocabulary words.

    Args:
        file: Path of the data file, passed to ``read_from``.
        vocabulary: Container of allowed words; membership is tested
            with ``in``.

    Returns:
        A pair ``(data, labels)``. ``data`` is a list with one string per
        example, made of the example's in-vocabulary words joined by single
        spaces (in their original order); ``labels`` is the list of labels
        yielded by ``read_from``, in the same order.
    """
    documents = []
    targets = []
    for target, tokens in read_from(file):
        # Drop every word that is not part of the vocabulary.
        kept = (token for token in tokens if token in vocabulary)
        documents.append(' '.join(kept))
        targets.append(target)
    return documents, targets

trainfile = 'train.txt'
devfile = 'dev.txt'

# Fixed seed so the reported development error is reproducible: without it
# MLPClassifier draws its initial weights and shuffles minibatches using a
# different random state on every run.
RANDOM_SEED = 0

# Words that occur more than once in the training data.
vocabulary = build_vocabulary(trainfile)

# Get the data ready for training and evaluation: one space-joined string of
# in-vocabulary words per example, plus the matching labels.
train_data, train_labels = get_data_and_labels(trainfile, vocabulary)
dev_data, dev_labels = get_data_and_labels(devfile, vocabulary)

# Bag-of-words counts; the feature columns are learned from the training
# data only and then reused for the development data.
# NOTE(review): CountVectorizer's defaults lowercase the text and keep only
# tokens of two or more word characters, so the resulting features may not
# match the pruned vocabulary exactly - confirm this is intended.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_dev = vectorizer.transform(dev_data)

# Map the labels to 0/1 class ids: label 1 stays 1, every other label
# becomes 0.
y_train = (np.asarray(train_labels) == 1).astype(int)
y_dev = (np.asarray(dev_labels) == 1).astype(int)

# Initialize the neural network classifier: one hidden layer of 100 ReLU
# units trained with Adam for at most 300 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    activation='relu',
    solver='adam',
    random_state=RANDOM_SEED,
)

# The timed region covers both training and prediction on the dev set.
start_time = time()

# Train the classifier
mlp.fit(X_train, y_train)

# Predictions on the development set
dev_predictions = mlp.predict(X_dev)

# Measure elapsed time
end_time = time()

dev_accuracy = accuracy_score(y_dev, dev_predictions)
dev_error = 1 - dev_accuracy
print(f"Development Set Error: {dev_error * 100:.2f}%")
print(f"Running Time: {end_time - start_time:.2f} seconds")


Development Set Error: 26.00%
Running Time: 39.39 seconds
