Building a Rule-based Sentiment Classifier

In [2]:
def extract_features(x: str) -> dict[str, float]:
    """Map a text to a sparse dictionary of hand-crafted sentiment features.

    The text is split on single space characters and every resulting token
    is compared (exact, case-sensitive match) against two small word lists.

    Args:
        x: The text to featurize.

    Returns:
        A dict that may contain:
          * 'good_word_count': number of tokens found in the positive list
            (key is absent when no token matches),
          * 'bad_word_count': number of tokens found in the negative list
            (key is absent when no token matches),
          * 'bias': always 1, so that a weight attached to it acts as a
            default score for every text.
    """
    # (feature name, vocabulary) pairs; checked in this order for every token.
    lexicons = (
        ('good_word_count',
         ['love', 'good', 'nice', 'great', 'enjoy', 'enjoyed']),
        ('bad_word_count',
         ['hate', 'bad', 'terrible', 'disappointing', 'sad', 'lost', 'angry']),
    )

    features = {}
    for token in x.split(' '):
        for feature_name, vocabulary in lexicons:
            if token in vocabulary:
                features[feature_name] = features.get(feature_name, 0) + 1

    # Constant feature present on every example.
    features['bias'] = 1

    return features

# Hand-set weight for each feature name. The positive bias weight means a
# text with no matching good/bad words still gets a score above zero.
feature_weights = {
    'good_word_count': 1.0,
    'bad_word_count': -1.0,
    'bias': 0.5,
}

Data Reading 

In [2]:
def read_xy_data(filename: str) -> tuple[list[str], list[int]]:
    """Read a labelled text dataset from a file.

    Each non-blank line is expected to have the form ``label|||text``, where
    ``label`` parses as an integer (surrounding whitespace is tolerated by
    ``int``). Blank lines are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        A pair ``(x_data, y_data)``: the list of texts and the list of
        integer labels, in file order. The text is returned exactly as it
        appears after the first ``|||`` (minus the line's outer whitespace).

    Raises:
        ValueError: If a non-blank line has no ``|||`` separator or its label
            is not an integer.
    """
    x_data = []
    y_data = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            # Split only on the first separator so that a text which itself
            # contains "|||" is kept intact.
            label, text = line.split("|||", 1)
            x_data.append(text)
            y_data.append(int(label))
    return x_data, y_data

In [None]:
# Load the training and test splits; each call returns (texts, labels).
# The file names are relative paths, so they are resolved against the
# current working directory.
x_train,y_train = read_xy_data('train.txt')
x_test,y_test = read_xy_data('test.txt')

In [None]:
# Sanity check: show the first training text and its label.
print(x_train[0])
print(y_train[0])

Run the Classifier and Calculate Accuracy 

In [5]:
def run_classifier(x):
    """Predict a sentiment label for a single text.

    The score is the weighted sum of the features returned by
    ``extract_features(x)``, using ``feature_weights``; a feature with no
    entry in ``feature_weights`` contributes nothing.

    Returns:
        1 if the score is positive, -1 if it is negative, 0 if it is
        exactly zero.
    """
    score = sum(
        feature_weights.get(name, 0) * value
        for name, value in extract_features(x).items()
    )
    if score > 0:
        return 1
    if score < 0:
        return -1
    return 0


In [6]:
def Calculate_accuracy(x_data, y_data):
    """Compute the fraction of examples that ``run_classifier`` labels correctly.

    Texts and labels are paired with ``zip``, so if the two inputs differ in
    length the extra items of the longer one are ignored.

    Args:
        x_data: Iterable of texts.
        y_data: Iterable of gold labels, aligned with ``x_data``.

    Returns:
        Accuracy as a float in [0, 1]. Returns 0.0 when there are no
        examples, instead of raising ``ZeroDivisionError``.
    """
    total_number = 0
    correct_number = 0
    for x, y in zip(x_data, y_data):
        total_number += 1
        if run_classifier(x) == y:
            correct_number += 1
    if total_number == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0
    return correct_number / total_number

In [None]:
# Tally how many test examples carry each label, to see the class balance.
label_count = {}
for y in y_test:
    label_count[y] = label_count.get(y, 0) + 1
print(label_count)

In [None]:
# Evaluate the classifier on both splits and report the fraction of
# examples whose predicted label equals the gold label.
train_accuracy = Calculate_accuracy(x_train, y_train)
test_accuracy = Calculate_accuracy(x_test, y_test)
print(f'Train accuracy: {train_accuracy}')
print(f'Dev/test accuracy: {test_accuracy}')

Error Analysis 

In [None]:
import random
def find_errors(x_data, y_data, n_samples=5):
    """Print a random sample of examples that the classifier gets wrong.

    Every text is run through ``run_classifier``; the indices where the
    prediction differs from the gold label are collected, and up to
    ``n_samples`` distinct ones are chosen at random and printed together
    with their true and predicted labels.

    Args:
        x_data: Indexable sequence of texts.
        y_data: Indexable sequence of gold labels, aligned with ``x_data``.
        n_samples: Maximum number of misclassified examples to print.

    Returns:
        None. Output is written with ``print``.
    """
    error_ids = []
    y_preds = []
    for i, (x, y) in enumerate(zip(x_data, y_data)):
        y_pred = run_classifier(x)
        y_preds.append(y_pred)
        if y != y_pred:
            error_ids.append(i)

    if not error_ids:
        # random.choice/sample cannot draw from an empty population.
        print('No misclassified examples found.')
        return

    # Sample without replacement so the same example is never shown twice,
    # and never ask for more items than there are errors.
    sample_size = max(0, min(n_samples, len(error_ids)))
    for my_id in random.sample(error_ids, sample_size):
        x, y, y_pred = x_data[my_id], y_data[my_id], y_preds[my_id]
        print(f'{x}\ntrue label: {y}\npredicted label: {y_pred}\n')

In [None]:
# Print a few randomly chosen misclassified training examples for inspection.
find_errors(x_train, y_train)