In [None]:
# Import libraries
import argparse
import time

# Container for one labelled sentence: its tokens plus its sentiment label
class SentimentExample:
    """A single sentiment example.

    Attributes are set directly from the constructor arguments:
    ``words`` holds the tokens and ``label`` holds the sentiment label.
    """

    def __init__(self, words, label):
        self.label = label
        self.words = words

    def __repr__(self):
        # Rendered as "<repr of words>; label=<repr of label>"
        return "{!r}; label={!r}".format(self.words, self.label)

    def __str__(self):
        # The printable form is the same as the debugging form
        return repr(self)


# Reads sentiment examples, one per line: a label field followed by the sentence tokens.
def read_sentiment_examples(infile):
    """Load labelled sentences from a text file.

    Every non-blank line is stripped and split on single spaces. The first
    field carries the label: 0 if that field contains the character "0",
    otherwise 1. All remaining fields become the example's tokens.

    NOTE(review): the data is described elsewhere as [0 or 1]<TAB>[sentence],
    but the split below is on spaces only - confirm against the actual files.

    Args:
        infile: path of the file to read (decoded with the 'iso8859' codec).

    Returns:
        A list of SentimentExample objects in file order.
    """
    exs = []
    # `with` closes the handle even when parsing raises part-way through the file
    with open(infile, encoding='iso8859') as f:
        for line in f:
            stripped = line.strip()
            # A blank line has no label field; skip it instead of inventing an empty example
            if not stripped:
                continue
            fields = stripped.split(" ")
            label = 0 if "0" in fields[0] else 1
            exs.append(SentimentExample(fields[1:], label))
    return exs

#### Load the data

In [None]:
# Mount Google Drive so that files stored there can be read from paths under /content/gdrive.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
# force_remount=True is passed explicitly; presumably it redoes the mount when the drive is already mounted
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# YOU NEED TO CHANGE THE PATHS OF train_file AND dev_file TO MATCH WHERE YOU STORE THEM IN YOUR gdrive.
# The Week 2 tutorial video (May 17th Zoom recording) by Punar (GA) explains how to do it.

#"TODO" change path for train_file
train_file = '/content/gdrive/MyDrive/Assignment 1/Data/train.txt'
#"TODO" change path for dev_file
dev_file = '/content/gdrive/MyDrive/Assignment 1/Data/dev.txt'

# Read both splits into lists of SentimentExample
train_exs = read_sentiment_examples(train_file)
dev_exs = read_sentiment_examples(dev_file)

# Tally the class balance of the training split: label 1 is positive, anything else is negative
n_pos = 0
for ex in train_exs:
    n_pos += int(ex.label == 1)
n_neg = len(train_exs) - n_pos

print("%d train examples: %d positive, %d negative" % (len(train_exs), n_pos, n_neg))
print("%d dev examples" % len(dev_exs))


In [None]:
# Bijection between objects and integers starting at 0. Useful for mapping
# labels, features, etc. into coordinates of a vector space.

# This class maps objects (here words) to unique indices and back again.
# Indices are handed out in insertion order, e.g. apple->0, banana->1, and so on.
class Indexer(object):
    """Two-way lookup table between hashable objects and consecutive integer ids."""

    def __init__(self):
        # Forward (object -> id) and reverse (id -> object) tables, kept in sync
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        # List every stored object, stringified, in index order
        names = [str(self.get_object(position)) for position in range(len(self))]
        return str(names)

    def __str__(self):
        return self.__repr__()

    def __len__(self):
        # Number of distinct objects indexed so far
        return len(self.objs_to_ints)

    # Returns the object stored at the given index, or None if the index is unused
    def get_object(self, index):
        return self.ints_to_objs.get(index)

    # True when the object has already been assigned an index
    def contains(self, object):
        return self.index_of(object) >= 0

    # Returns the object's index, or -1 if the object isn't present
    def index_of(self, object):
        return self.objs_to_ints.get(object, -1)

    # With add=True (the default) an unseen object gets the next free index and that
    # index is returned. With add=False this is a pure lookup and may return -1.
    def add_and_get_index(self, object, add=True):
        if add and object not in self.objs_to_ints:
            next_id = len(self.objs_to_ints)
            self.objs_to_ints[object] = next_id
            self.ints_to_objs[next_id] = object
        return self.index_of(object)

### Define Logistic Regression model

In [None]:
# Import libraries
from collections import Counter
from typing import List
import numpy as np
import math

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Feature extraction base type. Takes an example and returns an indexed list of features.
class FeatureExtractor(object):
    """Interface for feature extractors; subclasses must override extract_features."""

    # Extract features. Includes a flag add_to_indexer to control whether the indexer should be expanded.
    # At test time, any unseen features should be discarded, but at train time, we probably want to keep growing it.
    def extract_features(self, ex, add_to_indexer):
        """Return the features of ``ex``.

        Args:
            ex: the example to featurize.
            add_to_indexer: whether unseen features may be added to the indexer.

        Raises:
            NotImplementedError: always, because this base class has no implementation.
        """
        # NotImplementedError is the conventional signal for an abstract method and is
        # still an Exception subclass, so existing `except Exception` handlers keep working.
        raise NotImplementedError("Don't call me, call my subclasses")


# Extracts unigram bag-of-words count features, keeping only words VADER scores as strongly polar
class UnigramFeatureExtractor(FeatureExtractor):
    """Counts the strongly positive or negative words of an example.

    A word is kept only when the absolute value of its VADER 'compound' score
    reaches ``polarity_threshold``; every other word is ignored.
    """

    # Minimum |compound| score a word needs in order to become a feature
    polarity_threshold = 0.5

    def __init__(self, indexer: Indexer):
        self.indexer = indexer
        self.sid = SentimentIntensityAnalyzer()

    def extract_features(self, ex, add_to_indexer=False):
        """Return a Counter mapping feature index -> occurrence count for ``ex.words``.

        Args:
            ex: example whose ``words`` are scanned.
            add_to_indexer: when True, unseen polar words are added to the indexer;
                when False, words missing from the indexer are dropped.
        """
        features = Counter()
        for w in ex.words:
            # Score each word once and reuse the value for both polarity directions
            compound = self.sid.polarity_scores(w)['compound']
            if abs(compound) >= self.polarity_threshold:
                feat_idx = self.indexer.add_and_get_index(w) if add_to_indexer else self.indexer.index_of(w)
                # -1 means the word is unknown and the indexer was not allowed to grow
                if feat_idx != -1:
                    features[feat_idx] += 1.0
        return features

In [None]:
# Sentiment classifier base type
class SentimentClassifier(object):
    """Interface for sentiment classifiers; subclasses must override predict."""

    # Makes a prediction for the given example
    def predict(self, ex: SentimentExample):
        """Return the prediction for ``ex``.

        Raises:
            NotImplementedError: always, because this base class has no implementation.
        """
        # NotImplementedError is the conventional signal for an abstract method and is
        # still an Exception subclass, so existing `except Exception` handlers keep working.
        raise NotImplementedError("Don't call me, call my subclasses")


# Always predicts the positive class
class AlwaysPositiveClassifier(SentimentClassifier):
    """Trivial baseline classifier that labels every example as positive."""

    def predict(self, ex: SentimentExample):
        """Return the label 1 regardless of ``ex`` (the argument is not inspected)."""
        return 1

In [None]:
class LogisticRegressionClassifier(SentimentClassifier):
    """Binary logistic regression over indexed bag-of-words features, trained with SGD.

    An example is scored as sigmoid(w . x + b), where x is the dense count
    vector built from the feature extractor's output for that example.
    """

    def __init__(self, train_exs, feat_extractor: FeatureExtractor, num_iters=50, reg_lambda=0.0, learning_rate=0.2):
        """Initialise the parameters and immediately fit them on ``train_exs``.

        Args:
            train_exs: training examples, each exposing ``.label`` and whatever
                the feature extractor reads.
            feat_extractor: extractor whose indexer has ALREADY been populated; the
                weight vector is sized from it here and is never resized afterwards.
            num_iters: number of passes (epochs) over ``train_exs``.
            reg_lambda: L2 regularisation strength applied to the weights (not the bias).
            learning_rate: step size used for every gradient step.
        """
        ##### SOLUTION START #####
        self.feat_extractor = feat_extractor
        self.feature_size = len(feat_extractor.indexer.objs_to_ints)

        # Weights start at zero with shape (1, feature_size) so that w.dot(x) is a (1, 1) score
        self.w = np.zeros([1, self.feature_size])
        self.b = .1
        ##### SOLUTION END #####

        self.train(train_exs, num_iters, reg_lambda, learning_rate)

    def _to_vector(self, features):
        # Expand a sparse {feature index: count} mapping into a dense (feature_size, 1) column
        x = np.zeros([self.feature_size, 1])
        for (k, v) in features.items():
            x[k] = v
        return x

    def _probability(self, x):
        # Sigmoid of the linear score; the result has shape (1, 1)
        o = self.w.dot(x) + self.b
        return 1.0 / (1.0 + np.exp(-o))

    def train(self, train_exs, num_iters=50, reg_lambda=0.0, learning_rate=0.1):
        """Fit w and b by stochastic gradient descent on the logistic (cross-entropy) loss.

        For each example the update is
            w <- w - learning_rate * ((y_hat - y) * x^T + reg_lambda * w)
            b <- b - learning_rate * (y_hat - y)
        which is the gradient of the cross-entropy loss plus an L2 penalty of
        (reg_lambda / 2) * ||w||^2. One line with the mean cross-entropy
        (penalty excluded) is printed per epoch.

        Args:
            train_exs: training examples, visited in the given order every epoch.
            num_iters: number of epochs.
            reg_lambda: L2 regularisation strength; 0.0 disables the penalty.
            learning_rate: step size.
        """
        ##### SOLUTION START #####
        # Features do not depend on the weights, so extract them once instead of once per epoch
        cached = [(self.feat_extractor.extract_features(ex, False), ex.label) for ex in train_exs]

        for itr in range(num_iters):
            losses = []
            for feats, y in cached:
                x = self._to_vector(feats)
                o = self.w.dot(x) + self.b
                y_hat = 1.0 / (1.0 + np.exp(-o))
                # Cross-entropy written on the logit, log(1 + e^o) - y*o, which never evaluates log(0)
                losses.append(np.logaddexp(0.0, o) - y * o)
                err = y_hat - y
                dw = err * np.transpose(x) + reg_lambda * self.w
                db = err
                self.w = self.w - learning_rate * dw
                self.b = self.b - learning_rate * db
            # With no training examples there is nothing to average
            mean_loss = np.mean(losses) if losses else float('nan')
            print ('Iteration : {0}, Loss : {1}'.format(itr, mean_loss))
        ##### SOLUTION END #####

    def predict(self, ex):
        """Score a single example.

        Returns:
            A tuple ``(probability, x)``: ``probability`` is a (1, 1) array holding
            sigmoid(w . x + b) and ``x`` is the dense (feature_size, 1) feature
            column that was scored. Features missing from the indexer are ignored.
        """
        ##### SOLUTION START #####
        x = self._to_vector(self.feat_extractor.extract_features(ex, False))
        return self._probability(x), x
        ##### SOLUTION END #####

#### Training function for logistic regression



In [None]:
# Train a logistic regression model on the given training examples using the given FeatureExtractor
def train_lr(train_exs, feat_extractor: FeatureExtractor, reg_lambda=0.0, num_iters=500, learning_rate=0.02):
    """Populate the extractor's indexer, then fit a LogisticRegressionClassifier.

    Args:
        train_exs: training examples.
        feat_extractor: extractor to populate and hand to the classifier.
        reg_lambda: regularisation strength forwarded to the classifier.
        num_iters: number of training epochs.
        learning_rate: gradient-descent step size.

    Returns:
        The trained LogisticRegressionClassifier.
    """
    ##### SOLUTION START #####
    # First pass: let the indexer see every training feature so the weight vector can be sized
    for tx in train_exs:
        feat_extractor.extract_features(tx, True)

    return LogisticRegressionClassifier(
        train_exs,
        feat_extractor,
        num_iters=num_iters,
        reg_lambda=reg_lambda,
        learning_rate=learning_rate,
    )
    ##### SOLUTION END #####


In [None]:
# Main entry point for your modifications. Builds the feature extractor, then trains and
# returns the model selected by model_type.
def train_model(feature_type, model_type, train_exs, reg_lambda=0.0):
    """Train and return a classifier.

    Args:
        feature_type: only "unigram" is accepted.
        model_type: "AlwaysPositive" or "LogisticRegression".
        train_exs: training examples handed to the trainer.
        reg_lambda: regularisation strength forwarded to train_lr.

    Raises:
        Exception: if feature_type or model_type is not one of the accepted values.
    """
    # Reject unknown feature types before building anything
    if feature_type != "unigram":
        raise Exception("Pass unigram")
    # Add additional preprocessing code here
    feat_extractor = UnigramFeatureExtractor(Indexer())

    # Dispatch on the requested model
    if model_type == "AlwaysPositive":
        return AlwaysPositiveClassifier()
    if model_type == "LogisticRegression":
        return train_lr(train_exs, feat_extractor, reg_lambda=reg_lambda)
    raise Exception("Pass AlwaysPositive or LogisticRegression")

### Functions for evaluating the model

In [None]:
# Evaluates a given classifier on the given examples
def evaluate(classifier, exs):
    """Run ``classifier`` over ``exs`` and report its accuracy.

    Args:
        classifier: object with a ``predict(ex)`` method.
        exs: examples, each exposing the gold ``.label``.

    Returns:
        Whatever print_evaluation returns for the gold labels and predictions
        (the accuracy as a percentage).
    """
    predictions = []
    for ex in exs:
        pred = classifier.predict(ex)
        # LogisticRegressionClassifier.predict returns (probability, features) while
        # AlwaysPositiveClassifier.predict returns a bare label; accept both shapes.
        if isinstance(pred, tuple):
            pred = pred[0]
        predictions.append(pred)
    return print_evaluation([ex.label for ex in exs], predictions)


# Prints accuracy comparing golds and predictions. Predictions may be 0/1 labels or
# probabilities; anything >= 0.5 counts as the positive class.
def print_evaluation(golds, predictions):
    """Print and return the accuracy of ``predictions`` against ``golds``.

    Args:
        golds: sequence of gold 0/1 labels.
        predictions: sequence of scores or labels, thresholded at 0.5. The
            sequence itself is left unmodified.

    Returns:
        Accuracy as a percentage in [0, 100]; 0.0 when there are no examples.

    Raises:
        Exception: if the two sequences differ in length.
    """
    # Validate before doing any work so a mismatch is reported on untouched inputs
    if len(golds) != len(predictions):
        raise Exception("Mismatched gold/pred lengths: %i / %i" %
                        (len(golds), len(predictions)))

    # Threshold into a fresh list rather than overwriting the caller's predictions
    labels = [1 if p >= 0.5 else 0 for p in predictions]

    num_total = len(golds)
    num_correct = sum(1 for gold, label in zip(golds, labels) if label == gold)
    # An empty evaluation set has no meaningful accuracy; report 0 instead of dividing by zero
    accuracy = num_correct * 100.0 / num_total if num_total else 0.0

    print("Accuracy: %i / %i = %.2f %%" %
          (num_correct, num_total, accuracy))
    return accuracy
    
# Evaluate on the train and dev datasets (the module-level train_exs and dev_exs)
def eval_train_dev(model):
    """Print and collect the model's accuracy on both splits.

    Returns:
        A two-element list: [train accuracy, dev accuracy].
    """
    print("===== Train Accuracy =====")
    accuracies = [evaluate(model, train_exs)]
    print("===== Dev Accuracy =====")
    accuracies.append(evaluate(model, dev_exs))
    return accuracies

### Evaluating Model with Unigram Bag-of-Words Representation

In [None]:
# Evaluate logistic regression with unigram features
# Training happens inside train_model, using the train_exs loaded earlier in the notebook.
unigram_model = train_model('unigram', 'LogisticRegression', train_exs)
# eval_train_dev returns [train accuracy, dev accuracy]; as the last expression of the
# cell, that list is also what the notebook displays as the cell's output.
eval_train_dev(unigram_model)