# NLP Assignment 1 (40% of grade): Sentiment Analysis from Tweets

This coursework will involve you implementing functions for a text classifier, which you will train to identify the **sentiment expressed in a text** in a dataset of approx. 27,000 entries, which will be split into a 80%/20% training/test split. 

In this template you are given the basis for that implementation, though some of the functions are missing, which you have to fill in.

Follow the instructions file **NLP_Assignment_1_Instructions.pdf** for details of each question - the outline of what needs to be achieved for each question is as below.

You must submit all **ipython notebooks and extra resources you need to run the code if you've added them** in the code submission, and a **2 page report (pdf)** in the report submission on QMPlus where you report your methods and findings according to the instructions file for each question.

In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import pandas as pd
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shreyshrivastava/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label) in raw_data[:num_training_samples]:
        train_data.append((to_feature_vector(pre_process(text)),label))
    for (text, label) in raw_data[num_training_samples:]:
        test_data.append((to_feature_vector(pre_process(text)),label))

# Question 1: Input and Basic preprocessing (10 marks)

In [3]:
import re
def parse_data_line(data_line):
    #checking if the length of data is greater than or equal to 2
    if len(data_line) >= 2:
        labels= data_line[1]  #labels at index 1 of read data
        text_data = data_line[2]  #texts i n the index 2 of the read data
    else:
        labels, text_data = None, None  
    return labels, text_data


In [4]:
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Downloaded the required nltk modules because my laptop didnt have it earlier
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def pre_process(text):  # doing the lemmatization, removing stopwords, and extracting unigrams / bigrams
    # Converting everything to lower case for easier checking 
    text = text.lower()
    # Remove non-alphanumeric characters and split punctuation from words
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text)
    tokens = text.split()
    stp_words = set(stopwords.words('english'))
    lemm = WordNetLemmatizer()
    tokens = [lemm.lemmatize(token) for token in tokens if token not in stp_words] #this step is for lemmatisation   
    return tokens  

#implementation
text_to_check = "This is an example sentence."
tokens_implementation = pre_process(text_to_check)
print(tokens_implementation)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreyshrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shreyshrivastava/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['example', 'sentence', '.']


# Question 2: Basic Feature Extraction (20 marks)

In [5]:
global_feature_dict={}
def to_feature_vector(tokens):
    feature_vector = {}
    
    def add_feature(feature): #i added features tothe global feature dict
        # If the feature not in the global feature dictionary -->  i add it
        if feature not in global_feature_dict:
            global_feature_dict[feature] = len(global_feature_dict)    
        # Getting the index of the feature
        feature_index = global_feature_dict[feature]
        # Increment the count of the feature in the local feature vector
        feature_vector[feature_index] = feature_vector.get(feature_index, 0) + 1

    # for unigrams
    for word in tokens:
        add_feature(word)
    # for bigrams
    for i in range(len(tokens) - 1):
        # Created a tuple to represent the bigram
        bigram = (tokens[i], tokens[i + 1])
        # Add the bigram feature to the local feature vector
        add_feature(bigram)
    # Return the local feature vector for the current example
    return feature_vector

In [6]:
def train_classifier(data): # to train ourn model
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC(max_iter=10000, dual=True))]) #used linearSVC to train the model
    return SklearnClassifier(pipeline).train(data)

# Question 3: Cross-validation (20 marks)

In [8]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

def cross_validate(dataset, folds):
    results = {'precision': [], 'recall': [], 'f1': [], 'accuracy': []}
    fold_size = int(len(dataset) / folds) + 1
    
    for i in range(0, len(dataset), fold_size):
        # Splitting the data into training and testing sets for cross-validation
        test_set = dataset[i:i + fold_size]
        train_set = dataset[:i] + dataset[i + fold_size:]
        # Train the classifier on the training set
        model = train_classifier(train_set)
        # Get the features and labels from the test set
        test_features, test_labels = zip(*test_set)
        # Predicting the labels on the test set
        predicted_labels = predict_labels(test_features, model)
        # Evaluate the performance of our model
        precision, recall, f1, _ = precision_recall_fscore_support(test_labels, predicted_labels, average='weighted')
        accuracy = np.mean(np.array(test_labels) == np.array(predicted_labels))
        # Store the results for this fold
        results['precision'].append(precision)
        results['recall'].append(recall)
        results['f1'].append(f1)
        results['accuracy'].append(accuracy)
    # Calculate and print the average results over all folds
    average_outputs = {
        "avg_precision" : np.mean(results['precision']),
        "avg_recall" : np.mean(results['recall']),
        "avg_f1" : np.mean(results['f1']),
        "avg_accuracy" : np.mean(results['accuracy'])}
    return average_outputs

In [9]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

In [10]:
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'sentiment-dataset.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 


# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 33540 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 33540 rawData, 26832 trainData, 6708 testData
Training Samples: 
26832
Features: 
348969


In [11]:
cross_validate(train_data, 10)  # will work and output overall performance of p, r, f-score when cv implemented

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


{'avg_precision': 0.8568994188126245,
 'avg_recall': 0.8586024918745642,
 'avg_f1': 0.856529943578581,
 'avg_accuracy': 0.8586024918745642}

# Questions 5: Optimising pre-processing and feature extraction (30 marks)

**Note:** it is advisable to implement question 5 in a separate notebook where you further develop the pre-processing and feature extraction functions you implemented above.

In [15]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

({148: 1, 2: 2, 2318: 1, 140003: 1, 140: 1, 290449: 1, 290450: 1, 290451: 1, 153: 2, 279: 1, 84155: 1, 7414: 1, 5395: 1, 50935: 1, 290452: 1, 290453: 1, 290454: 1, 290455: 1, 290456: 1, 290457: 1, 754: 1, 16771: 1, 290458: 1, 290459: 1, 246727: 1}, 'positive')
Training Classifier...
Done training!
Precision: 0.857634
Recall: 0.859123
F Score:0.857643


In [13]:
from nltk.corpus import stopwords

In [14]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shreyshrivastava/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True