In [118]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re


#Read the input files and read every line
def loadData(trainingFile, testingFile):
    """Read the training and testing files and split the training lines into reviews and labels.

    Each training line is expected to hold a label, a TAB, and the review text.
    Only the first TAB is treated as the separator, so TABs inside the review
    text are preserved. Test lines are returned untouched (newline included).

    Args:
        trainingFile: path of the TAB-separated training file.
        testingFile: path of the test file holding one review per line.

    Returns:
        A tuple (train_reviews, test_reviews, train_sentiments) of three lists;
        train_reviews[i] and train_sentiments[i] come from the same input line.

    Raises:
        IndexError: if a training line contains no TAB character.
    """

    with open(trainingFile, "r") as fr1:
        trainFile = fr1.readlines()

    with open(testingFile, "r") as fr2:
        testFile = fr2.readlines()

    # Split every training line exactly once into (label, review).
    train_sentiments = []
    train_reviews = []
    for line in trainFile:
        fields = line.split("\t", 1)
        train_sentiments.append(fields[0])
        train_reviews.append(fields[1])

    # Return the lists built above; relying on same-named notebook globals
    # would fail on a fresh kernel or silently return stale data.
    return train_reviews, testFile, train_sentiments


def clean(reviews):
    """Pre-process every raw review and return the cleaned reviews as a new list.

    The output list has one entry per input review, in the same order; each
    entry is whatever preProcess returns for that review.
    """
    # Build the result in one pass; the input sequence itself is not modified.
    return [preProcess(raw_review) for raw_review in reviews]
 
def preProcess(rawReview):
    """Convert one raw review into a space-separated string of lemmatized words.

    Steps: strip HTML markup, drop e-mail addresses and URLs, replace every
    character that is neither an ASCII letter nor one of the smiley characters
    by a space, lower-case, discard English stop words and words of three or
    fewer characters, and lemmatize what remains.

    Args:
        rawReview: the raw review text as a single string.

    Returns:
        A single string in which every kept word is followed by one space
        (a non-empty result therefore ends with a trailing space; a review
        with no surviving words yields '').
    """

    # 1. Remove HTML tags. The parser is named explicitly so the extracted text
    #    does not depend on which optional parsers happen to be installed.
    text_only = BeautifulSoup(rawReview, "html.parser").get_text()
    #
    # 2. Remove Email IDs, URLs and numbers (numbers disappear in the letters-only step below)
    noEmail = re.sub(r'([\w\.-]+@[\w\.-]+\.\w+)','',text_only)

    # The URL pattern is assembled from adjacent raw-string literals. A backslash
    # followed by a newline inside a raw string is NOT a line continuation: the
    # backslash, the newline and the next line's indentation would all become
    # part of the regex and silently break the affected alternatives.
    urlPattern = (
        r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
        r'[a-z0-9.\-]+[.][a-z]{2,4}/|[a-z0-9.\-]+[.][a-z])(?:[^\s()<>]+|\(([^\s()<>]+|'
        r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
    )
    noUrl = re.sub(urlPattern, '', noEmail)

    #Emotional symbols may affect the meaning of the review
    smileys = """:-) :) :o) :D :-D :( :-( :o(""".split()
    smileyPattern = "|".join(map(re.escape, smileys))

    # NOTE(review): smileyPattern is spliced into a character class, so this keeps
    # the individual characters that make up the smileys (and '|') rather than
    # whole smiley sequences; tokens of three or fewer characters are dropped in
    # step 5 anyway. Behavior kept as-is -- confirm whether smileys should survive.
    letters_only = re.sub("[^a-zA-Z" + smileyPattern + "]", " ", noUrl)
    #
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words and words of three or fewer characters, then lemmatize the rest
    lemmatizer = WordNetLemmatizer()
    kept_words = [
        str(lemmatizer.lemmatize(word))
        for word in words
        if word not in stops and len(word) > 3
    ]
    #
    # 6. Join the words back into one string and return the result. Every word
    #    keeps a trailing space so the output format matches what callers already
    #    receive; a single join avoids repeated string concatenation in a loop.
    return ''.join(word + ' ' for word in kept_words)

def createTFIDFMatrices(train_data, test_data):
    """Encode the processed training and testing reviews as TF-IDF matrices.

    The vectorizer is fitted on the training reviews only and then reused to
    encode the test reviews, so both matrices share one vocabulary. L2 row
    normalization is requested through norm='l2'.

    Returns:
        (train_matrix, test_matrix) as produced by TfidfVectorizer.
    """
    tfidf = TfidfVectorizer(norm='l2')

    # Learn the vocabulary and IDF weights from the training corpus and encode it.
    fitted_train = tfidf.fit_transform(train_data)

    # Encode the test corpus with the parameters learned above; nothing is re-fitted.
    projected_test = tfidf.transform(test_data)

    return fitted_train, projected_test

def findSimilarities(train_matrix, test_matrix):
    """Compute the similarity of every test row against every training row.

    The result is the matrix product of test_matrix with the transpose of
    train_matrix. It equals cosine similarity only when the rows of both
    inputs are already L2-normalized (as requested in createTFIDFMatrices).

    Args:
        train_matrix: SciPy sparse matrix with one row per training review.
        test_matrix: SciPy sparse matrix with one row per test review and the
            same number of columns as train_matrix.

    Returns:
        Dense numpy array of shape (n_test, n_train); entry [i, j] is the
        similarity between test row i and training row j.
    """

    # Use the sparse matrix's own dot/transpose: numpy.dot and numpy.transpose
    # are not aware of SciPy sparse types and only work through object-array
    # operator fallback, which is fragile across NumPy/SciPy versions.
    cosineSimilarities = test_matrix.dot(train_matrix.T)

    # Densify for fast per-row sorting later; this allocates n_test * n_train values.
    similarities = cosineSimilarities.toarray()

    return similarities

def findKNearest(similarity_vector, k):
    """Return the indices of the k largest entries of similarity_vector, most similar first.

    similarity_vector is a numpy array of similarities between one test review
    and every training review. The returned indices are positions in that
    array, so they can be used directly to look up the matching training labels.
    If k exceeds the array length, all indices are returned.
    """
    # Sorting the negated scores in ascending order is a descending sort of the
    # scores; argsort reports positions instead of values.
    descending_order = np.argsort(-similarity_vector)
    top_k = descending_order[:k]
    return top_k
     

def predict(nearestNeighbors, labels):
    """Majority vote over the nearest neighbors' labels.

    nearestNeighbors holds indices into labels (the full training labels list).
    A neighbor counts as positive when int(label) == 1 and as negative
    otherwise. Returns 1 when positives strictly outnumber negatives, else -1
    (so a tie, or an empty neighbor list, yields -1).
    """
    # Each neighbor casts one vote: +1 for a positive label, -1 for anything else.
    vote_margin = sum(
        1 if int(labels[neighbor_index]) == 1 else -1
        for neighbor_index in nearestNeighbors
    )
    # A strictly positive margin means the positive votes are in the majority.
    return 1 if vote_margin > 0 else -1

In [101]:
#Read the training and the test data set and get 3 separate lists: training reviews, test reviews and training labels
train_reviews, test_reviews, train_sentiments = loadData('train.dat', 'test.dat')

#Pre-process both the training and the test data set with the same cleaning function.
#The raw lists are overwritten here; the cell can still be re-run because it reloads them from disk first.
train_reviews = clean(train_reviews)
test_reviews = clean(test_reviews)

#Build the TF-IDF matrices: the vectorizer is fitted on the training reviews and reused for the test reviews
train_matrix, test_matrix = createTFIDFMatrices(train_reviews, test_reviews)

In [102]:
#Similarity of every test review (rows) against every training review (columns), as a dense numpy array
similarities = findSimilarities(train_matrix, test_matrix)

In [116]:
#Classify every test review: each row of the similarities array is one test review scored against all training reviews

#Number of nearest training reviews that vote on each prediction
k = 300
test_sentiments = []

for similarity in similarities:
    knn = findKNearest(similarity, k)
    prediction = predict(knn, train_sentiments)

    #Store the label as text, writing '+1' instead of a bare 1 for positive reviews
    test_sentiments.append('+1' if prediction == 1 else '-1')

In [117]:
#Write the result to a .dat file, one predicted label per line.
#The context manager closes (and flushes) the file even if writing fails part-way.
with open('output-k-300.dat', 'w') as output:
    output.writelines("%s\n" % item for item in test_sentiments)