In [3]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt 
%matplotlib inline
import numpy as np
import pandas as pd
import re
import nltk as nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#Read the input files

with open('test_file.txt', "r") as fr2:
        test_data = fr2.readlines()
        
train_data = pd.read_csv ('train_file.txt', sep="\t")
 

In [5]:
train_review = train_data.review

In [6]:
#get the stop words
stopwordsDictionary = stopwords.words('english')

In [7]:
#pre-processing the data. The function to convert a raw review into string of words.
def preprocessor(Sentences):
    processed_features = []
    for sentence in range(0, len(Sentences)):
        processed_feature = re.sub(r'\W', ' ', str(Sentences[sentence])) #remove special characters
        processed_feature = re.sub(r'([\w\.-]+@[\w\.-]+\.\w+)','',processed_feature) #Remove Email ids
        processed_feature = re.sub(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]| \
        [a-z0-9.\-]+[.][a-z]{2,4}/|[a-z0-9.\-]+[.][a-z])(?:[^\s()<>]+|\(([^\s()<>]+| \
        (\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))','', processed_feature) #remove urls
        processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I) #replace multiple spaces with single space
        processed_feature = processed_feature.lower() #convert into lower case
        processed_feature = re.compile(r'<[^>]+>').sub('', processed_feature) #remove html tags
        processed_feature=processed_feature.split()
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = ''
        for word in processed_feature:
            if word not in stopwordsDictionary and len(word) > 3:
                lemmatized_words += str(lemmatizer.lemmatize(word)) + ' '    
        processed_features.append(lemmatized_words)
    return processed_features


In [24]:
#pre-processing the train data
preprocessed_train_data = preprocessor(train_review)
preprocessed_train_data

['reviewer mentioned watching episode hooked right exactly happened first thing struck brutality unflinching scene violence right word trust show faint hearted timid show pull punch regard drug violence hardcore classic word called nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never away would main appeal show fact go show dare forget pretty picture painted mainstream audience forget charm forget romance mess around first episode ever struck nasty surreal ready watched developed taste accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order away well mannered middle class inmate turned prison bitch lack street skill prison experience watching become comfortable uncomfortable viewing thats touch darker side ',
 'wo

In [9]:
#pre-processing the test data
preprocessed_test_data = preprocessor(test_data)


In [10]:
#This function returns L2-normalized sparse matrices with TF-IDF values

def createTFIDFMatrices(train_data, test_data):
    vectorizer = TfidfVectorizer(norm = 'l2')
    train_matrix = vectorizer.fit_transform(preprocessed_train_data)
    test_matrix = vectorizer.transform(preprocessed_test_data)
    return train_matrix, test_matrix


In [11]:
#Tf-idf vectorizing the data

train_matrix, test_matrix = createTFIDFMatrices(train_review, test_data)


In [12]:
#this function computes cosine similarity between the two as a numpy array

def findSimilarities(train_matrix, test_matrix):
    cosineSimilarities = np.dot(test_matrix, np.transpose(train_matrix))
    similarities = cosineSimilarities.toarray()
    return similarities


In [13]:
#finding the similarities

similarities = findSimilarities(train_matrix, test_matrix)
len(similarities)


15000

In [14]:
#this function returns the K Nearest Neighbours

def findKNearest(similarity_vector, k):
    return np.argsort(-similarity_vector)[:k]


In [15]:
#this function calculates the count of positive and negative reviews

def predict(nearestNeighbors, labels):
    positiveReviews = 0
    negativeReviews = 0
    for neighbor in nearestNeighbors:
        if int(labels[neighbor]) == 1:
            positiveReviews += 1
        else:
            negativeReviews += 1
    if positiveReviews > negativeReviews:
        return 1
    else:
        return -1
    

In [16]:
#determining k splits in k-fold

kf = KFold(n_splits=5)
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [20]:
#this function calculates the score between the predicted sentiments and actual(y_test) sentiments

def calculate_score(X_train, X_test, y_train, y_test, k):
    
    train_matrix, test_matrix = createTFIDFMatrices(X_train, X_test)
    similarities = findSimilarities(train_matrix, test_matrix)
    sentiments = [];
    for similarity in similarities:
        knn = findKNearest(similarity, k)
        prediction = predict(knn, train_data.sentiment)
    
       #representing the list as +1 instead of just a 1 for positive reviews
        if prediction == 1:
            sentiments.append(1)
        else:
            sentiments.append(-1)
            
    count = 0
    for x, y in zip(sentiments, y_test):
        if x == y:
            count = count + 1
    return(count/len(y_test))


In [21]:
#k-fold cross validation with k=5 splits

scores_k = []
k_range = range(300, 400)
avg = []

for k in range(300, 400, 10):
    for train_index, test_index in kf.split(train_data):
        X_train, X_test, y_train, y_test = train_data.review[train_index], train_data.review[test_index],train_data.sentiment[train_index], train_data.sentiment[test_index]
        scores = calculate_score(X_train, X_test, y_train, y_test, k)
        print(scores)
        scores_k.append(scores)
    avg.append(sum(scores_k)/len(scores_k))
    
    

0.494
0.5126666666666667
0.5043333333333333
0.495
0.5048349449816606
0.49233333333333335
0.5116666666666667
0.504
0.49466666666666664
0.5071690563521174
0.49633333333333335


KeyboardInterrupt: 

In [22]:
#Pass every row in the numpy array of similarities to predict the sentiment of every review
k = 300
final_result = []

for i in similarities:
    knn = findKNearest(i, k)
    prediction = predict(knn, train_data.sentiment)
    
    #To write to the list as +1 instead of just a 1 for positive reviews
    if prediction == 1:
        final_result.append('+1')
    else:
        final_result.append('-1')

In [23]:
#Write the result to a .dat file
output = open('output.txt', 'w')

output.writelines( "%s\n" % item for item in final_result )

output.close()