In [1]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import PorterStemmer
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt 
%matplotlib inline
import numpy as np
import pandas as pd
import re
import nltk as nltk
# Fetch the NLTK resources up front so a fresh kernel can run the notebook end to end.
# 'stopwords' backs stopwords.words("english") in textCleaning; 'wordnet' is presumably
# the data WordNetLemmatizer needs. NOTE(review): 'punkt' is a tokenizer model, but the
# visible code tokenizes with str.split() only - confirm whether it is still required.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachanathota/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Patterns are compiled once here instead of on every textCleaning call.
_TAG_PATTERN = re.compile('<.*?>')
_EMAIL_PATTERN = re.compile(r'([\w\.-]+@[\w\.-]+\.\w+)')
# NOTE(review): this only removes lines that *start* with a URL (and removes the
# whole line); URLs in the middle of a line are left in place - confirm intended.
_URL_LINE_PATTERN = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

# Filled on first use so the stop-word file is read once, not once per review.
_CLEANING_RESOURCES = {}


def _getCleaningResources():
    """Return (stop-word set, lemmatizer, stemmer), building them on first use."""
    if not _CLEANING_RESOURCES:
        # All three values are created before the dict is touched, so a failure
        # (e.g. missing NLTK data) never leaves a half-filled cache behind.
        _CLEANING_RESOURCES.update(
            stops=frozenset(stopwords.words("english")),
            lemmatizer=WordNetLemmatizer(),
            stemmer=PorterStemmer(),
        )
    return (
        _CLEANING_RESOURCES['stops'],
        _CLEANING_RESOURCES['lemmatizer'],
        _CLEANING_RESOURCES['stemmer'],
    )


def textCleaning(text):
    """Normalise one raw review into a space-separated string of word stems.

    Steps, in order:
      1. strip HTML/meta tags (``<...>``),
      2. strip e-mail addresses and lines beginning with an http(s) URL,
      3. lower-case and split on whitespace,
      4. drop English stop words and tokens of length <= 2,
      5. lemmatize, Porter-stem, then lemmatize the stem again.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, or the float NaN that
        pandas produces for an empty cell) is treated as an empty review.

    Returns
    -------
    str
        The surviving tokens, each followed by a single space (so a non-empty
        result ends with a trailing space); ``''`` when nothing survives.
    """
    # pandas hands back NaN (a float) for missing cells; re.sub would raise on it.
    if not isinstance(text, str):
        return ''

    # A second tag-stripping pass after the e-mail/URL removal is unnecessary:
    # those removals only delete characters within a line, so they cannot create
    # a new '<...>' pair that the first pass would not already have matched.
    without_tags = _TAG_PATTERN.sub('', text)
    without_emails = _EMAIL_PATTERN.sub('', without_tags)
    without_urls = _URL_LINE_PATTERN.sub('', without_emails)

    tokens = without_urls.lower().split()
    if not tokens:
        return ''

    stops, lemmatizer, stemmer = _getCleaningResources()

    cleaned_tokens = []
    for word in tokens:
        if word in stops or len(word) <= 2:
            continue
        lemma = lemmatizer.lemmatize(word)
        stem = stemmer.stem(lemma)
        cleaned_tokens.append(lemmatizer.lemmatize(stem))

    # Keep the original output format: every token is followed by one space.
    return ''.join(token + ' ' for token in cleaned_tokens)


In [3]:
def TFIDFVectorizing(train_data, test_data):
    """Build TF-IDF features for a train/test pair of text collections.

    A TfidfVectorizer with L2 row normalisation is fitted on ``train_data``
    only; ``test_data`` is then projected with that same fitted vectorizer.

    Returns the pair ``(train_matrix, test_matrix)``.
    """
    tfidf = TfidfVectorizer(norm='l2')

    # Fit on the training texts, then reuse the fitted model for the test texts.
    fitted_train = tfidf.fit_transform(train_data)
    projected_test = tfidf.transform(test_data)

    return fitted_train, projected_test


In [4]:
def findSimilarities(train_matrix, test_matrix):
    """Return the dot product of every test row with every training row.

    Parameters
    ----------
    train_matrix, test_matrix :
        2-D matrices with the same number of columns, either scipy sparse
        matrices/arrays or dense numpy arrays.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(n_test_rows, n_train_rows)``; entry ``[i, j]``
        is the dot product of test row ``i`` and training row ``j``. When both
        inputs have L2-normalised rows (as produced by TFIDFVectorizing with
        ``norm='l2'``) this equals their cosine similarity.
    """
    # '@' is true matrix multiplication for both scipy sparse containers and
    # numpy arrays. np.dot is not sparse-aware and only worked by falling back
    # to '*', which is element-wise for the newer scipy sparse *array* types.
    product = test_matrix @ train_matrix.T

    # Sparse results need an explicit densify; dense results are already arrays.
    if hasattr(product, "toarray"):
        return product.toarray()
    return np.asarray(product)

In [5]:
def findKNearest(similarity_vector, k):
    """Return the indices of the k largest entries of a similarity vector.

    The vector (a numpy array) is negated so that an ascending argsort lists
    the most similar entries first; the first k positions of that ordering are
    returned. Working with indices rather than values lets the caller look up
    the matching training labels directly.
    """
    descending_order = np.argsort(-similarity_vector)
    top_k = descending_order[:k]
    return top_k
     

In [6]:
def predict(nearestNeighbors, labels):
    """Majority vote over the labels of the nearest neighbours.

    ``nearestNeighbors`` holds indices into ``labels``. A neighbour counts as
    positive when its label converts to the integer 1; every other label
    counts as negative. Returns 1 only when positives strictly outnumber
    negatives, otherwise -1 (so ties and an empty neighbour list give -1).
    """
    is_positive = [int(labels[idx]) == 1 for idx in nearestNeighbors]

    positive_votes = sum(is_positive)
    negative_votes = len(is_positive) - positive_votes

    return 1 if positive_votes > negative_votes else -1

In [7]:
# K-fold splitter used by the cross-validation cell further down (kf.split).
# Five folds with the library defaults; the repr printed by this cell shows
# shuffle=False and random_state=None.
# NOTE(review): without shuffling, fold quality depends on how the rows are
# ordered in the training file - confirm the file is not sorted by label.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [None]:

# Test reviews are read as raw lines: each element of testFile is one line of
# the file (including its trailing newline) and is cleaned as one review.
with open('test_file.txt', "r") as fr2:
    testFile = fr2.readlines()

# Tab-separated training file; the `review` column is used here and the
# `sentiment` column is used by the prediction cells.
train_data = pd.read_csv('train_file.txt', sep="\t")

# Apply the same cleaning to both sets so they share one vocabulary.
cleaned_train = [textCleaning(rev) for rev in train_data.review]
cleaned_test = [textCleaning(rev) for rev in testFile]

# Show a small sample only; displaying the whole list bloats the notebook.
cleaned_test[:5]


In [None]:
# TF-IDF vectorizing of the full cleaned training and test sets.
# These are the same steps TFIDFVectorizing performs, written out here so the
# fitted `vectorizer` stays available at notebook level.
vectorizer = TfidfVectorizer(norm = 'l2')
    
# Fit the vocabulary and IDF weights on the training reviews only ...
tfidf_train= vectorizer.fit_transform(cleaned_train)
# ... and reuse them for the test reviews.
tfidf_test = vectorizer.transform(cleaned_test)

# Display the test matrix as the cell output.
tfidf_test

In [None]:
# One row of similarities per test review, one column per training review
# (findSimilarities multiplies the test matrix by the transposed train matrix).
similarities = findSimilarities(tfidf_train, tfidf_test)
# Number of rows in the result, shown as the cell output.
len(similarities)

In [130]:
def get_score(X_train, X_test, y_train, y_test, k):
    """Accuracy of the k-nearest-neighbour vote on one train/test split.

    Parameters
    ----------
    X_train, X_test : iterables of str
        Review texts of the training and held-out parts of the split.
    y_train, y_test : sequences
        Sentiment labels aligned element-by-element with ``X_train`` and
        ``X_test`` respectively.
    k : int
        Number of nearest neighbours that vote on each held-out review.

    Returns
    -------
    float
        Fraction of held-out reviews whose predicted label equals ``y_test``.
    """
    train_matrix, test_matrix = TFIDFVectorizing(X_train, X_test)
    fold_similarities = findSimilarities(train_matrix, test_matrix)

    # Neighbour indices are row positions inside train_matrix, so labels must
    # come from this split's y_train in positional order. Looking them up in
    # the full training frame pairs neighbours with the wrong rows.
    train_labels = np.asarray(y_train)

    predictions = [
        predict(findKNearest(similarity_row, k), train_labels)
        for similarity_row in fold_similarities
    ]

    correct = sum(
        1 for predicted, actual in zip(predictions, y_test) if predicted == actual
    )
    return correct / len(y_test)


In [None]:
# Cross-validate the number of neighbours: for each candidate k, score every
# fold produced by `kf` and record the mean accuracy for that k.
k_range = range(300, 400, 10)
train1 = pd.read_csv('train_file.txt', sep="\t")

scores_k = []  # every individual fold score, in (k, fold) order
avg1 = []      # one mean accuracy per k, aligned with k_range

# NOTE(review): the folds use the raw `review` text, whereas the final
# predictions are made on textCleaning output - confirm this is intended.
for k in k_range:
    fold_scores = []
    for train_index, test_index in kf.split(train1):
        # kf.split yields row positions, so index positionally with .iloc.
        X_train = train1.review.iloc[train_index]
        X_test = train1.review.iloc[test_index]
        y_train = train1.sentiment.iloc[train_index]
        y_test = train1.sentiment.iloc[test_index]
        fold_scores.append(get_score(X_train, X_test, y_train, y_test, k))
    scores_k.extend(fold_scores)
    # Average only this k's folds; averaging the growing scores_k would blend
    # in the results of every previously tried k.
    avg1.append(sum(fold_scores) / len(fold_scores))


In [None]:

# The cross-validation loop tries k = 300, 310, ..., 390 and stores one
# averaged accuracy per k in avg1, so x and y must both have those 10 points.
# (scores_k holds one entry per fold per k and cannot be plotted against k.)
k_range = range(300, 400, 10)

plt.plot(k_range, avg1)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

In [57]:
#Predict the sentiment of every test review from its row of similarities
k = 300

#Labels are written as the strings '+1' / '-1' rather than the integers 1 / -1
final_result = [
    '+1' if predict(findKNearest(similarity_row, k), train_data.sentiment) == 1 else '-1'
    for similarity_row in similarities
]

In [58]:
#Write one predicted label per line to output.txt
# The with-block closes the file even if writing fails part-way through.
with open('output.txt', 'w') as output:
    output.writelines("%s\n" % item for item in final_result)