In [2]:
# import libraries

import numpy as np 
import pandas as pd
import string

import nltk
from nltk import pos_tag, word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, pairwise
from sklearn.model_selection import KFold

from scipy.spatial.distance import hamming, euclidean, cityblock
from scipy.spatial import minkowski_distance
from scipy.spatial import distance_matrix
from google.colab import files

import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Lemmatization and Stemming

#Returns the lemmatized word
def get_lemmatized_word(word):
  """Return the WordNet lemma of `word` (no part-of-speech argument is passed)."""
  lemmatizer = WordNetLemmatizer()
  lemma = lemmatizer.lemmatize(word)
  return lemma

#Returns the stemmed word using porter stemming
def porter_stem(word):
  """Return the stem of `word` produced by a freshly created nltk PorterStemmer."""
  stemmer = PorterStemmer()
  stemmed = stemmer.stem(word)
  return stemmed

#Returns the stemmed word using snow ball stemming
def snowball_stem(word):
  """Return the stem of `word` produced by nltk's English SnowballStemmer."""
  stemmer = SnowballStemmer(language='english')
  stemmed = stemmer.stem(word)
  return stemmed

#Returns the stemmed word using lancaster stemming
def lanca_stem(word):
  """Return the stem of `word` produced by a freshly created nltk LancasterStemmer."""
  stemmer = LancasterStemmer()
  stemmed = stemmer.stem(word)
  return stemmed

In [4]:
#Text preprocessing using word tokenization.
#We first split the text into words, keeping only purely alphabetic tokens that are not stop words,
#and then optionally filter by part of speech, lemmatize and stem them.

"""Parameters:
text: text to tokenize
stop_words: list of words to remove from the text
stem: stemmer to apply ('porter', 'snow' or 'lanca'); any other value means no stemming
islemma: lemmatize each word if True
pos_tagging: keep only adjectives if True
"""

def tokenization_using_nltk(text, stop_words=None, stem='', islemma=False, pos_tagging=False):
  """Tokenize `text` and return the cleaned list of tokens.

  Processing order: tokenize -> keep alphabetic, non-stop-word tokens ->
  optional adjective filter -> optional lemmatization -> optional stemming.

  Parameters:
  text: text to tokenize (it is not lower-cased here)
  stop_words: iterable of words to drop; None or empty means nothing is dropped
  stem: 'lanca', 'snow' or 'porter' selects a stemmer; any other value means no stemming
  islemma: lemmatize every token if True
  pos_tagging: keep only adjectives if True

  Returns: list of processed token strings.
  """
  # A set gives constant-time membership tests instead of scanning a list per token.
  excluded = set(stop_words) if stop_words else set()
  tokens = [word for word in word_tokenize(text) if word.isalpha() and word not in excluded]

  if pos_tagging:
    # nltk.pos_tag uses Penn Treebank tags by default, where adjectives are
    # JJ, JJR (comparative) and JJS (superlative). 'ADJ' is kept for
    # compatibility with the universal tagset.
    adjective_tags = {'JJ', 'JJR', 'JJS', 'ADJ'}
    tokens = [name for name, pos in nltk.pos_tag(tokens) if pos in adjective_tags]

  if islemma:
    tokens = [get_lemmatized_word(word) for word in tokens]

  # At most one stemmer applies; unknown names leave the tokens untouched.
  stemmers = {'lanca': lanca_stem, 'snow': snowball_stem, 'porter': porter_stem}
  stemmer = stemmers.get(stem)
  if stemmer is not None:
    tokens = [stemmer(word) for word in tokens]
  return tokens

In [5]:
# method for KNN Implementation 


"""
gets the pairwise distances (or, for cosine, the similarities) between test and train data for the chosen metric
"""

def get_distance(test_data, train_data, distance_metric='cosine'):
  """Return the pairwise score matrix between test rows and train rows.

  Parameters:
  test_data: feature matrix of the query rows
  train_data: feature matrix of the reference rows
  distance_metric: 'manhattan', 'euclidean' (the legacy spelling 'euclidian'
    is still accepted) or 'cosine'; any other value falls back to cosine.

  Returns: matrix with one row per test row and one column per train row.
  NOTE: 'manhattan' and 'euclidean' yield distances (smaller = closer) while
  'cosine' yields a similarity (larger = closer); callers must rank accordingly.
  """
  if distance_metric == 'manhattan':
    return manhattan_distances(test_data, train_data)
  # Accept the correct spelling as well as the original misspelled key.
  if distance_metric in ('euclidean', 'euclidian'):
    return euclidean_distances(test_data, train_data)
  # 'cosine' and every unrecognised metric use the cosine_similarity in scope.
  return cosine_similarity(test_data, train_data)

def cosine_similarity(test_data, train_data):
  """Return the dense cosine-similarity matrix between test rows and train rows.

  NOTE: this definition replaces the `cosine_similarity` imported from
  sklearn.metrics.pairwise earlier in the notebook.

  Parameters:
  test_data: scipy sparse matrix or array-like, one row per query document
  train_data: scipy sparse matrix or array-like, one row per reference document

  Returns: numpy array of shape (n_test, n_train). Rows are length-normalised
  here, so inputs need not be L2-normalised beforehand; an all-zero row gets
  similarity 0 with everything.
  """
  def _row_norms(matrix):
    # Euclidean length of every row without densifying a sparse input.
    if hasattr(matrix, 'toarray'):
      squared = matrix.multiply(matrix).sum(axis=1)
    else:
      squared = (matrix * matrix).sum(axis=1)
    norms = np.sqrt(np.asarray(squared, dtype=float)).ravel()
    # Dot products with an all-zero row are 0 already; avoid dividing by 0.
    norms[norms == 0] = 1.0
    return norms

  if not hasattr(test_data, 'toarray'):
    test_data = np.asarray(test_data, dtype=float)
  if not hasattr(train_data, 'toarray'):
    train_data = np.asarray(train_data, dtype=float)

  products = test_data @ train_data.T
  # Only a sparse product needs densifying; a dense one is copied to a float array.
  if hasattr(products, 'toarray'):
    similarities = products.toarray().astype(float, copy=False)
  else:
    similarities = np.array(products, dtype=float)

  # Normalise in place to avoid allocating another n_test x n_train array.
  similarities /= _row_norms(test_data)[:, None]
  similarities /= _row_norms(train_data)[None, :]
  return similarities



""" 
gets the k nearest neighbour and takes the majority vote to predict the sentiment
"""

def get_nearest_neighbour(training_data_x, test_data, training_data_y, num_neignbour=3, distance_metric='cosine'):
  """Predict a sentiment (1 or -1) per test row by majority vote of its k nearest training rows.

  Parameters:
  training_data_x: feature matrix of the training documents
  test_data: feature matrix of the documents to classify
  training_data_y: training labels, positionally aligned with training_data_x
  num_neignbour: number of neighbours (k) taking part in the vote
  distance_metric: 'cosine', 'manhattan' or 'euclidean' ('euclidian' also accepted)

  Returns: list with one predicted label per test row; a tie is resolved as 1.
  """
  # get_distance was written with the key 'euclidian'; map the correct spelling onto it.
  metric = 'euclidian' if distance_metric == 'euclidean' else distance_metric
  score_mat = get_distance(test_data, training_data_x, distance_metric=metric)
  # manhattan/euclidean scores are distances (smaller = closer); every other
  # metric resolves to cosine similarity (larger = closer).
  smaller_is_closer = metric in ('manhattan', 'euclidian')
  # Positional label lookup, independent of any pandas index on the labels.
  labels = np.asarray(training_data_y).astype(int)

  prediction_sentiment = []
  for scores in score_mat:
    scores = np.asarray(scores).ravel()
    ranking = np.argsort(scores) if smaller_is_closer else np.argsort(-scores)
    votes = labels[ranking[:num_neignbour]]
    negatives = np.count_nonzero(votes == -1)
    positives = np.count_nonzero(votes == 1)
    prediction_sentiment.append(-1 if negatives > positives else 1)

  return prediction_sentiment


""" 
gets the k nearest neighbour and takes the weighted vote to predict the sentiment
"""

def get_nearest_neighbour_weighted_voting(training_data_x, test_data, training_data_y, num_neignbour=3, distance_metric='cosine'):
  """Predict a sentiment (1 or -1) per test row by a closeness-weighted vote of its k nearest training rows.

  Parameters:
  training_data_x: feature matrix of the training documents
  test_data: feature matrix of the documents to classify
  training_data_y: training labels, positionally aligned with training_data_x;
    label 1 votes positive, every other label votes negative
  num_neignbour: number of neighbours (k) taking part in the vote
  distance_metric: 'cosine', 'manhattan' or 'euclidean' ('euclidian' also accepted)

  Returns: list with one predicted label per test row; a tie is resolved as -1.
  """
  # Honour the requested metric (it used to be hard-coded to 'cosine').
  # get_distance was written with the key 'euclidian'; map the correct spelling onto it.
  metric = 'euclidian' if distance_metric == 'euclidean' else distance_metric
  score_mat = get_distance(test_data, training_data_x, distance_metric=metric)
  # manhattan/euclidean scores are distances (smaller = closer); every other
  # metric resolves to cosine similarity (larger = closer).
  smaller_is_closer = metric in ('manhattan', 'euclidian')
  # Positional label lookup, independent of any pandas index on the labels.
  labels = np.asarray(training_data_y).astype(int)

  prediction_sentiment = []
  for scores in score_mat:
    scores = np.asarray(scores, dtype=float).ravel()
    if smaller_is_closer:
      nearest = np.argsort(scores)[:num_neignbour]
      # Inverse-distance weights; the epsilon guards against exact matches.
      weights = 1.0 / (scores[nearest] + 1e-12)
    else:
      nearest = np.argsort(-scores)[:num_neignbour]
      # Closer neighbours must count more, so weight by the similarity itself
      # (weighting by 1 - similarity favoured the least similar neighbours).
      weights = scores[nearest]
    is_positive = labels[nearest] == 1
    pos_weight = weights[is_positive].sum()
    neg_weight = weights[~is_positive].sum()
    prediction_sentiment.append(1 if pos_weight > neg_weight else -1)
  return prediction_sentiment


In [6]:
# Method to process the text and return the processed text

def data_preprocess(text_data, stop_words=None, stem='', islemma=False):
  """Lower-case and tokenize every review, returning one cleaned string per review.

  Parameters:
  text_data: iterable of review strings (list, pandas Series, ...)
  stop_words: words to drop from each review; None means no stop words
  stem: stemmer name forwarded to tokenization_using_nltk
  islemma: lemmatize tokens if True

  Returns: list of strings, each the processed tokens of one review joined by single spaces.
  """
  stop_words = [] if stop_words is None else stop_words
  processed_reviews = []
  # Iterate over the values themselves: indexing with range(len(...)) is
  # label-based on a pandas Series and fails once the index is not 0..n-1.
  for review in text_data:
    tokens = tokenization_using_nltk(review.lower(), stop_words=stop_words, stem=stem, islemma=islemma)
    processed_reviews.append(" ".join(tokens))

  return processed_reviews

In [7]:
# Splitting data

def split_data_prediction_values(X, Y, test_size = 0.3, random_state=None):
  """Split features and labels into train and test parts.

  Parameters:
  X: features
  Y: labels aligned with X
  test_size: forwarded to train_test_split (default 0.3)
  random_state: optional seed forwarded to train_test_split so the split can
    be reproduced; None keeps the previous unseeded behaviour

  Returns: tuple (x_train, x_test, y_train, y_test).
  """
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
  return x_train, x_test, y_train, y_test

# TFIDF representation of the data  

def getTFIDRepresentation(X, test_X, features=None):
  """Fit a TfidfVectorizer on `X` and transform both `X` and `test_X` with it.

  Parameters:
  X: training documents; the vocabulary and IDF weights are learned from these only
  test_X: documents transformed with the vectorizer fitted on X
  features: optional dict with any of the keys 'max_features', 'max_df',
    'min_df' and 'ngram_range'; missing keys fall back to None, 1.0, 1 and
    (1, 1) respectively, and None or {} means all defaults

  Returns: tuple (x_train_bow, x_test_bow) of TF-IDF matrices.
  """
  # Merge onto the defaults so a partial or empty dict no longer raises KeyError.
  settings = {'max_features': None, 'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 1)}
  settings.update(features or {})
  tfidfconverter = TfidfVectorizer(
      max_features=settings['max_features'],
      min_df=settings['min_df'],
      max_df=settings['max_df'],
      ngram_range=settings['ngram_range'])
  x_train_bow = tfidfconverter.fit_transform(X)
  x_test_bow = tfidfconverter.transform(test_X)
  return x_train_bow, x_test_bow


In [8]:
# K-fold cross validation 

def k_fold_validation(X, Y, num_neignbour=3, fold=5, features=None, distance_metric='cosine', random_state=None):
  """Return the mean accuracy of the weighted-vote KNN over a shuffled k-fold split.

  Parameters:
  X: processed review texts
  Y: labels aligned with X
  num_neignbour: number of neighbours (k) used by the classifier
  fold: number of folds
  features: TF-IDF settings dict forwarded to getTFIDRepresentation; None or
    {} uses that function's defaults
  distance_metric: metric name forwarded to the classifier
  random_state: optional seed for the fold shuffle; None keeps it unseeded

  Returns: accuracy averaged over the folds.
  """
  # Plain arrays make the fold indices positional; indexing a pandas Series
  # with them is label-based and breaks on a non-default index.
  documents = np.asarray(X, dtype=object)
  labels = np.asarray(Y)

  kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)
  avg_accu = 0.0
  for train, test in kf.split(documents):
    if features:
      x_train_bow, x_test_bow = getTFIDRepresentation(documents[train], documents[test], features=features)
    else:
      # An empty settings dict would raise KeyError downstream; use the defaults.
      x_train_bow, x_test_bow = getTFIDRepresentation(documents[train], documents[test])
    prediction = get_nearest_neighbour_weighted_voting(x_train_bow, x_test_bow, labels[train], num_neignbour=num_neignbour, distance_metric=distance_metric)
    avg_accu += accuracy_score(labels[test], prediction)

  return avg_accu/fold

In [9]:
# Experiment on training data 

# training and testing data url from git hub

# Locations of the raw CSV files on GitHub.
train_data_url = 'https://raw.githubusercontent.com/ridwant/DataMinig/main/1661892619_92027_train_file.csv'
test_data_url = 'https://raw.githubusercontent.com/ridwant/DataMinig/main/1661892619_9579706_test_file.csv'

# Read both files into data frames: the training file as (rating, review)
# columns, the test file as a single review column.
train_df = pd.read_csv(
    train_data_url,
    usecols=[0, 1],
    names=['rating', 'review'],
)
test_df = pd.read_csv(
    test_data_url,
    usecols=[0],
    names=['review'],
)

# Hand-picked stop words removed from every review during preprocessing.
stop_words = [
    'a', 'an', 'the', 'be', 'to', 'of', 'and', 'in', 'that', 'have',
    'i', 'it', 'this', 'that', 'for', 'on', 'with', 'he', 'as', 'she',
    'you', 'youll', 'do', 'at', 'his', 'her', 'by', 'from', 'they', 'we',
    'say', 'said', 'or', 'will', 'my', 'one', 'all', 'there', 'their', 'what',
    'how', 'so', 'up', 'out', 'if', 'who', 'about', 'which', 'us',
]

In [10]:
#final prediction on the test data set

# determined k value 

selected_k = 171

# TF-IDF settings used for the final run
features = {
    'max_features': 50000,
    'max_df': 0.9,
    'min_df': 1,
    'ngram_range': (2,3),
}

# Clean the training and test reviews with identical settings:
# custom stop words, Porter stemming, no lemmatization.
processed_reviews = data_preprocess(
    train_df['review'], stop_words=stop_words, stem='porter', islemma=False)
processed_test_reviews = data_preprocess(
    test_df['review'], stop_words=stop_words, stem='porter', islemma=False)

# Keep the processed text (and the training ratings) in data frames.
processed_training_df = pd.DataFrame({'review': processed_reviews, 'rating': list(train_df['rating'])})
processed_test_df = pd.DataFrame({'review': processed_test_reviews})

X = processed_training_df['review']
Y = processed_training_df['rating']
test_X = processed_test_df['review']

# Vectorize: fit on the training reviews, then transform the test reviews.
final_x_train_bow, final_x_test_bow = getTFIDRepresentation(X, test_X, features=features)
y_train_values = Y.values

# Majority-vote KNN on cosine scores with the selected k.
prediction = get_nearest_neighbour(
    final_x_train_bow,
    final_x_test_bow,
    y_train_values,
    num_neignbour=selected_k,
    distance_metric='cosine',
)

# Write one prediction per line (no index, no header) and download it from Colab.
df = pd.DataFrame(prediction)
df.to_csv('final_submission_1.csv', encoding = 'utf-8-sig', index=False, header=None)
files.download("final_submission_1.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>