## Data collection

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score
from nltk.stem import WordNetLemmatizer
import pickle
import spacy
import en_core_web_sm
import pickle

In [None]:
nlp = en_core_web_sm.load()

In [None]:
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

## Data Preprocessing

In [None]:
import nltk

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # remove punctuations and convert to lowercase
    text = ''.join(c for c in text if c.isalnum() or c == ' ')
    text = text.lower()

    # remove stop words
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # lemmatize the words
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    return text

dataset['Comments'] = dataset['Comments'].apply(preprocess)

## Coreference resolution

In [None]:
def resolve_coreferences(doc):
    # Resolve coreferences using spaCy
    for token in doc:
        if token.dep_ == 'pron' and token._.in_coref:
            for cluster in token._.coref_clusters:
                if cluster.main.text.lower() != token.text.lower():
                    token.sent.merge(cluster.main)
    return doc

In [None]:
dataset['Comments'] = dataset['Comments'].apply(nlp)
dataset['Comments'] = dataset['Comments'].apply(resolve_coreferences)

In [None]:
dataset['Comments'] = dataset['Comments'].apply(lambda doc: ' '.join([token.text for token in doc]))


## Text classification

In [92]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [None]:

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['Comments'], dataset['Reviews'], test_size=0.2, random_state=42)

# Create bag of words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train Naive Bayes model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bow, y_train)

# Predict on test set
y_pred = nb_classifier.predict(X_test_bow)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9790462427745664


In [93]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       580
           1       0.97      0.99      0.98       804

    accuracy                           0.98      1384
   macro avg       0.98      0.98      0.98      1384
weighted avg       0.98      0.98      0.98      1384



## Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(dataset['Comments'])

# Cluster reviews using K-means
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_bow)



In [None]:
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(3):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

# Assign cluster labels to reviews
dataset['cluster'] = kmeans.labels_

Cluster 0:
 potter
 harry
 brokeback
 mountain
 love
 impossible
 mission
 movie
 like
 hate
Cluster 1:
 harry
 potter
 hate
 potter0i
 think
 evil
 nt
 people
 would
 say
Cluster 2:
 vinci
 da
 code
 awesome
 suck
 sucked
 love
 much
 know
 right


## Relevance ranking

In [None]:
# Create TF-IDF representation
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(dataset['Comments'])

# Get relevance scores for each review
scores = np.asarray(X_tfidf.mean(axis=1)).ravel().tolist()
dataset['score'] = scores

# Sort reviews by relevance score
dataset.sort_values('score', ascending=False, inplace=True)
data = dataset.drop('Reviews', axis=1)

In [None]:
data

Unnamed: 0,Comments,cluster,score
5028,favorite attraction reopened friday due stupid...,1,0.004069
6011,brokeback mountain sucked0i hate brokeback mou...,0,0.003624
3151,loved brokeback mountain have nt seen crash st...,0,0.003611
32,people say joe liked da vinci code you re hard...,2,0.003282
5029,hate harry potter0harry potter suck clit0somet...,0,0.002673
...,...,...,...
2060,harry potter gorgeous,0,0.000690
2131,harry potter gorgeous,0,0.000690
2136,harry potter gorgeous,0,0.000690
6106,disliked brokeback mountain,0,0.000690


In [None]:
# Map cluster labels to emotions
cluster_emotion_map = {
    0: 'bad',
    1: 'neutral',
    2: 'good'
}
data['emotion'] = data['cluster'].map(cluster_emotion_map)

In [None]:
data

Unnamed: 0,Comments,cluster,score,emotion
5028,favorite attraction reopened friday due stupid...,1,0.004069,neutral
6011,brokeback mountain sucked0i hate brokeback mou...,0,0.003624,bad
3151,loved brokeback mountain have nt seen crash st...,0,0.003611,bad
32,people say joe liked da vinci code you re hard...,2,0.003282,good
5029,hate harry potter0harry potter suck clit0somet...,0,0.002673,bad
...,...,...,...,...
2060,harry potter gorgeous,0,0.000690,bad
2131,harry potter gorgeous,0,0.000690,bad
2136,harry potter gorgeous,0,0.000690,bad
6106,disliked brokeback mountain,0,0.000690,bad
