In [58]:
import nltk
# Fetch the NLTK resources the later cells rely on. NLTK reports packages that
# are already present as up-to-date, so re-running this cell is harmless.
# VADER lexicon: needed by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
# Stopword lists: read through nltk.corpus.stopwords.
nltk.download('stopwords')
# WordNet data: needed by WordNetLemmatizer.
nltk.download('wordnet')
# POS tagger model: needed by nltk.pos_tag.
# NOTE(review): recent NLTK releases may look for
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [56]:
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
import re

# Read the Excel data into a DataFrame.
# The path is relative, so the workbook must sit in the notebook's working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index - consider index_col=0; confirm against the workbook.
df = pd.read_excel('Dataset.xlsx')
# Preview the first rows to check the expected columns were loaded.
df.head()

Unnamed: 0,Year Level,College Department,Review,Sentiment
0,3rd Year,CSCS,It is very expensive. Not all the food is tast...,Negative
1,3rd Year,CSCS,The food square is overall a fine place to eat...,Neutral
2,3rd Year,CSCS,Overall eating experience is fine. There is a ...,Positive
3,3rd Year,CSCS,Eating in the food square is a good and reliab...,Positive
4,3rd Year,CSCS,It is a bit expensive for an average quality o...,Neutral


In [61]:
# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Initialize the Counter used for word frequencies.
# It starts empty; FREQWORDS and RAREWORDS below are derived from whatever it
# contains at the moment they are built.
cnt = Counter()

# Initialize the PorterStemmer
stemmer = PorterStemmer()


# Function for cleaning
def clean_text(text):
    """Normalize a review to lowercase ASCII letters separated by single spaces.

    Parameters
    ----------
    text : str or missing value
        The raw review. Missing values (None, NaN, pd.NA) are returned
        unchanged so that a later ``dropna`` can remove them instead of this
        function raising ``AttributeError``. Any other non-string value is
        converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The lowercased text with every character that is not an ASCII letter
        or whitespace removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped. For a missing input, the input
        itself is returned.
    """
    if not isinstance(text, str):
        # Blank spreadsheet cells arrive as NaN (a float), which has no .lower().
        if pd.isna(text):
            return text
        text = str(text)
    text = text.lower()
    # Characters are deleted (not replaced by a space), so "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function for removing stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with English stopwords filtered out.

    The input is coerced with ``str()`` and split on whitespace; tokens that
    are not in ``STOPWORDS`` are re-joined with single spaces. Matching is an
    exact, case-sensitive membership test.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

# Function for removing common words
# The ten highest-count words in `cnt`, as it stands when this line runs.
FREQWORDS = {word for word, _count in cnt.most_common(10)}
def remove_freqwords(text):
    """Return ``text`` without the words listed in ``FREQWORDS``.

    The input is coerced with ``str()`` and split on whitespace; the surviving
    tokens are re-joined with single spaces.
    """
    tokens = str(text).split()
    return " ".join(filter(lambda token: token not in FREQWORDS, tokens))

# Function for removing rare words
n_rare_words = 10
# The `n_rare_words` lowest-count words in `cnt`, as it stands when this line runs.
_ranked_words = cnt.most_common()
RAREWORDS = {word for word, _count in _ranked_words[:-n_rare_words-1:-1]}
def remove_rarewords(text):
    """Return ``text`` without the words listed in ``RAREWORDS``.

    The input is coerced with ``str()`` and split on whitespace; the surviving
    tokens are re-joined with single spaces.
    """
    survivors = []
    for token in str(text).split():
        if token not in RAREWORDS:
            survivors.append(token)
    return " ".join(survivors)

# Function for stemming
def stem_words(text):
    """Stem every whitespace-separated token of ``text`` with the module-level
    ``stemmer`` and return the stems joined by single spaces."""
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

# Function for lemmatizing
# First letter of the POS tag -> WordNet part-of-speech constant.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}
def lemmatize_words(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens are POS-tagged with ``nltk.pos_tag``; the first letter of each tag
    selects the WordNet part of speech through ``wordnet_map``, falling back
    to noun for any other tag. The lemmas are returned joined by single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Drop incomplete rows BEFORE any text function runs: the text helpers call
# string methods, so a missing review would otherwise raise before it could
# be dropped.
df.dropna(inplace=True)

# Apply Text Cleaning
df['Review'] = df['Review'].apply(clean_text)

# Remove stopwords
df["Review_stop"] = df["Review"].apply(remove_stopwords)

# Count word frequencies over the stopword-free reviews and rebuild the
# frequent/rare word sets from them. `cnt` is created empty and is not filled
# anywhere else, so without this step both removals below would do nothing.
# clear() keeps the counts correct when the cell is re-run.
cnt.clear()
cnt.update(word for text in df["Review_stop"] for word in text.split())
FREQWORDS = set(word for word, _count in cnt.most_common(10))
RAREWORDS = set(word for word, _count in cnt.most_common()[:-n_rare_words-1:-1])

# Remove common words
df["Review_stopfreq"] = df["Review_stop"].apply(remove_freqwords)

# Remove rare words
df["Review_stopfreqrare"] = df["Review_stopfreq"].apply(remove_rarewords)

# Drop the intermediate columns so the frame keeps the same columns as before.
# NOTE(review): the stemming and lemmatization below read `Review`, not the
# filtered text - keep `Review_stopfreqrare` if it is meant to be used.
df.drop(["Review_stop", "Review_stopfreq", "Review_stopfreqrare"], axis=1, inplace=True)

# Apply stemming
df["Review_stemmed"] = df["Review"].apply(stem_words)

# Apply lemmatization
df["Review_lemmatized"] = df["Review"].apply(lemmatize_words)


In [62]:
def _label_from_compound(compound):
    """Map a compound score to 'Negative' (<= -0.05), 'Positive' (>= 0.05),
    or 'Neutral' (anything else)."""
    if compound <= -0.05:
        return 'Negative'
    if compound >= 0.05:
        return 'Positive'
    return 'Neutral'

# Score each review with VADER, keep the compound value, and bucket it.
# NOTE(review): `Review` holds the lowercased, punctuation-stripped text at
# this point; VADER documents capitalization and punctuation as intensity
# cues, so scoring the raw text may behave differently - confirm which is intended.
df['Scores'] = df['Review'].apply(sid.polarity_scores)
df['Compound'] = df['Scores'].apply(lambda scores: scores['compound'])
df['Comp_Score'] = df['Compound'].apply(_label_from_compound)

df.head()

Unnamed: 0,Year Level,College Department,Review,Sentiment,Review_stemmed,Review_lemmatized,Scores,Compound,Comp_Score
0,3rd Year,CSCS,it is very expensive not all the food is tasty...,Negative,it is veri expens not all the food is tasti an...,it be very expensive not all the food be tasty...,"{'neg': 0.0, 'neu': 0.872, 'pos': 0.128, 'comp...",0.2263,Positive
1,3rd Year,CSCS,the food square is overall a fine place to eat...,Neutral,the food squar is overal a fine place to eat a...,the food square be overall a fine place to eat...,"{'neg': 0.035, 'neu': 0.83, 'pos': 0.135, 'com...",0.4118,Positive
2,3rd Year,CSCS,overall eating experience is fine there is a l...,Positive,overal eat experi is fine there is a lot of va...,overall eating experience be fine there be a l...,"{'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'comp...",0.6124,Positive
3,3rd Year,CSCS,eating in the food square is a good and reliab...,Positive,eat in the food squar is a good and reliabl ro...,eat in the food square be a good and reliable ...,"{'neg': 0.0, 'neu': 0.805, 'pos': 0.195, 'comp...",0.7003,Positive
4,3rd Year,CSCS,it is a bit expensive for an average quality o...,Neutral,it is a bit expens for an averag qualiti of fo...,it be a bit expensive for an average quality o...,"{'neg': 0.0, 'neu': 0.823, 'pos': 0.177, 'comp...",0.6249,Positive


In [63]:
# Share of reviews whose VADER-derived label (Comp_Score) equals the label in the Sentiment column.
accuracy_score(df['Sentiment'], df['Comp_Score'])

0.5967741935483871

In [64]:
# Rows follow the first argument (Sentiment), columns the second (Comp_Score).
# With no `labels=` given, scikit-learn orders the classes by sorted label value.
confusion_matrix(df['Sentiment'], df['Comp_Score'])

array([[15,  5, 11],
       [ 6,  8, 28],
       [ 0,  0, 51]])

In [65]:
# Per-class precision, recall, F1 and support, with Sentiment as the reference and Comp_Score as the prediction.
print(classification_report(df['Sentiment'], df['Comp_Score']))

              precision    recall  f1-score   support

    Negative       0.71      0.48      0.58        31
     Neutral       0.62      0.19      0.29        42
    Positive       0.57      1.00      0.72        51

    accuracy                           0.60       124
   macro avg       0.63      0.56      0.53       124
weighted avg       0.62      0.60      0.54       124

