Assignment Week 3;

Author: Kannur, Gyan;

Date: 03/25/2025

In [1]:
# import required libraries
import pandas as pd
from textblob import TextBlob
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import string
import warnings
warnings.filterwarnings("ignore")

In [4]:
# import data
train_data = pd.read_csv('./assignments-data/labeledTrainData.tsv',  sep='\t')

# Display the first few rows of the DataFrame
print(train_data.head())

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


In [6]:
# 2. Counting positive and negative reviews
positive_reviews = train_data[train_data['sentiment'] == 1]
negative_reviews = train_data[train_data['sentiment'] == 0]
print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")

Number of positive reviews: 12500
Number of negative reviews: 12500


In [7]:
# 3. Using TextBlob to classify each movie review
def classify_sentiment_textblob(review):
    analysis = TextBlob(review)
    if analysis.sentiment.polarity >= 0:
        return 1
    else:
        return 0

train_data['predicted_sentiment_textblob'] = train_data['review'].apply(classify_sentiment_textblob)

In [8]:
# 4. Checking accuracy of the TextBlob model
textblob_accuracy = accuracy_score(train_data['sentiment'], train_data['predicted_sentiment_textblob'])
print("Accuracy of TextBlob model:", textblob_accuracy)

Accuracy of TextBlob model: 0.68524


##Conclusion:
This is slightly better than random guessing (>50%)

In [9]:
# 5. Using VADER sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import nltk
nltk.download('vader_lexicon')

def classify_sentiment_vader(review):
    analyzer = SentimentIntensityAnalyzer()
    compound_score = analyzer.polarity_scores(review)['compound']
    if compound_score >= 0:
        return 1
    else:
        return 0



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gyanr\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
train_data['predicted_sentiment_vader'] = train_data['review'].apply(classify_sentiment_vader)
vader_accuracy = accuracy_score(train_data['sentiment'], train_data['predicted_sentiment_vader'])
print(f"Accuracy of VADER model:{vader_accuracy}")

Accuracy of VADER model: 0.69356


## Part 2: Prepping Text for a Custom Model

In [11]:
# 2.1 Convert all text to lowercase letters: Used lambda
train_data['review'] = train_data['review'].apply(lambda x: x.lower())
train_data['review'].head()

0    with all this stuff going down at the moment w...
1    \the classic war of the worlds\" by timothy hi...
2    the film starts with a manager (nicholas bell)...
3    it must be assumed that those who praised this...
4    superbly trashy and wondrously unpretentious 8...
Name: review, dtype: object

In [12]:
# 2.2 Remove punctuation and special characters from the text
train_data['review'] = train_data['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
train_data['review'].head()

0    with all this stuff going down at the moment w...
1    the classic war of the worlds by timothy hines...
2    the film starts with a manager nicholas bell g...
3    it must be assumed that those who praised this...
4    superbly trashy and wondrously unpretentious 8...
Name: review, dtype: object

In [13]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gyanr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gyanr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [14]:
# 2.3 Remove stop words
stop_words = set(stopwords.words('english'))
train_data['review'] = train_data['review'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\gyanr/nltk_data'
    - 'C:\\Users\\gyanr\\gyan-python-workspace\\jup-workspace\\venv\\nltk_data'
    - 'C:\\Users\\gyanr\\gyan-python-workspace\\jup-workspace\\venv\\share\\nltk_data'
    - 'C:\\Users\\gyanr\\gyan-python-workspace\\jup-workspace\\venv\\lib\\nltk_data'
    - 'C:\\Users\\gyanr\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [21]:
# 2.4 Apply NLTK’s PorterStemmer
stemmer = PorterStemmer()
train_data['review'] = train_data['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\gyanr/nltk_data'
    - 'C:\\Users\\gyanr\\gyan-python-workspace\\jup-workspace\\venv\\nltk_data'
    - 'C:\\Users\\gyanr\\gyan-python-workspace\\jup-workspace\\venv\\share\\nltk_data'
    - 'C:\\Users\\gyanr\\gyan-python-workspace\\jup-workspace\\venv\\lib\\nltk_data'
    - 'C:\\Users\\gyanr\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [22]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gyanr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [23]:
# 2.4 Apply NLTK’s PorterStemmer :Retry after download of pukt_tab
stemmer = PorterStemmer()
train_data['review'] = train_data['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))

In [17]:
# 2.4.1 Create a bag-of-words matrix
count_vectorizer = CountVectorizer()
bag_of_words_matrix = count_vectorizer.fit_transform(train_data['review'])

In [18]:
# 2.4.2 Display dimensions of bag-of-words matrix
print("Dimensions of bag-of-words matrix:", bag_of_words_matrix.shape)

Dimensions of bag-of-words matrix: (25000, 120674)


In [19]:
# 2.4.3 Create a term frequency-inverse document frequency (tf-idf) matrix
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(bag_of_words_matrix)

In [20]:
# 2.4.4 Display dimensions of tf-idf matrix
print("Dimensions of tf-idf matrix:", tfidf_matrix.shape)

Dimensions of tf-idf matrix: (25000, 120674)


## Observations:

I faced issues with punkt and got help from team's channel for proper imports. This has been a challenging assignment.
