In [22]:
%pip install textblob

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
# Importing the libraries
import pandas as pd
from textblob import TextBlob
from sklearn.metrics import accuracy_score
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hello\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [56]:
# Loading the dataset (tab-separated file, path relative to the working directory).
# A forward slash is used as the separator: the previous "Data\l..." literal
# contained the invalid escape sequence "\l" (reported as a warning by Python)
# and a backslash separator only resolves on Windows.
data = pd.read_csv("Data/labeledTrainData.tsv", sep='\t')

  data = pd.read_csv("Data\labeledTrainData.tsv", sep='\t')


In [53]:
# Checking the dimensions of the dataset as (rows, columns)
data.shape

(25000, 6)

In [25]:
# Checking the first 5 rows of the dataset
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [26]:
# Checking the distribution of the sentiments
data['sentiment'].value_counts()


sentiment
1    12500
0    12500
Name: count, dtype: int64

The dataset contains an equal distribution of reviews with positive and negative sentiments.

## Sentiment Analysis with TextBlob

In [27]:
# Function to calculate polarity and predict sentiment
def analyze_sentiment_textblob(text):
    """Label `text` as positive (1) or negative (0) from its TextBlob polarity.

    The score is read from ``TextBlob(text).sentiment.polarity``.
    A score of zero or above is labelled 1 and anything below zero is
    labelled 0, so a score of exactly 0 counts as positive.
    """
    score = TextBlob(text).sentiment.polarity
    return 1 if score >= 0 else 0

In [28]:
# Applying the TextBlob sentiment analysis to every review
data['textblob_prediction']=data['review'].apply(analyze_sentiment_textblob)

In [29]:
# Checking the first rows again, now with the textblob_prediction column
data.head()

Unnamed: 0,id,sentiment,review,textblob_prediction
0,5814_8,1,With all this stuff going down at the moment w...,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,0,It must be assumed that those who praised this...,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0


In [30]:
# Accuracy of TextBlob Prediction
text_blob_accuracy=accuracy_score(data['sentiment'],data['textblob_prediction'])
print(f"TextBlob Prediction Accuracy: {text_blob_accuracy}")

TextBlob Prediction Accuracy: 0.68524


The TextBlob sentiment analyzer achieved an accuracy of 68.52% on the dataset. While this is better than random guessing (50%), there is room for improvement.

## Sentiment Analysis with VADER

In [34]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hello\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:
# initializing the VADER Sentiment intensity analyzer
sentiment_intensity_analyzer = SentimentIntensityAnalyzer()


In [38]:
# Function to calculate sentiment using VADER
def analyze_sentiment_vader(text):
    """Label `text` as positive (1) or negative (0) from its VADER compound score.

    The score is the 'compound' entry of the dict returned by
    ``sentiment_intensity_analyzer.polarity_scores(text)``. A compound score
    of zero or above yields 1; a score below zero yields 0.
    """
    compound = sentiment_intensity_analyzer.polarity_scores(text)['compound']
    # The comparison gives a boolean; int() maps True/False to 1/0.
    return int(compound >= 0)

In [39]:
# Applying the VADER sentiment analysis
data['vader_prediction']=data['review'].apply(analyze_sentiment_vader)
data.head()

Unnamed: 0,id,sentiment,review,textblob_prediction,vader_prediction
0,5814_8,1,With all this stuff going down at the moment w...,1,0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",1,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0,0
3,3630_4,0,It must be assumed that those who praised this...,1,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0,1


In [45]:
# Accuracy of VADER prediction against the labelled sentiment
vader_accuracy = accuracy_score(data['sentiment'], data['vader_prediction'])
print(f"VADER Sentiment Analysis Accuracy: {vader_accuracy}")

VADER Sentiment Analysis Accuracy: 0.69356


The VADER sentiment analyzer achieved an accuracy of 69.36% on the dataset. This is better than random guessing (50%) and also slightly better than the TextBlob sentiment analyzer (68.52%).

# Part 2: Prepping Text for a Custom Model

In [46]:
# Importing necessary libraries
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hello\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [47]:
# Defining stopwords and stemmer
stop_words=set(stopwords.words('english'))
stemmer=PorterStemmer()

In [48]:
# Function to preprocess text
def preprocess_text(text):
    """Normalise a review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither a word character nor whitespace;
      3. split on whitespace;
      4. drop tokens found in ``stop_words``;
      5. stem the remaining tokens with ``stemmer``.

    Punctuation (including apostrophes) is removed before the stop-word check,
    so a contraction such as "don't" is compared against ``stop_words`` as "dont".
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    stems = (
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in stop_words
    )
    return ' '.join(stems)
    

In [49]:
# Applying the above function to the dataframe
data['processed_review']=data['review'].apply(preprocess_text)

In [50]:
# Creating Bag-of-words Matrix
from sklearn.feature_extraction.text import CountVectorizer
vectorizer =CountVectorizer()
bow_matrix=vectorizer.fit_transform(data['processed_review'])
print(f"Bag-of-words Matrix shape: {bow_matrix.shape}")

Bag-of-words Matrix shape: (25000, 92532)


In [54]:
# Creating TF-IDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer()
tfidf_matrix=tfidf_vectorizer.fit_transform(data['processed_review'])
print(tfidf_matrix.shape)

(25000, 92532)
