In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
nltk.download("stopwords")
nltk.download("vader_lexicon")

[nltk_data] Downloading package stopwords to C:\Users\Shivani
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Shivani
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Text Extraction

In [5]:
df = pd.read_csv("YoutubeCommentsDataSet.csv")

In [6]:
df.head(15)

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive
5,we’ve been hounding my bank to adopt apple pay...,neutral
6,we only got apple pay in south africa in 20202...,positive
7,for now i need both apple pay and the physical...,neutral
8,in the united states we have an abundance of r...,positive
9,in cambodia we have a universal qr code system...,neutral


In [7]:
df["Comment"] = df["Comment"].astype(str)

In [8]:
#removing stopwords as it will just add noise in the data for tfidf approach
stop_words = set(stopwords.words("english"))
#set(stopwords.words("english")) means coverting the list dtype into set ,why use set? because of its faster lookup rate
#when chechking if word in stop_words is much faster with a set than with a list.

## Text Preprocessing

In [9]:
# regex have functions for findall,searchmatch,split
# re.sub(pattern, replacement, string) search for all text that matches pattern and replace it with replacement.  
# in this [^a-z\s] :- '^' means negation
# did lowercasing before the re expression as it we would have removed capital letters too.
#created new column in last line
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)  # remove special chars & numbers
    text = " ".join([w for w in text.split() if w not in stop_words]) #list comprehension
    return text
df["cleaned_text"] = df["Comment"].apply(clean_text)

In [10]:
# sia is the object of class sentimentintensityanalizer
#How are polarity scores decided 
sia = SentimentIntensityAnalyzer()
print(len(sia.lexicon))        # how many words
print(list(sia.lexicon.items())[2000:4350])  # first 20 entries

7502
[('destructive', -3.0), ('destructively', -2.4), ('destructiveness', -2.4), ('destructivity', -2.2), ('destructs', -2.4), ('detached', -0.5), ('detain', -1.8), ('detained', -1.7), ('detention', -1.5), ('determinable', 0.9), ('determinableness', 0.2), ('determinably', 0.9), ('determinacy', 1.0), ('determinant', 0.2), ('determinantal', -0.3), ('determinate', 0.8), ('determinately', 1.2), ('determinateness', 1.1), ('determination', 1.7), ('determinations', 0.8), ('determinative', 1.1), ('determinatives', 0.9), ('determinator', 1.1), ('determined', 1.4), ('devastate', -3.1), ('devastated', -3.0), ('devastates', -2.8), ('devastating', -3.3), ('devastatingly', -2.4), ('devastation', -1.8), ('devastations', -1.9), ('devastative', -3.2), ('devastator', -2.8), ('devastators', -2.9), ('devil', -3.4), ('deviled', -1.6), ('devilfish', -0.8), ('devilfishes', -0.6), ('deviling', -2.2), ('devilish', -2.1), ('devilishly', -1.6), ('devilishness', -2.3), ('devilkin', -2.4), ('devilled', -2.3), ('de

In [11]:
# polarity score returns: A Python dict with 4 keys: {'neg': <float>, 'neu': <float>, 'pos': <float>, 'compound': <float>}
# used .tolist() to convert Series of dictionaries into a list of dictionaries pd.DataFrame() 
# can’t directly take a Series of dicts, but it can take a list of dicts and spread the keys into columns.
sia = SentimentIntensityAnalyzer()
vader_features = df["Comment"].apply(lambda x: sia.polarity_scores(x))
vader_df = pd.DataFrame(vader_features.tolist())

In [12]:
tfidf = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf.fit_transform(df["cleaned_text"])
tfidf_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 240119 stored elements and shape (18408, 5000)>

In [13]:
#Finding which words are kept
feature_names = tfidf.get_feature_names_out()
print(feature_names[2000:2050])   
print(len(feature_names))   

['hats' 'havent' 'hay' 'hd' 'head' 'headed' 'heading' 'headphones' 'heads'
 'healing' 'health' 'healthcare' 'healthy' 'hear' 'heard' 'hearing'
 'heart' 'hearts' 'heat' 'heaven' 'heavily' 'heavy' 'heck' 'held' 'hell'
 'hella' 'hello' 'help' 'helped' 'helpful' 'helping' 'helps' 'heres'
 'hermosa' 'hermoso' 'hero' 'heroes' 'hes' 'hey' 'hi' 'hidden' 'hide'
 'high' 'higher' 'highest' 'highlight' 'highlights' 'highly' 'highschool'
 'hikaru']
5000


In [14]:
from scipy.sparse import hstack
X = hstack([tfidf_features, vader_df.values])  # combine sparse + dense
y = df["Sentiment"]

In [15]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)

In [16]:
model = LogisticRegression(class_weight="balanced", max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [17]:
y_pred = model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[ 321  116   31]
 [ 159  637  132]
 [ 125  351 1810]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.53      0.69      0.60       468
     neutral       0.58      0.69      0.63       928
    positive       0.92      0.79      0.85      2286

    accuracy                           0.75      3682
   macro avg       0.67      0.72      0.69      3682
weighted avg       0.78      0.75      0.76      3682

Accuracy: 0.7517653449212385


In [18]:
def predict_sentiment(comment):
    # Clean the text (same preprocessing as before)
    cleaned = clean_text(comment)
    
    # TF-IDF features
    tfidf_feat = tfidf.transform([cleaned])
    
    # VADER features
    vader_feat = pd.DataFrame([sia.polarity_scores(comment)])
    
    # Combine TF-IDF + VADER
    features = hstack([tfidf_feat, vader_feat.values])
    
    # Predict sentiment
    prediction = model.predict(features)[0]
    probability=model.predict_proba(features)
    print(model.classes_)
    return prediction,probability

# Example usage
sample_comment = "I thought initially that this was a good product but after using this product it is terrible."
print("Comment:", sample_comment)
print("Predicted Sentiment:", predict_sentiment(sample_comment))

Comment: I thought initially that this was a good product but after using this product it is terrible.
['negative' 'neutral' 'positive']
Predicted Sentiment: ('negative', array([[0.85822411, 0.09857263, 0.04320326]]))


In [19]:
pip install keybert

Note: you may need to restart the kernel to use updated packages.


In [20]:
from keybert import KeyBERT

In [21]:
# Load model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
# Sample text
doc = """
TextRank is a graph-based algorithm used for keyword extraction.
KeyBERT uses BERT embeddings to generate the most relevant keywords.
It finds keywords that are semantically similar to the whole document.
"""

# Extract keywords (unigrams, bigrams, trigrams)
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 3),   # unigrams, bigrams, trigrams
    stop_words='english',
    use_mmr=True,                   # to avoid redundancy
    diversity=0.7,                  # balance relevance & diversity
    top_n=5                         # top 5 keywords
)

print("Keywords:", keywords)

Keywords: [('keyword extraction keybert', 0.729), ('textrank graph based', 0.6471), ('embeddings generate relevant', 0.3188), ('semantically', 0.222), ('graph', 0.1458)]
