In [24]:
import pandas as pd
import nltk
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import re
from nltk.corpus import wordnet   

In [25]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
df = pd.read_csv('youtube_sentiment.csv')

In [27]:
comment_body = df['Comment']

In [28]:
comments=comment_body.to_numpy()

In [29]:
comments

array(['Just subscribed, downloading this and will watch in depth later. Excited to learn from what you guys have built',
       'Could you please do ERPnext tutorial ?',
       'I hate that this is where programming is headed. I’ve chosen the wrong career. So lame. ',
       ..., 'Cant wait to see browsers getting slower :D',
       'Great intro! This is a big step forward imho',
       'Damn Awesome Content 🔥'], dtype=object)

In [30]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun if the POS tag is not recognized

In [31]:
documents = []
lemmatizer = WordNetLemmatizer()


for sen in range(0, len(comments)):
    # Remove all the special characters except '?' and "'" mark
    document = re.sub(r'[^a-zA-Z0-9?!\' ]', ' ', str(comments[sen]))
    # remove all single characters from the middle
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)    
    # Converting to lowercase
    document = document.lower()
    document = re.sub("^\d+\s|\s\d+\s|\s\d+$", ' ', document)
    # Tokenize and perform POS tagging
    tokens = word_tokenize(document)
    pos_tags = pos_tag(tokens)
    # Lemmatization
    document = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
    document = ' '.join(document)
    documents.append(document)

In [32]:
df['Comment'] = documents

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
predicted_class = df['Predicted Class']
# Define the desired class labels and their corresponding encoded values
class_labels = ['Positive', 'Negative', 'Corrective', 'Interrogative', 'Neutral']
encoded_labels = [1, 2, 3, 4, 5]

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder on the class labels and their encoded values
label_encoder.fit(class_labels)

# Map the predicted class to the corresponding encoded values
encoded_predicted_class = label_encoder.transform(predicted_class)

# Create a mapping dictionary for the encoded labels
label_mapping = dict(zip(class_labels, encoded_labels))

# Map the encoded values to the desired labels
mapped_predicted_class = [label_mapping[label] for label in predicted_class]

# Create a new column 'Label' based on the mapped predicted class
df['Label'] = mapped_predicted_class

In [35]:
df.to_csv('clean_dataset.csv', index=False)

In [36]:
df

Unnamed: 0,Comment,Predicted Class,Label
0,just subscribe download this and will watch in...,Positive,1
1,could you please do erpnext tutorial ?,Interrogative,4
2,i hate that this be where program be head ve c...,Negative,2
3,can you please share this use sharegpt ?,Interrogative,4
4,the fact this video be 3 hour when the non cha...,Negative,2
...,...,...,...
996,me start learn html cs and javascript so can m...,Neutral,5
997,basically blazor alternative that be create lo...,Neutral,5
998,cant wait to see browser get slow d,Negative,2
999,great intro ! this be big step forward imho,Positive,1


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [40]:
vectorizer = TfidfVectorizer(encoding='utf-8', stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
terms = vectorizer.get_feature_names() if hasattr(vectorizer, 'get_feature_names') else vectorizer.vocabulary_.keys()
print(len(terms))

2025


In [41]:
X.shape

(1001, 2025)

In [42]:
terms

dict_keys(['subscribe', 'download', 'watch', 'depth', 'later', 'excite', 'learn', 'guy', 'build', 'could', 'please', 'erpnext', 'tutorial', 'hate', 'program', 'head', 'choose', 'wrong', 'career', 'lame', 'share', 'use', 'sharegpt', 'fact', 'video', 'hour', 'non', 'chatgpt', 'one', 'around', 'kind', 'defeat', 'point', 'update', 'web', 'browser', 'people', 'need', 'course', 'ask', 'question', 'ah', 'man', 'well', 'good', 'hallucination', 'technical', 'background', 'work', 'fix', 'error', 'chat', 'gpt', 'waste', 'time', 'box', 'rock', 'nice', 'make', 'believe', 'world', 'still', 'place', 'linux', 'year', 'great', 'interest', 'appreciate', 'gj', 'much', 'beginner', 'add', 'english', 'subtitle', 'remove', 'bracket', 'timestamps', '00', 'youtube', 'detect', 'section', 'let', 'navigate', 'interface', 'may', 'also', 'millisecond', 'possible', 'create', 'bash', 'script', 'stop', 'start', 'tomcat', 'holly', 'service', 'reboot', 'server', 'ubuntu', 'cloud', 'deployment', 'debian', 'instead', 'win