In [1]:
# Importing libraries

import pandas as pd

import nltk   # nltk is for natural language processing and computational linguistics
from nltk.corpus import stopwords   # corpus is a collection of authentic text or audio organized into datasets
from nltk.sentiment import SentimentIntensityAnalyzer    # To analyse sentiment
from sklearn.feature_extraction.text import CountVectorizer    # method to convert text to numerical data
from sklearn.decomposition import LatentDirichletAllocation    # explains a set of observations through unobserved groups, and each group explains why some parts of the data are similar

In [4]:
import pymongo

MONGO_DB_URL = "mongodb+srv://MongoDB:mongodb123@cluster0.i7o85x8.mongodb.net/?retryWrites=true&w=majority"
mongo_client = pymongo.MongoClient(MONGO_DB_URL)

def get_collection_as_dataframe(database_name:str,collection_name:str)->pd.DataFrame:
    """
    Description: This function return collection as dataframe
    =========================================================
    Params:
    database_name: database name
    collection_name: collection name
    =========================================================
    return Pandas dataframe of a collection
    """
    try:
        # Reading data from database
        df = pd.DataFrame(list(mongo_client[database_name][collection_name].find()))
        if "_id" in df.columns:
            df = df.drop("_id",axis=1)
        return df
    except Exception as e:
        raise e

In [5]:
DATABASE_NAME = 'NLP'
COLLECTION_NAME = 'youtubeComments'

# Reading data
df = get_collection_as_dataframe(database_name= DATABASE_NAME, collection_name= COLLECTION_NAME)
df

Unnamed: 0.1,Unnamed: 0,0
0,0,I just love how you keep reassuring us in the ...
1,1,He is that teacher which our education system ...
2,2,"And, could you also put up some material on ho..."
3,3,"HI sir ,thanks for sharing your knowledge it r..."
4,4,"i love how you spelling it, if we could build ..."
5,5,"I just can't thank you enough, words would not..."
6,6,At 13:00 you had mentioned about weakness of L...
7,7,Have you ever used spacy when it comes to NLP?...
8,8,Great work for humanity.. thanks krish.. hats ...
9,9,Can you please guide me on learning NLP with R...


In [6]:
nltk.download('stopwords')     # used to eliminate unimportant words (commonly used words)
nltk.download('punkt')    # a tokenizer that divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences.
nltk.download('vader_lexicon')    # is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media, and works well on texts from other domains.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [7]:
# Preprocess the comments o remove the unwanted words

stop_words = set(stopwords.words('english'))

In [8]:
# Word Frequency Analysis

def word_frequency_analysis(comments):
    """
    This function prints frequency of each word in a descending order
    """
    # Convert comments to strings and handle float values
    comments = [str(comment) if not pd.isnull(comment) else '' for comment in comments]

    # Combine all comments into a single string
    all_comments = ' '.join(comments)

    # Tokenize the comments
    tokens = nltk.word_tokenize(all_comments)

    # Filter out stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Calculate word frequencies
    word_freq = nltk.FreqDist(filtered_tokens)

    # Print the most common words
    print('Most common words:')
    for word, freq in word_freq.most_common(10):
        print(f'{word}: {freq}')

In [9]:
# Sentiment Analysis

def sentiment_analysis(comments):
    """
    This function prints the over all sentiment of the text
    """
    sid = SentimentIntensityAnalyzer()

    # Calculate sentiment scores for each comment
    sentiment_scores = [sid.polarity_scores(comment) for comment in comments]

    # Calculate average sentiment scores
    avg_sentiment = sum(score['compound'] for score in sentiment_scores) / len(sentiment_scores)

    print(f'Average sentiment: {avg_sentiment}')

In [10]:
# Topic Modeling

def topic_modeling(comments):
    """
    This function prints the most discussed topics in the comments
    """
    # Create a CountVectorizer object
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

    # Fit and transform the comments
    tf = tf_vectorizer.fit_transform(comments)

    # Create an LDA model
    lda = LatentDirichletAllocation(n_components=5, random_state=42)

    # Fit the LDA model
    lda.fit(tf)

    # Print the top words for each topic
    print('Top words per topic:')
    feature_names = tf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
        print(f"Topic {topic_idx+1}: {' '.join(top_words)}")


In [11]:
# Columns of the data

df.columns

Index(['Unnamed: 0', '0'], dtype='object')

In [12]:
# Perform analysis on the comments

comments = df['0'].tolist()

In [13]:
# list of data

comments

['I just love how you keep reassuring us in the video that you got us covered from bottom to top. This is super helpful. Thank you',
 "He is that teacher which our education system genuinely needs . He's self taught hence that is visible in his lectures. Absolutely amazing !!",
 "And, could you also put up some material on how Hidden Markov Models are used in NLP? have studied them way back in 2011 during my Master's degree in the pre Deep Learning era. But don't have much practical exposure to NLP? And does acoustic model for phonemes recognition come more under speech Recognition? Could you also provide a short description on that?",
 'HI sir ,thanks for sharing your knowledge it really helps me alot sometime, i have a question. \nif LSTM has problem , why cant we directly use bidirectional LSTM instead of LSTM , can we skip LSTM and directly apply Bidirectional LSTM ?',
 'i love how you spelling it, if we could build such a pyramid for other techs, life of the learners would be much

In [14]:
# Word Frequency Analysis with count

word_frequency_analysis(comments)

Most common words:
.: 46
,: 44
?: 18
LSTM: 16
need: 12
): 10
used: 8
NLP: 8
really: 8
question: 8


In [15]:
# Sentiment Analysis

sentiment_analysis(comments)

Average sentiment: 0.3786899999999999


In [16]:
# Topic Modeling

topic_modeling(comments)

Top words per topic:
Topic 1: learning nlp deep used need recognition tell start playlist love
Topic 2: lstm really directly thanks sir hi based language libraries bidirectional
Topic 3: need lstm bank sentence river look word sentences work thanks
Topic 4: thanks thank just hats great able sir course lectures taught
Topic 5: question video krish using nlp like rasa tensorflow answering felt
