# Summarizer

In [8]:
# Load all the packages

import youtube_dl
import webvtt
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

import nltk
from nltk.corpus import stopwords
from nltk.cluster import cosine_distance

import heapq
import networkx as nx

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import glob
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Please enter a link
link = input("Please enter a url link:\n")
if not link:
    link = "https://www.youtube.com/watch?v=1jUmohFv_zw"
print("The link is: ", link)
print("Done, proceed further")

Please enter a url link:

The link is:  https://www.youtube.com/watch?v=1jUmohFv_zw
Done, proceed further


In [3]:
os.chdir(r"C:\Users\om20817\Documents\webscraping_sentiment_analysis")

# To avoid any error while parsing the video, we have created the MyLogger class
class MyLogger(object):
    def debug(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        print(msg)


# Dictionary to accept some inputs
ydl_opts = {
    "writesubtitles": True,
    "writeautomaticsub": True,
    "writeinfojson": True,
    "format": "bestaudio/best",
    "logger": MyLogger(),
    "keepvideo": False,
    "postprocessors": [
        {
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }
    ],
    "postprocessor_args": ["-ar", "16000"],
}


# ------------------------------------------------------------------
# Ignore below 2 lines
# with youtube_dl.YoutubeDL(youtube_dl_options) as youtube_dl_client:
#    youtube_dl_client.download([link])
# ----------------------------------------------------------------------

# Extract the audio from the video link along with the subtitles as text
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    meta = ydl.extract_info(link, download=True)

print("Finished extracting audio and subtitles as text!")
print("Please proceed with extracting the below video contents!")
print()
print(
    """ 'uploader','uploader_url','upload_date','creator','title','description','categories',
      'duration','view_count', 'like_count', 'dislike_count','average_rating','start_time', 'end_time',
      'release_date', 'release_year'"""
)

Finished extracting audio and subtitles as text!
Please proceed with extracting the below video contents!

 'uploader','uploader_url','upload_date','creator','title','description','categories',
      'duration','view_count', 'like_count', 'dislike_count','average_rating','start_time', 'end_time',
      'release_date', 'release_year'


In [9]:
keys = [
    "uploader",
    "uploader_url",
    "upload_date",
    "creator",
    "title",
    "description",
    "categories",
    "duration",
    "view_count",
    "like_count",
    "dislike_count",
    "average_rating",
    "start_time",
    "end_time",
    "release_date",
    "release_year",
]

filtered_d = dict((k, meta[k]) for k in keys if k in meta)
df = pd.DataFrame.from_dict(filtered_d, orient="index").T
df.index = df["title"]

sub_titles = glob.glob('./*.en.vtt')
sub_titles[0]

vtt = webvtt.read(sub_titles[0])

# Store the starting and ending point of each sentence as part of the start and end list
start_list = list()
end_list = list()

# Storing all the lines as part of the lines list
lines = []

for x in range(len(vtt)):
    start_list.append(vtt[x].start)
    end_list.append(vtt[x].end)

for line in vtt:
    lines.append(line.text.strip().splitlines())

lines = [" ".join(item) for item in lines]

final_df = pd.DataFrame({'Start_time':start_list,'End_time':end_list,'Statement':lines})
final_df['Statement'] = [w.replace('&gt;&gt;', '') for w in final_df['Statement']]
final_df['Statement'] = [w.replace(' &gt;&gt;', '') for w in final_df['Statement']]
final_df['Statement'] = [w.replace('&gt;&gt; Reporter:', '') for w in final_df['Statement']]

# Initialize the vader model
sid_obj = SentimentIntensityAnalyzer()

# Compute sentiment scores and labels
sentiment_scores_vader = [
    sid_obj.polarity_scores(article) for article in final_df.Statement
]

sentiment_category_positive = []
sentiment_category_neutral = []
sentiment_category_negative = []
sentiment_category_compound = []

for sentiments in sentiment_scores_vader:
    sentiment_category_positive.append(sentiments["pos"])
    sentiment_category_neutral.append(sentiments["neu"])
    sentiment_category_negative.append(sentiments["neg"])
    sentiment_category_compound.append(sentiments["compound"])


# Sentiment statistics per statement
sentiment_df = pd.DataFrame(
    [
        [article for article in final_df.Statement],
        sentiment_category_positive,
        sentiment_category_neutral,
        sentiment_category_negative,
        sentiment_category_compound,
    ]
).T

sentiment_df["Start_time"] = start_list
sentiment_df["End_time"] = end_list


sentiment_df.columns = [
    "Statement",
    "positive_polarity",
    "neutral_polarity",
    "negative_polarity",
    "overall_polarity",
    "Start_time",
    "End_time",
]


sentiment_df["Sentiment"] = [
    "Positive" if w > 0 else "Negative" if w < 0 else "Neutral"
    for w in sentiment_df["overall_polarity"]
]

## Method 1 - NLTK word frequency 

In [14]:
sentences = []  # Empty list to store all the sentences

for sentence in sentiment_df["Statement"]:
    sentences.append(sentence)

# Creating the article or corpus to perform summarization
article_text = "".join(sentences)

# Pre-processing
# Removing square brackets and extra spaces
article_text = re.sub(r"\[[0-9]*\]", " ", article_text)
article_text = re.sub(r"\s+", " ", article_text)

# Removing special characters and digits
formatted_article_text = re.sub(r"[^a-zA-Z]", " ", article_text)
formatted_article_text = re.sub(r"\s+", " ", formatted_article_text)

# Use the formatted article text to create weighted frequency for the words and replace these weighted frequencies with
# the words in the article text

# Convert text to sentences
sentence_list = nltk.sent_tokenize(article_text)

# Remove all the stopwords
stop_words = stopwords.words("english")
stop_words.append(('&gt;&gt;','&gt','&gt','&gt;','Reporter'))

# Find the weighted frequency of all the words
word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stop_words:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

# Lower case all the words i.e. keys in the word frequencies dictionary
word_frequencies = {k.lower(): v for k, v in word_frequencies.items()}

# Calculate the maxium word frequency
maximum_frequency = max(word_frequencies.values())

# Calculate word frequency for each word
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / maximum_frequency

# Calculate the overall score of the sentence
sentence_scores = {}  # key: sentences, values: scores of the sentences
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(" ")) < 10000:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

# Generating the final summary
n = 2  # The number of sentences to generate in the summary
summary_sentences = heapq.nlargest(n, sentence_scores, key=sentence_scores.get)
summary = " ".join(summary_sentences)
summary

'LUCKILY FOR ME, IT’S JUST A SHORT DELAY FORLUCKILY FOR ME, IT’S JUST A SHORT DELAY FOR NOW BUT THAT’S NOT THEJUST A SHORT DELAY FOR NOW BUT THAT’S NOT THE CASE FOR SO MANYNOW BUT THAT’S NOT THE CASE FOR SO MANY OTHERS.CASE FOR SO MANY OTHERS. Reporter: HOSPITALS DECIDING MANY CLINICAL Reporter: HOSPITALS DECIDING MANY CLINICAL TRIALS COULD EXPOSEDECIDING MANY CLINICAL TRIALS COULD EXPOSE VULNERABLE PATIENTSTRIALS COULD EXPOSE VULNERABLE PATIENTS AND HEALTH CAREVULNERABLE PATIENTS AND HEALTH CARE WORKERS TO COVID-19AND HEALTH CARE WORKERS TO COVID-19 BUT ANNA’S DOCTORWORKERS TO COVID-19 BUT ANNA’S DOCTOR WORRIES SHE DOESN’TBUT ANNA’S DOCTOR WORRIES SHE DOESN’T HAVE TIME TO SPARE.WORRIES SHE DOESN’T HAVE TIME TO SPARE.'

## Method 2 - Cosine Similarity and page rank

In [19]:
# Calculate Sentence Similarity


def read_article(colname='Statement'):
    sentences = []
    for sentence in sentiment_df[colname]:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))


def sentence_similarity(sent1, sent2, stopwords=None):
    stop_words = stopwords.words("english")
    stop_words.append(("&gt;&gt;", "&gt", "&gt", "&gt;", "Reporter"))
    if stop_words is None:
        stop_words = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # Build the vector for the first sentence
    for w in sent1:
        if w in stop_words:
            continue
        vector1[all_words.index(w)] += 1

    # Build the vector for the second sentence
    for w in sent2:
        if w in stop_words:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)


def build_similarity_matrix(sentences, stop_words):

    # Create an empty similarity matrix of n*n sentence size in video
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:
                # ignore if both are same sentences
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(
                sentences[idx1], sentences[idx2], stopwords
            )
    return similarity_matrix


# Generate similarity matrix across sentences
sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

# Rank sentences in similarity matrix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)

# Calculate the scores
scores = nx.pagerank(sentence_similarity_graph)

# Sort the rank and pick top sentences
ranked_sentence = sorted(
    ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
)

# Empty list to save the summarize text
summarize_text = []

n = 5
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

summarize_text

['A F T E R   L E A R N I N G   H E R   L A S T   H O P E   T O   B E A T   S T A G E   F O U R   C O L O N',
 'T H A T   T R E A T M E N T   N O W .     R e p o r t e r :   H O S P I T A L S   A R E   A S K I N G   M A N Y   T O   D O',
 'L A S T   H O P E   T O   B E A T   S T A G E   F O U R   C O L O N   C A N C E R   I S   B E I N G',
 'A N D   F O R   T H O S E   P E O P L E   T O   B E   S O   I R R E S P O N S I B L E   A N D   T O   D R I N K   I S   M O R E',
 'B U T   A N N A ’ S   D O C T O R   W O R R I E S   S H E   D O E S N ’ T   H A V E   T I M E   T O   S P A R E .']

## Method 3 - Gensim summarization

In [22]:
sentences = []
for sentence in sentiment_df['Statement']:
    sentences.append(sentence)
    
paragraph = ''.join(sentences)


# Method a
# Summary of 5% of the original content
summ_per = summarize(paragraph,ratio=0.5)
#print("Percent Summary")
#print(summ_per)

# Method b
# Summary 200 words
summ_words = summarize(paragraph,word_count=100)
print("Word summary")
print(summ_words)

Word summary
Reporter: NEARLY 2 MILLION AMERICANS ARE Reporter: NEARLY 2 MILLION AMERICANS ARE DIAGNOSED WITH CANCERMILLION AMERICANS ARE DIAGNOSED WITH CANCER EVERY YEAR.DIAGNOSED WITH CANCER EVERY YEAR.
Reporter: HOSPITALS ARE ASKING MANY TO DO Reporter: HOSPITALS ARE ASKING MANY TO DO APPOINTMENTS ONLINE,ARE ASKING MANY TO DO APPOINTMENTS ONLINE, SKIP FOLLOWUP VISITSAPPOINTMENTS ONLINE, SKIP FOLLOWUP VISITS AND IN SOME CASESSKIP FOLLOWUP VISITS AND IN SOME CASES DELAY TREATMENTS.AND IN SOME CASES DELAY TREATMENTS.
FOR 35-YEAR-OLD ANNA, IT’S ESPECIALLY HARDFOR 35-YEAR-OLD ANNA, IT’S ESPECIALLY HARD SEEING OTHER YOUNGIT’S ESPECIALLY HARD SEEING OTHER YOUNG PEOPLE IGNORING THESEEING OTHER YOUNG PEOPLE IGNORING THE GUIDANCE ON SOCIALPEOPLE IGNORING THE GUIDANCE ON SOCIAL DISTANCING.GUIDANCE ON SOCIAL DISTANCING.
