## Installation of libraries

In [None]:
!pip install --quiet transformers==4.5.0
!pip install --quiet sentencepiece==0.1.95
!pip install --quiet textwrap3==0.9.2
!pip install --quiet nltk==3.2.5

time: 9.98 s (started: 2021-05-20 17:00:04 +00:00)


In [None]:
!pip install --quiet ipython-autotime
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.5 s (started: 2021-05-20 17:00:14 +00:00)


In [None]:
from nltk.corpus import stopwords

Extractive Summarization using TF-IDF

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# Make sure the NLTK resources used below exist on a fresh kernel:
# nltk.sent_tokenize needs 'punkt' and stopwords.words needs 'stopwords'.
# Elsewhere in this notebook they are only downloaded in later cells.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# NOTE(review): `text` is not assigned anywhere above this cell in the visible
# notebook (the article is assigned in a later cell), so this cell only works
# once that cell has been run -- confirm the intended cell order.

# Tokenization and preprocessing: split the article into sentences and load
# the English stop-word list.
sentences = nltk.sent_tokenize(text)
stop_words = set(stopwords.words("english"))

# TF-IDF vectorization: one row per sentence, stop words excluded.
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
tfidf_matrix = vectorizer.fit_transform(sentences)

# Compute cosine similarity between every pair of sentence vectors.
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Sentence ranking: a sentence's score is its mean cosine similarity to all
# sentences in the document (its similarity with itself is part of the mean).
sentence_scores = similarity_matrix.mean(axis=1)

# Pick the highest-scoring sentences, best first.
num_sentences = 3  # Adjust the number of sentences for the summary
top_sentence_indices = sentence_scores.argsort()[-num_sentences:][::-1]
summary_sentences = [sentences[i] for i in top_sentence_indices]

# Print the extractive summary
print("Extractive Summary:")
for sentence in summary_sentences:
    print("-", sentence)


Extractive Summary:
- In a recent tweet,
Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
transaction, and hence was suspending vehicle purchases using the cryptocurrency.
- After saying that his electric vehicle-making company
Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
system transaction efficiency.
- The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin.


In [None]:
# Join the selected sentences into a single newline-separated string; the LSTM
# cell further down reads it through the name `summary`.
summary = '\n'.join(summary_sentences)


Using an LSTM (next-word prediction)

In [None]:
!pip install textwrap3

Collecting textwrap3
  Downloading textwrap3-0.9.2-py2.py3-none-any.whl (12 kB)
Installing collected packages: textwrap3
Successfully installed textwrap3-0.9.2


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Sample text data: the model is trained on the extractive summary built above.
text = summary

# Tokenization and preprocessing: whitespace tokens; the vocabulary is sorted
# so the word <-> index mapping is the same on every run.
tokens = text.split()
vocab = sorted(set(tokens))
word_to_index = {word: index for index, word in enumerate(vocab)}
index_to_word = {index: word for word, index in word_to_index.items()}
vocab_size = len(vocab)

# Generate training sequences: sliding windows of max_sequence_length tokens.
# The upper bound is len(tokens) + 1 because range() excludes its end value;
# without the + 1 the final window is skipped and the last token of the text
# is never used as a prediction target.
max_sequence_length = 50
sequences = [
    tokens[end - max_sequence_length:end]
    for end in range(max_sequence_length, len(tokens) + 1)
]
if not sequences:
    raise ValueError(
        f"Need at least {max_sequence_length} tokens to build a training "
        f"sequence, got {len(tokens)}"
    )

# Create input and target data: the first max_sequence_length - 1 tokens of a
# window are the input, the last token is the target.
X = np.array([[word_to_index[word] for word in window[:-1]] for window in sequences])
y = np.array([word_to_index[window[-1]] for window in sequences])

# Define LSTM model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_length-1),
    LSTM(100),
    Dense(vocab_size, activation='softmax')
])

# Compile model (targets are integer word indices, hence the sparse loss)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train model
model.fit(X, y, epochs=100, verbose=2)

# Function to generate question and answer
def generate_question_and_answer(text):
    """Predict the word that follows ``text`` and wrap it in a Q/A template.

    The last ``max_sequence_length - 1`` whitespace tokens of ``text`` are
    mapped to vocabulary indices and fed to the module-level ``model``; the
    highest-probability word is inserted into a fixed question string and a
    fixed answer string.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        A ``(question, answer)`` tuple of strings.

    Note:
        Words missing from ``word_to_index`` are mapped to index 0, and inputs
        shorter than the window are left-padded with 0 as well. Index 0 is
        also a real vocabulary entry, so both cases are approximations.
    """
    window = max_sequence_length - 1

    # Tokenization and preprocessing
    input_sequence = [word_to_index.get(word, 0) for word in text.split()[-window:]]

    # The model was built for inputs of exactly `window` tokens; pad shorter
    # inputs so they have that length too.
    model_input = pad_sequences([input_sequence], maxlen=window)

    # Generate next word using trained model; int() turns the NumPy scalar
    # returned by argmax into a plain Python int before the dict lookup.
    predicted_index = int(np.argmax(model.predict(model_input)[0]))
    predicted_word = index_to_word.get(predicted_index, "<UNK>")

    # Generate question and answer
    question = f"What is the {predicted_word.capitalize()}?"
    answer = f"The {predicted_word} is..."

    return question, answer

# Generate question and answer from the training text itself: the function
# feeds the last max_sequence_length - 1 tokens of `text` to the model and
# uses the predicted next word in both strings.
question, answer = generate_question_and_answer(text)
print("Question:", question)
print("Answer:", answer)


Epoch 1/100
2/2 - 3s - loss: 4.2631 - 3s/epoch - 2s/step
Epoch 2/100
2/2 - 0s - loss: 4.2537 - 253ms/epoch - 126ms/step
Epoch 3/100
2/2 - 0s - loss: 4.2442 - 21ms/epoch - 11ms/step
Epoch 4/100
2/2 - 0s - loss: 4.2332 - 251ms/epoch - 126ms/step
Epoch 5/100
2/2 - 0s - loss: 4.2153 - 464ms/epoch - 232ms/step
Epoch 6/100
2/2 - 0s - loss: 4.1808 - 262ms/epoch - 131ms/step
Epoch 7/100
2/2 - 0s - loss: 4.0922 - 25ms/epoch - 12ms/step
Epoch 8/100
2/2 - 0s - loss: 3.9129 - 20ms/epoch - 10ms/step
Epoch 9/100
2/2 - 0s - loss: 3.8364 - 22ms/epoch - 11ms/step
Epoch 10/100
2/2 - 0s - loss: 3.8073 - 18ms/epoch - 9ms/step
Epoch 11/100
2/2 - 0s - loss: 3.7678 - 21ms/epoch - 11ms/step
Epoch 12/100
2/2 - 0s - loss: 3.7264 - 19ms/epoch - 9ms/step
Epoch 13/100
2/2 - 0s - loss: 3.6895 - 19ms/epoch - 10ms/step
Epoch 14/100
2/2 - 0s - loss: 3.6519 - 20ms/epoch - 10ms/step
Epoch 15/100
2/2 - 0s - loss: 3.6200 - 20ms/epoch - 10ms/step
Epoch 16/100
2/2 - 0s - loss: 3.5996 - 21ms/epoch - 11ms/step
Epoch 17/100
2/

In [None]:
from textwrap3 import wrap

# Source article used by the summarization, keyword-extraction and
# question-generation cells in this notebook.
text = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin.  In a recent tweet,
Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
transaction, and hence was suspending vehicle purchases using the cryptocurrency.  A day later he again tweeted saying, “To be clear, I strongly
believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”.  It triggered a downward spiral for Bitcoin value but
the cryptocurrency has stabilised since.   A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""

# Print the article wrapped to 150 characters per line.
for wrp in wrap(text, 150):
  print (wrp)
print ("\n")

Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin.  In a recent tweet,
Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
transaction, and hence was suspending vehicle purchases using the cryptocurrency.  A day later he again tweeted saying, “To be clear, I strongly
believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”.  It triggered a down

# **Summarization with T5**

In [None]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained 't5-base' tokenizer and model; the model is moved to
# `device` as soon as it is loaded.
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)


time: 20.9 s (started: 2021-05-20 17:02:02 +00:00)


In [None]:
import random
import numpy as np

def set_seed(seed: int):
    """Seed every random number generator this notebook uses.

    Calls, in order, ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all`` with ``seed``.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)

time: 3.71 ms (started: 2021-05-20 17:02:25 +00:00)


In [None]:
import nltk
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize

def postprocesstext(content):
  """Upper-case the first character of every sentence in ``content``.

  The text is split with ``sent_tokenize``. Only the first character of each
  sentence is changed; the rest is left as it is. ``str.capitalize()`` is
  deliberately not used because it also lower-cases the remainder of the
  string, which turns names such as "Tesla" or "Bitcoin" into "tesla" and
  "bitcoin".

  Args:
    content: Text to post-process.

  Returns:
    The sentences concatenated, each one preceded by a single space (so a
    non-empty result starts with a space). Empty input gives "".
  """
  final = ""
  for sent in sent_tokenize(content):
    # Slicing keeps this safe for an empty sentence string.
    sent = sent[:1].upper() + sent[1:]
    final = final + " " + sent
  return final


def summarizer(text,model,tokenizer):
  """Summarize ``text`` with a T5-style model and tidy up the result.

  Newlines in the input are replaced by spaces, the "summarize: " task prefix
  is prepended, and the input is truncated to 512 tokens. The summary is
  generated with beam search (3 beams, between 75 and 300 tokens, no repeated
  bigrams), passed through ``postprocesstext`` and stripped.

  Args:
    text: Document to summarize.
    model: Model exposing ``generate`` and a ``device`` attribute.
    tokenizer: Tokenizer exposing ``encode_plus`` and ``decode``.

  Returns:
    The post-processed summary string.
  """
  text = text.strip().replace("\n"," ")
  text = "summarize: "+text
  max_len = 512
  # Move the encoded inputs to the device of the model that was passed in,
  # rather than relying on a notebook-global `device` variable. `padding=False`
  # replaces the deprecated `pad_to_max_length=False`.
  encoding = tokenizer.encode_plus(text,
                                   max_length=max_len,
                                   padding=False,
                                   truncation=True,
                                   return_tensors="pt").to(model.device)

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = model.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  early_stopping=True,
                                  num_beams=3,
                                  num_return_sequences=1,
                                  no_repeat_ngram_size=2,
                                  min_length = 75,
                                  max_length=300)

  # num_return_sequences=1, so only the first sequence is needed.
  summary = tokenizer.decode(outs[0],skip_special_tokens=True)
  summary = postprocesstext(summary)
  summary = summary.strip()

  return summary


# Run the T5 summarizer on the article text.
summarized_text = summarizer(text,summary_model,summary_tokenizer)


# Show the article and its summary, each wrapped to 150 characters per line.
print ("\noriginal Text >>")
for wrp in wrap(text, 150):
  print (wrp)
print ("\n")
print ("Summarized Text >>")
for wrp in wrap(summarized_text, 150):
  print (wrp)
print ("\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

original Text >>
Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin.  In a recent tweet,
Musk put out a statement from Tesla that it was “conce

# **Answer Span Extraction (Keywords and Noun Phrases)**

In [None]:
!pip install --quiet git+https://github.com/boudinfl/pke.git@dc4d5f21e0ffe64c4df93c46146d29d1c522476b
!pip install --quiet flashtext==2.7

  Building wheel for pke (setup.py) ... [?25l[?25hdone
time: 11.8 s (started: 2021-05-20 17:04:38 +00:00)


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import pke
import traceback

def get_nouns_multipartite(content):
    """Extract up to 15 noun keyphrases from ``content`` with pke's MultipartiteRank.

    Extraction is best-effort: if any step raises an ordinary exception, the
    traceback is printed and an empty list is returned. ``KeyboardInterrupt``
    and ``SystemExit`` are not swallowed, so the cell can still be interrupted.

    Args:
        content: Text to extract keyphrases from.

    Returns:
        A list of keyphrase strings in the order returned by
        ``extractor.get_n_best``, or ``[]`` on failure.
    """
    out=[]
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)
        # Candidates are noun / proper-noun sequences that do not contain
        # punctuation marks or stopwords.
        pos = {'PROPN','NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)
        # Build the Multipartite graph and rank candidates using random walk,
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)

        # Each entry is indexed as (phrase, ...); keep only the phrase.
        out = [val[0] for val in keyphrases]
    except Exception:
        out = []
        traceback.print_exc()

    return out

In [None]:
from flashtext import KeywordProcessor


def get_keywords(originaltext,summarytext,max_keywords=4):
  """Return keyphrases of the original text that also occur in the summary.

  Keyphrases are extracted from ``originaltext`` with
  ``get_nouns_multipartite`` and then looked up in ``summarytext`` with a
  ``KeywordProcessor``. The result keeps the order in which
  ``get_nouns_multipartite`` returned the keyphrases.

  Args:
    originaltext: Full document from which keyphrases are extracted.
    summarytext: Summary in which the keyphrases must occur.
    max_keywords: Maximum number of keyphrases to return. Defaults to 4;
      ``None`` returns every match.

  Returns:
    A list of at most ``max_keywords`` keyphrase strings.
  """
  keywords = get_nouns_multipartite(originaltext)
  print ("keywords unsummarized: ",keywords)
  keyword_processor = KeywordProcessor()
  for keyword in keywords:
    keyword_processor.add_keyword(keyword)

  # A set removes duplicates and makes the membership test below constant-time.
  keywords_found = set(keyword_processor.extract_keywords(summarytext))
  print ("keywords_found in summarized: ",list(keywords_found))

  important_keywords = [keyword for keyword in keywords if keyword in keywords_found]

  return important_keywords[:max_keywords]


# Keyphrases come from the full article and are kept only if they also occur
# in the T5 summary; these become the answers for question generation.
imp_keywords = get_keywords(text,summarized_text)
print (imp_keywords)


keywords unsummarized:  ['elon musk', 'bitcoin', 'dogecoin', 'statements', 'tesla', 'tweets', 'cryptocurrency', 'vehicle', 'musk', 'system transaction efficiency', 'currency market', 'month low', 'fuels', 'company', 'world']
keywords_found in summarized:  ['dogecoin', 'month low', 'world', 'tesla', 'company', 'system transaction efficiency', 'vehicle', 'cryptocurrency', 'musk', 'bitcoin']
['bitcoin', 'dogecoin', 'tesla', 'cryptocurrency']
time: 753 ms (started: 2021-05-20 17:05:48 +00:00)


# **Question generation with T5**

In [None]:
# Tokenizer and model are both loaded from the 'ramsrigouthamg/t5_squad_v1'
# checkpoint; the model is moved to `device` as soon as it is loaded.
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1').to(device)

time: 8.2 s (started: 2021-05-20 17:08:47 +00:00)


In [None]:
def get_question(context,answer,model,tokenizer):
  """Generate a question about ``context`` whose answer is ``answer``.

  The model input is the string "context: <context> answer: <answer>",
  truncated to 384 tokens. The question is generated with beam search
  (5 beams, at most 72 tokens, no repeated bigrams); any "question:" marker
  in the decoded text is removed and the result is stripped.

  Args:
    context: Passage the question should be about.
    answer: Answer span the question should target.
    model: Model exposing ``generate`` and a ``device`` attribute.
    tokenizer: Tokenizer exposing ``encode_plus`` and ``decode``.

  Returns:
    The generated question string.
  """
  prompt = "context: {} answer: {}".format(context,answer)
  # Move the encoded inputs to the device of the model that was passed in,
  # rather than relying on a notebook-global `device` variable. `padding=False`
  # replaces the deprecated `pad_to_max_length=False`.
  encoding = tokenizer.encode_plus(prompt,
                                   max_length=384,
                                   padding=False,
                                   truncation=True,
                                   return_tensors="pt").to(model.device)
  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = model.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  early_stopping=True,
                                  num_beams=5,
                                  num_return_sequences=1,
                                  no_repeat_ngram_size=2,
                                  max_length=72)

  # num_return_sequences=1, so only the first sequence is needed.
  decoded = tokenizer.decode(outs[0],skip_special_tokens=True)

  question = decoded.replace("question:","")
  return question.strip()



# Show the summary that is used as the context for question generation.
for wrp in wrap(summarized_text, 150):
  print (wrp)
print ("\n")

# Generate one question per selected keyword; the keyword itself is printed
# (capitalized) as the answer.
for answer in imp_keywords:
  ques = get_question(summarized_text,answer,question_model,question_tokenizer)
  print (ques)
  print (answer.capitalize())
  print ("\n")


Musk tweeted that his electric vehicle-making company tesla will not accept payments in bitcoin because of environmental concerns. He also said that
the company was working with developers of dogecoin to improve system transaction efficiency. The world's largest cryptocurrency hit a two-month low,
while doge coin rallied by about 20 percent. Musk has in recent months often tweeted in support of crypto, but rarely for bitcoin.


What cryptocurrency did Musk rarely tweet about?
Bitcoin


What did Musk say he was working with to improve system transaction efficiency?
Dogecoin


What company did Musk say would not accept bitcoin payments?
Tesla


What has Musk often tweeted in support of?
Cryptocurrency


time: 1.04 s (started: 2021-05-20 17:09:00 +00:00)
