In [None]:
!pip install kaggle

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install evaluate

In [None]:
!pip install rouge-score

In [None]:
!pip install py7zr

In [None]:
import pandas as pd
import numpy as np
import shutil

import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from datasets import load_dataset
from rouge_score import rouge_scorer
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio
from IPython.display import display
from plotly.offline import init_notebook_mode

import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import shapiro, skew, anderson, kstest, gaussian_kde,spearmanr
import math

import warnings
warnings.filterwarnings("ignore")

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import pipeline
from transformers import DataCollatorForSeq2Seq
import tensorflow as tf
import evaluate

from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk

In [None]:
# Mount Google Drive inside the Colab runtime; the Kaggle API token is copied
# from /content/drive/MyDrive in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle_api/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

<div id = 'eda'
     style="font-family: Calibri, serif; text-align: left;">
    <hr style="border: none;
               border-top: 2.85px solid #041445;
               width: 100%;
               margin-top: 62px;
               margin-bottom: auto;
               margin-left: 0;">
    <div style="font-size: 56px; letter-spacing: 2.25px;color: #02011a;"><b>Exploring the Dataset</b></div>
</div>

In [None]:
# Download the CNN/DailyMail summarization archive with the Kaggle CLI
# (the output below shows it landing in /content).
!kaggle datasets download gowrishankarp/newspaper-text-summarization-cnn-dailymail


Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 98% 494M/503M [00:04<00:00, 104MB/s]
100% 503M/503M [00:04<00:00, 108MB/s]


In [None]:
!unzip /content/newspaper-text-summarization-cnn-dailymail.zip

Archive:  /content/newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail/test.csv  
  inflating: cnn_dailymail/train.csv  
  inflating: cnn_dailymail/validation.csv  


In [None]:
# Load the CNN/DailyMail training split extracted by the previous cell
cnn_train = pd.read_csv('/content/cnn_dailymail/train.csv')

In [None]:
# Load the three XSum splits from the Hugging Face Hub.
# The xsum repository ships a loading script; without trust_remote_code=True the
# call stops and waits for a y/N answer (see the prompt in the output below),
# which blocks an unattended "Run All".
train = load_dataset("xsum", split="train", trust_remote_code=True)
val = load_dataset("xsum", split="validation", trust_remote_code=True)
test = load_dataset("xsum", split="test", trust_remote_code=True)

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

The repository for xsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/xsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


(…)SUM-EMNLP18-Summary-Data-Original.tar.gz:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [None]:
# Let pandas show up to 1000 characters per cell so long text columns are readable
pd.options.display.max_colwidth = 1000

In [None]:
def display_feature_list(features, feature_type):
    '''
    Print a header for one feature category followed by its feature names.

    Parameters:
    features = List of column names that belong to the category
    feature_type = Label of the category, used in the header line

    An empty (or otherwise falsy) list is reported as 'None'.
    '''
    header = f"\n{feature_type} Features: "
    print(header)

    if features:
        print(', '.join(features))
    else:
        print('None')

def describe_df(df):
    """
    Print a structural overview of a dataframe and publish its feature lists
    as module-level globals.

    The globals set are:
    categorical_features = Columns with 'object' dtype
    binary_features = Non-object columns with at most two distinct values
    continuous_features = The remaining non-object columns

    After that the function prints the shape, missing values per column,
    number of duplicated rows, dtypes and the three feature lists, and
    displays the first and last five rows.
    """
    global categorical_features, continuous_features, binary_features

    # Classify every column exactly once
    categorical_features, binary_features, continuous_features = [], [], []
    for col in df.columns:
        if df[col].dtype == 'object':
            categorical_features.append(col)
        elif df[col].nunique() <= 2:
            binary_features.append(col)
        else:
            continuous_features.append(col)

    frame_kind = type(df).__name__
    n_rows, n_cols = df.shape

    print(f"\n{frame_kind} shape: {df.shape}")
    print(f"\n{n_rows:,.0f} samples")
    print(f"\n{n_cols:,.0f} attributes")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')
    print(f'\nData Types: \n{df.dtypes}')

    feature_groups = (
        (categorical_features, 'Categorical'),
        (continuous_features, 'Continuous'),
        (binary_features, 'Binary'),
    )
    for names, label in feature_groups:
        display_feature_list(names, label)

    print(f'\n{frame_kind} Head: \n')
    display(df.head(5))
    print(f'\n{frame_kind} Tail: \n')
    display(df.tail(5))

In [None]:
def histogram_boxplot(df, hist_color, box_color, height, width, legend, name, template=None):
    '''
    Plot, for every numeric column of `df`, a box plot next to a density curve.

    Parameters:
    df = Dataframe whose numeric columns are plotted (one figure per column)
    hist_color = The color of the density curve
    box_color = The color of the box plot
    height and width = Image size
    legend = Whether to display the legend or not
    name = Prefix of the figure title ("<name> Word Count")
    template = Plotly template for the figures. When omitted, a module-level
               `template` variable is used if one exists; otherwise no template
               is passed and Plotly applies its default.

    A failure while plotting one column is reported and the remaining columns
    are still plotted.
    '''
    if template is None:
        # Earlier versions read a global named `template`; honour it when it is
        # defined, but no longer fail with a NameError when it is not.
        template = globals().get('template')

    features = df.select_dtypes(include=[np.number]).columns.tolist()

    for feat in features:
        try:
            fig = make_subplots(
                rows=1,
                cols=2,
                subplot_titles=["Box Plot", "Histogram"],
                horizontal_spacing=0.2
            )

            # Kernel density estimate evaluated on 200 evenly spaced points
            density = gaussian_kde(df[feat])
            x_vals = np.linspace(min(df[feat]), max(df[feat]), 200)
            density_vals = density(x_vals)

            fig.add_trace(go.Scatter(x=x_vals, y=density_vals, mode='lines',
                                     fill='tozeroy', name="Density", line_color=hist_color), row=1, col=2)
            fig.add_trace(go.Box(y=df[feat], name="Box Plot", boxmean=True, line_color=box_color), row=1, col=1)

            layout_options = dict(
                title={'text': f'<b>{name} Word Count<br><sup><i>&nbsp;&nbsp;&nbsp;&nbsp;{feat}</i></sup></b>',
                       'x': .025, 'xanchor': 'left'},
                margin=dict(t=100),
                showlegend=legend,
                height=height, width=width,
            )
            if template is not None:
                layout_options['template'] = template
            fig.update_layout(**layout_options)

            fig.update_yaxes(title_text="<b>Words</b>", row=1, col=1, showgrid=False)
            fig.update_xaxes(title_text="", row=1, col=1, showgrid=False)

            fig.update_yaxes(title_text="<b>Frequency</b>", row=1, col=2, showgrid=False)
            fig.update_xaxes(title_text="<b>Words</b>", row=1, col=2, showgrid=False)

            fig.show()
            print('\n')
        except Exception as e:
            # Best effort: keep plotting the other columns, but say which one failed
            print(f"An error occurred while plotting '{feat}': {e}")

In [None]:
def plot_correlation(df, title, subtitle, height, width, font_size, colormap=None, template=None):
    '''
    Plot the lower triangle of the correlation matrix of the numeric features
    of `df` as an annotated heatmap.

    Parameters:
    df = Dataframe whose numeric columns are correlated
    title = Main title of the figure ("<title> Heatmap")
    subtitle = Subtitle shown under the title
    height = Define height
    width = Define width
    font_size = Define the font size for the annotations
    colormap = Plotly colorscale. When omitted, a module-level `colormap`
               variable is used if one exists; otherwise Plotly's default applies.
    template = Plotly template. When omitted, a module-level `template`
               variable is used if one exists; otherwise Plotly's default applies.
    '''
    # Earlier versions read the globals `colormap` and `template`; honour them when
    # they are defined, but no longer fail with a NameError when they are not.
    if colormap is None:
        colormap = globals().get('colormap')
    if template is None:
        template = globals().get('template')

    corr = np.round(df.corr(numeric_only=True), 2)
    n_features = corr.shape[0]
    if n_features < 2:
        print("At least two numeric features are needed to plot a correlation heatmap.")
        return

    # Row i keeps only the entries strictly below the diagonal (columns 0..i-1)
    lower_triangle = [corr.iloc[i, :i].tolist() for i in range(1, n_features)]
    z_values = lower_triangle[::-1]

    colorscale_options = {} if colormap is None else {'colorscale': colormap}

    fig = ff.create_annotated_heatmap(z=z_values,
                                      x=corr.index.tolist()[:-1],
                                      y=corr.columns.tolist()[1:][::-1],
                                      **colorscale_options)

    layout_options = dict(
        title={'text': f"<b>{title} Heatmap<br><sup>&nbsp;&nbsp;&nbsp;&nbsp;<i>{subtitle}</i></sup></b>",
               'x': .025, 'xanchor': 'left', 'y': .95},
        margin=dict(t=210, l=110),
        yaxis=dict(autorange='reversed', showgrid=False),
        xaxis=dict(showgrid=False),
        height=height, width=width,
    )
    if template is not None:
        layout_options['template'] = template
    fig.update_layout(**layout_options)

    # Second trace on top of the annotated heatmap, added to display the color bar
    fig.add_trace(go.Heatmap(z=z_values,
                             showscale=True,
                             visible=True,
                             **colorscale_options))

    for annotation in fig.layout.annotations:
        annotation.font.size = font_size

    fig.show()

In [None]:
def compute_tfidf(df_column, ngram_range=(1,1), max_features=15):
    """
    Build a dense TF-IDF table for a text column.

    Parameters:
    df_column = Series of texts; missing values are treated as empty strings
    ngram_range = (min_n, max_n) n-gram sizes handed to TfidfVectorizer
    max_features = Vocabulary size limit handed to TfidfVectorizer

    Returns a dataframe with one row per text and one column per retained term.
    English stop words are removed.
    """
    texts = df_column.fillna('')
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=ngram_range, max_features=max_features)
    weights = tfidf.fit_transform(texts)
    return pd.DataFrame(weights.toarray(), columns=tfidf.get_feature_names_out())

<div id = 'train'
     style="font-family: Calibri, serif; text-align: left;">
    <hr style="border: none;
               width: 100%;
               margin-top: 62px;
               margin-bottom: auto;
               margin-left: 0;">
    <div style="font-size: 32px; letter-spacing: 2.25px;color: #02011a;"><b>Train Dataset</b></div>
</div>

In [None]:
# Print a structural summary of the CNN/DailyMail training dataframe.
# describe_df also sets the global feature lists (categorical_features, ...)
# that later cells rely on.
describe_df(cnn_train)


DataFrame shape: (287113, 3)

287,113 samples

3 attributes

Missing Data: 
id            0
article       0
highlights    0
dtype: int64

Duplicates: 0

Data Types: 
id            object
article       object
highlights    object
dtype: object

Categorical Features: 
id, article, highlights

Continuous Features: 
None

Binary Features: 
None

DataFrame Head: 



Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordain...","Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed ."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,"(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-Dade Police Department, working in the division that investigates allegations of wrongdoing by cops. Outside the office, authorities allege that the 45-year-old longtime officer worked with a drug trafficking organization to help plan a murder plot and get guns. A criminal complaint unsealed in U.S. District Court in New Jersey Tuesday accuses Mata, also known as ""The Milk Man,"" of using his role as a police officer to help the drug trafficking organization in exchange for money and gifts, including a Rolex watch. In one instance, the complaint alleges, Mata arranged to pay two assassins to kill rival drug dealers. The killers would pose as cops, pulling over their targets before shooting them, according to the complaint. ""Ultimately, the (organization) decided not to move forward with the murder plot, but Mata still received a payment for setting up the meetings,"" federal prosecutors said in a statement. The comp...","Criminal complaint: Cop used his role to help cocaine traffickers .\nRalph Mata, an internal affairs lieutenant, allegedly helped group get guns .\nHe also arranged to pay two assassins in a murder plot, a complaint alleges ."
2,00027e965c8264c35cc1bc55556db388da82b07f,"A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years. Craig Eccleston-Todd, 27, was driving home from a night at a pub when he received a text message. As he was reading or replying to it, he veered across the road while driving round a bend and smashed into Rachel Titley’s car coming the other way. Craig Eccleston-Todd, 27 (left) was using his mobile phone when he crashed head-on into the car being driven by Rachel Titley, 28 (right). She died later from her injuries . The head-on crash took place in October 2013. Mr Eccleston-Todd's car was barely recognisable (pictured) Police said Eccleston-Todd had drunk at least three or four pints of beer before getting behind the wheel. He was found guilty of causing death by dangerous driving at Portsmouth Crown Court yesterday. Miss Titley, a 28-year-old solicitor’s clerk from Cowes, Isle of Wight, had also spent the evening with friends at a pub but had not drunk any alc...","Craig Eccleston-Todd, 27, had drunk at least three pints before driving car .\nWas using phone when he veered across road in Yarmouth, Isle of Wight .\nCrashed head-on into 28-year-old Rachel Titley's car, who died in hospital .\nPolice say he would have been over legal drink-drive limit at time of crash .\nHe was found guilty at Portsmouth Crown Court of causing death by dangerous driving ."
3,0002c17436637c4fe1837c935c04de47adb18e9a,"(CNN) -- With a breezy sweep of his pen President Vladimir Putin wrote a new chapter into Crimea's turbulent history, committing the region to a future returned to Russian domain. Sixty years prior, Ukraine's breakaway peninsula was signed away just as swiftly by Soviet leader Nikita Khrushchev. But dealing with such a blatant land grab on its eastern flank won't be anywhere near as quick and easy for Europe's 28-member union. Because, unlike Crimea's rushed referendum, everyone has a say. After initially slapping visa restrictions and asset freezes on a limited number of little known politicians and military men, Europe is facing urgent calls to widen the scope of its measures to target the Russian business community in particular. The logic of this is that those who run Russia and own it are essentially two sides of the coin. Alexei Navalny, one-time Moscow mayoral contender now under house arrest for opposing the current regime, called for Europe's leaders to ban everyone -- fro...","Nina dos Santos says Europe must be ready to accept sanctions will hurt both sides .\nTargeting Russia's business community would be one way of sapping their support for President Putin, she says .\nBut she says Europe would have a hard time keeping its factories going without power from the east ."
4,0003ad6ef0c37534f80b55b4235108024b407f0b,"Fleetwood are the only team still to have a 100% record in Sky Bet League One as a 2-0 win over Scunthorpe sent Graham Alexander’s men top of the table. The Cod Army are playing in the third tier for the first time in their history after six promotions in nine years and their remarkable ascent shows no sign of slowing with Jamie Proctor and Gareth Evans scoring the goals at Glanford Park. Fleetwood were one of five teams to have won two out of two but the other four clubs - Peterborough, Bristol City, Chesterfield and Crawley - all hit their first stumbling blocks. Posh were defeated 2-1 by Sheffield United, who had lost both of their opening contests. Jose Baxter’s opener gave the Blades a first-half lead, and although it was later cancelled out by Shaun Brisley’s goal, Ben Davies snatched a winner six minutes from time. In the lead: Jose Baxter (right) celebrates opening the scoring for Sheffield United . Up for the battle: Sheffield United's Michael Doyle (left) challenges Peter...","Fleetwood top of League One after 2-0 win at Scunthorpe .\nPeterborough, Bristol City, Chesterfield and Crawley all drop first points of the season .\nStand-in striker Matt Done scores a hat-trick as Rochdale thrash Crewe 5-2 .\nWins for Notts County and Yeovil .\nCoventry/Bradford and Oldham/Port Vale both end in draws .\nA late Stephen Bywater own goal denies Gillingham three points against Millwall ."



DataFrame Tail: 



Unnamed: 0,id,article,highlights
287108,fffdfb56fdf1a12d364562cc2b9b1d4de7481dee,"By . James Rush . Former first daughter Chelsea Clinton has indicated she could one day be open to running for office. The only child of former U.S. President Bill Clinton has said she is no longer able to completely rule out the possibility of making a political bid if, at some point, she did not support her local representatives. The 34-year-old suggested she may be open to running for a number of offices, from mayor to senator. Scroll down for video . Future bid: Former first daughter Chelsea Clinton has said she may one day consider running for office . Chelsea Clinton, pictured with her father, former U.S. President Bill Clinton, suggested she may one day be open to running for a number of offices, from mayor to senator . In an interview, Clinton told Fast Company how ever since being a child she had always been asked about going into politics, with the answer usually being a plain 'no'. But now, she said, the answer was not so clear. She said: 'I live in a city and a state an...",Chelsea Clinton said question of running for office is no longer 'visceral no'\nSays she has been asked about running for office ever since being a child .\nBut she no longer completely rules out the possibility of a political bid .
287109,fffeecb8690b85de8c3faed80adbc7a978f9ae2a,"An apologetic Vanilla Ice has given his first proper interview following his arrest this week, saying he is struggling to eat and sleep and that he has never felt worse. The rapper turned renovation show reality star - charged with burglary and grand theft after allegedly pilfering property from a foreclosed home in Palm Beach, Florida - seemed teary as he said he has barely eaten or slept since turning himself into police on Wednesday. 'I'm a family man, I've got kids, I'm sorry to everybody for whatever this is,' Van Winkle told WPBP News as he left Florida for Columbus, Ohio, on Friday. 'I really love the community, where I live here in Palm Beach, I do a lot of things... hopefully people can see who I am, the real me.' Scroll down for video . 'I'm sorry to everybody': Vanilla Ice gave his first proper interview since being released from lock up earlier this week while leaving Florida on Friday bound for Columbus Ohio. He was teary, apologetic and upset . 'Worst day': The 47-ye...","Vanilla Ice, 47 - real name Robert Van Winkle - was arrested in Lantana, Florida, on Wednesday and charged with burglary and grand theft .\nCops say he took furniture, a pool heater and bicycles from a vacant $1million home near a property he was working on for his reality show .\n'The Vanilla Ice Project' reinvented Van Winkle as a renovation king .\nHe allegedly told his crew he owned the home and sent them over twice to pilfer the property .\nThe items were later found inside his own house, according to authorities .\nHe says it was a 'misunderstanding' that was 'blown out of proportion'"
287110,ffff5231e4c71544bc6c97015cdb16c60e42b3f4,"America's most lethal sniper claimed he wished he'd 'killed more' Iraqis despite accumulating 160 confirmed kills - with the true number estimated to be as high as 255. Speaking just months before his untimely death, Chris Kyle insisted his only regret was that US lives were lost because he shot too few of the enemy. The late veteran's wartime exploits are portrayed in the divisive Oscar-tipped movie American Sniper, starring Bradley Cooper and directed by Clint Eastwood. Little regrets: In an interview three years ago America's most lethal sniper Chris Kyle claimed he wished he'd 'killed more' Iraqis . Speaking to Toby Harnden of The Sunday Times three years ago, Kyle was asked whether he regretted any of his kills while on tour. Replying in his characteristically soft Texan drawl, the former sniper, said: 'No, sir, not at all. 'To be honest with you, I wish I'd killed more because every kill saved American lives and that was what I was out there for.' Kyle was thrilled by the suc...",America's most lethal sniper made comment in interview three years ago .\nSaid only regret was that US lives were lost because he killed too few .\nFilm based on his book has been a box office hit but also divided opinion .\nTV host Bill Maher said film's popularity was down to 'psychopath' hero .
287111,ffff924b14a8d82058b6c1c5368ff1113c1632af,"By . Sara Malm . PUBLISHED: . 12:19 EST, 8 March 2013 . | . UPDATED: . 13:05 EST, 8 March 2013 . Israel has been invaded by a swarm of more than one million locusts, sending the nation into panic that the country could be plagued by the insects over the forthcoming Passover holiday. However, local chefs welcome the insects as the grasshopper-relatives are not only tasty, but also kosher. Said to taste like a mix between a sunflower seed and a baby shrimp, what may appear to be a plague, may in fact be the perfect Passover treat. Super swarm: More than one million locusts have crossed the border from Egypt to Israel . Crackdown: A man holds locusts as the swarm is tested on the Israeli border with Egypt . A swarm of an estimated 30 million . insects had been devastating crops in Egypt, putting authorities on high alert as sparrow-sized locusts began making their way . into Israel earlier this week. Reports suggest that the insects are . mainly concentrated in areas of southern Israe...",A swarm of more than one million has crossed border from Egypt to Israel .\nLocal chefs are cheering at the prospect of 'wild' locusts - which are kosher .\nMimicking one of the ten plagues of the Bible week before Passover .\nIsrael's Agriculture Ministry set up an emergency hotline and task force .
287112,ffffd563a96104f5cf4493cfa701a65f31b06abf,"(CNN)Former Florida Gov. Jeb Bush has decided to ""actively explore"" a presidential bid after conversations with his family over Thanksgiving, he said in a Facebook post Tuesday. While Bush had been making steps toward a presidential bid with recent speeches in early primary states and plans to release documents from his time period as governor, the timing of Tuesday's announcement was widely unexpected and will quicken the pace for some would-be Republican primary challengers. In the post, Bush writes that over the holiday he ""talked about the future of the nation"" with his family. ""As a result of these conversations and thoughtful consideration of the kind of strong leadership I think America needs, I have decided to actively explore the possibility of running for President of the United States,"" he said. Bush has been moving toward a run for months and in recent days sparked renewed speculation over his plans with a visit to the all-important early primary state of South Carolina...",Other 2016 hopefuls maintain that Bush's announcement wouldn't prevent them from running .\nBush posted Tuesday on his social media accounts that he's exploring a presidential bid .\nBush has been making moves toward a run -- particularly in the past 48 hours .\nDemocrats said that Bush's announcement was more about keeping his name in the news .


In [None]:
# Show the Python type of the object that load_dataset returned for the XSum train split
type(train)

In [None]:
# 'id' is an identifier, not text, so it is left out of the word-count analysis.
# Rebuilding the list (instead of list.remove) keeps the cell re-runnable:
# a second .remove('id') raises ValueError.
categorical_features = [feat for feat in categorical_features if feat != 'id']

In [None]:
# Whitespace-separated word count of every text column (article and highlights),
# collected into one dataframe with a column per feature
df_text_lenght = pd.DataFrame({
    feat: cnn_train[feat].apply(lambda text: len(str(text).split()))
    for feat in categorical_features
})

# Box plot and density curve of the word counts
histogram_boxplot(df_text_lenght, '#89c2e0', '#d500ff', 600, 1000, True, 'Train Dataset')

Unigrams Plot Correlation

In [None]:
# TF-IDF weights of the 15 retained unigrams in the articles, via the shared
# compute_tfidf helper instead of a hand-copied vectorizer block.
df_tfidfvect = compute_tfidf(cnn_train['article'], ngram_range=(1, 1), max_features=15)
# The column holds news articles, so the subtitle says "Article" rather than "Dialogue"
plot_correlation(df_tfidfvect, 'Unigrams', 'Train - Article', 800, 800, 12)

In [None]:
# TF-IDF weights of the 15 retained unigrams in the highlights, via the shared
# compute_tfidf helper instead of a hand-copied vectorizer block.
df_tfidfvect = compute_tfidf(cnn_train['highlights'], ngram_range=(1, 1), max_features=15)
plot_correlation(df_tfidfvect, 'Unigrams', 'Train - Summary', 800, 800, 12)

Bigrams Plot Correlation

In [None]:
# TF-IDF weights of the 15 retained bigrams in the articles, via the shared
# compute_tfidf helper instead of a hand-copied vectorizer block.
df_tfidfvect = compute_tfidf(cnn_train['article'], ngram_range=(2, 2), max_features=15)
# The column holds news articles, so the subtitle says "Article" rather than "Dialogue"
plot_correlation(df_tfidfvect, 'Bigrams', 'Train - Article', 800, 800, 12)

In [None]:
# TF-IDF weights of the 15 retained bigrams in the highlights.
# The previous version had an unclosed bracket (cnn_train['highlights'.fillna(''))
# and did not parse; compute_tfidf selects the column and fills missing values itself.
df_tfidfvect = compute_tfidf(cnn_train['highlights'], ngram_range=(2, 2), max_features=15)
plot_correlation(df_tfidfvect, 'Bigrams', 'Train - Summary', 800, 800, 12)

Trigrams Plot Correlation

In [None]:
# TF-IDF weights of the 15 retained trigrams in the articles, via the shared
# compute_tfidf helper instead of a hand-copied vectorizer block.
df_tfidfvect = compute_tfidf(cnn_train['article'], ngram_range=(3, 3), max_features=15)
# The column holds news articles, so the subtitle says "Article" rather than "Dialogue"
plot_correlation(df_tfidfvect, 'Trigrams', 'Train - Article', 800, 800, 12)

In [None]:
# TF-IDF weights of the 15 retained trigrams in the highlights.
# The previous version had an unclosed bracket (cnn_train['highlights'.fillna(''))
# and did not parse; compute_tfidf selects the column and fills missing values itself.
df_tfidfvect = compute_tfidf(cnn_train['highlights'], ngram_range=(3, 3), max_features=15)
plot_correlation(df_tfidfvect, 'Trigrams', 'Train - Summary', 800, 800, 12)

In [None]:
# Patterns are compiled once instead of on every call. Raw strings avoid the
# invalid "\s" escape sequence that a plain string literal triggers.
_TAG_PATTERN = re.compile(r'<.*?>')
_TRAILING_COLON_PATTERN = re.compile(r'.*:\s*$')


def clean_tags(text):
    """
    Strip markup tags from a text and drop lines that end with a colon.

    Parameters:
    text = String to clean

    Returns the text with every '<...>' span removed and without the lines
    whose last non-whitespace character is ':' (e.g. a speaker label with no
    content after it). The remaining lines are joined with newlines again.
    """
    without_tags = _TAG_PATTERN.sub('', text)

    kept_lines = [
        line for line in without_tags.split('\n')
        if not _TRAILING_COLON_PATTERN.match(line)
    ]
    return '\n'.join(kept_lines)

In [None]:
# Apply clean_tags to every listed text column of a dataframe.
def clean_df(df, cols):
    """
    Clean the given text columns of `df` with clean_tags.

    Parameters:
    df = Dataframe to clean; its columns are overwritten in place
    cols = Names of the columns to clean

    Missing values are replaced by empty strings before cleaning.
    Returns the same dataframe object that was passed in.
    """
    for column in cols:
        filled = df[column].fillna('')
        df[column] = filled.apply(clean_tags)
    return df

In [None]:
# Clean the article and highlights texts of the CNN/DailyMail training dataframe
# (clean_df overwrites the columns of the frame it receives and returns it)
cnn_train = clean_df(cnn_train,['article', 'highlights'])


In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Checkpoint shared by the tokenizer and the TensorFlow seq2seq model
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The output below reports the TF model being initialized from the PyTorch weights
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
# Instantiating Data Collator
# NOTE(review): none of the visible later cells reference `data_collator`; padding
# is applied inside preprocess_function instead — confirm whether this is still needed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

def preprocess_function(examples):
    """
    Tokenize a batch of CNN/DailyMail rows for sequence-to-sequence training.

    Parameters:
    examples = Batch mapping with an "article" list (inputs) and a
               "highlights" list (targets)

    Returns the tokenizer output for the articles, truncated and padded to 256
    tokens, with an extra "labels" entry holding the token ids of the
    highlights (also truncated and padded to 256 tokens).
    """
    articles = list(examples["article"])
    model_inputs = tokenizer(articles, max_length=256, truncation=True, padding="max_length")

    # text_target tokenizes the targets directly; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    targets = tokenizer(text_target=examples["highlights"], max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

# Convert the pandas DataFrame to a Hugging Face Dataset
cnn_train_dataset = Dataset.from_pandas(cnn_train)

# Tokenize the dataset in batches; the original columns are dropped so that only
# the fields returned by preprocess_function remain
tokenized_cnn = cnn_train_dataset.map(preprocess_function, batched=True, remove_columns=cnn_train_dataset.column_names)

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset
import re
import nltk

def preprocess_function(examples):
    """
    Tokenize a batch of XSum rows for sequence-to-sequence training.

    This definition reuses the name of the CNN/DailyMail preprocessing function
    defined in an earlier cell and replaces it from this cell onwards.

    Parameters:
    examples = Batch mapping with a "document" list (inputs) and a
               "summary" list (targets)

    Returns the tokenizer output for the documents, truncated and padded to 256
    tokens, with an extra "labels" entry holding the token ids of the
    summaries (also truncated and padded to 256 tokens).
    """
    documents = list(examples["document"])
    model_inputs = tokenizer(documents, max_length=256, truncation=True, padding="max_length")

    # text_target tokenizes the targets directly; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    targets = tokenizer(text_target=examples["summary"], max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

# Tokenize every XSum split. Each call drops the columns of the split it is
# mapping (val and test previously referenced each other's column lists).
tokenized_xsum_train = train.map(preprocess_function, batched=True, remove_columns=train.column_names)
tokenized_xsum_val = val.map(preprocess_function, batched=True, remove_columns=val.column_names)
tokenized_xsum_test = test.map(preprocess_function, batched=True, remove_columns=test.column_names)

In [None]:
# Convert to TensorFlow datasets

import tensorflow as tf

def convert_to_tf_dataset(tokenized_dataset, batch_size=32, shuffle=True, buffer_size=10000):
    """
    Wrap a tokenized dataset into a batched, prefetched tf.data.Dataset.

    Parameters:
    tokenized_dataset = Tokenized dataset; iterating it must yield one dict of
                        integer sequences per example, and its `features`
                        must list the keys of those dicts
    batch_size = Number of examples per batch
    shuffle = Whether to shuffle the examples before batching
    buffer_size = Size of the shuffle buffer

    Returns a tf.data.Dataset of dicts of int32 tensors.
    """
    def gen():
        for example in tokenized_dataset:
            yield {key: tf.constant(value) for key, value in example.items()}

    # The signature is derived from the dataset being converted, not from the
    # global `tokenized_cnn`, so the function also works for datasets with
    # other keys and does not depend on that variable existing.
    signature = {
        key: tf.TensorSpec(shape=(None,), dtype=tf.int32)
        for key in tokenized_dataset.features
    }

    tf_dataset = tf.data.Dataset.from_generator(gen, output_signature=signature)
    if shuffle:
        tf_dataset = tf_dataset.shuffle(buffer_size=buffer_size)
    tf_dataset = tf_dataset.batch(batch_size)
    tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE)
    return tf_dataset

# CNN/DailyMail training data
tf_cnn = convert_to_tf_dataset(tokenized_cnn)
# XSum train / test / validation splits
tf_xsum = convert_to_tf_dataset(tokenized_xsum_train)
tf_test = convert_to_tf_dataset(tokenized_xsum_test)
tf_val = convert_to_tf_dataset(tokenized_xsum_val)


In [None]:
# Evaluation function
# NOTE: this name shadows the `evaluate` package imported at the top of the
# notebook; it is kept because later cells call evaluate(dataset).
def evaluate(dataset):
    """
    Generate summaries for every batch of `dataset` and score them with ROUGE.

    Parameters:
    dataset = Iterable of batches with "input_ids", "attention_mask" and
              "labels" entries

    Returns a dict with the average F-measure of rouge1, rouge2 and rougeL over
    all examples, or an empty dict when the dataset yields no examples.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    totals = {}
    n_scored = 0

    for batch in dataset:
        outputs = model.generate(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 max_length=128)
        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # -100 marks ignored label positions; map it back to the pad id before decoding
        label_ids = np.asarray(batch["labels"])
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        references = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        for pred, ref in zip(predictions, references):
            # RougeScorer.score takes the reference (target) first, then the prediction
            scores = scorer.score(ref, pred)
            for metric, value in scores.items():
                totals[metric] = totals.get(metric, 0.0) + value.fmeasure
            n_scored += 1

    if n_scored == 0:
        return {}

    # Calculate average scores across all predictions
    return {metric: total / n_scored for metric, total in totals.items()}

In [None]:
# Optimizer applied in the manual CNN/DailyMail training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# Sparse categorical cross-entropy computed on raw logits; referenced by the XSum fine-tuning cell
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


In [None]:
# Training the model on CNN/DailyMail
epochs = 1
batch_size = 32  # informational only: convert_to_tf_dataset already batches by 32

# `tf_cnn` comes out of convert_to_tf_dataset already shuffled and batched, so
# it is consumed as-is. Do not rebind it to another split or call `.batch()`
# on it again: that would train on the wrong data and add an extra batch
# dimension to every tensor.

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for step, batch in enumerate(tf_cnn):
        with tf.GradientTape() as tape:
            outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
            # The model computes the loss itself when `labels` is passed. It
            # may be returned as a non-scalar tensor depending on the
            # transformers version, so reduce it to a scalar.
            loss = tf.reduce_mean(outputs.loss)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        if step % 100 == 0:
            print(f"Step {step}, Loss: {loss.numpy()}")



In [None]:
# Freeze the base model layers for fine-tuning
frozen_layers = model.layers[:-2]  # Freeze all but last two top-level layers
for layer in frozen_layers:
    layer.trainable = False

# `model.layers` lists only top-level layers. If the model exposes two or
# fewer of them, the slice above is empty and nothing is frozen; report that
# instead of letting it pass silently.
if not frozen_layers:
    print(
        f"Warning: model exposes only {len(model.layers)} top-level layer(s); "
        "no layers were frozen, so the set of trainable weights is unchanged."
    )

optimizer_ft = tf.keras.optimizers.Adam(learning_rate=1e-5)  # Lower learning rate for fine tuning
model.compile(optimizer=optimizer_ft)


In [None]:

# Fine-tuning on XSum
epochs = 1
for epoch in range(epochs):
    print(f"Fine-tuning epoch {epoch+1}/{epochs} on XSum")
    for batch in tf_xsum:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        with tf.GradientTape() as tape:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Use the loss the model computes from `labels`, as the
            # CNN/DailyMail training loop does. A Keras loss object is called
            # as (y_true, y_pred), not with labels=/logits= keywords, and it
            # would not skip label positions set to -100.
            loss = tf.reduce_mean(outputs.loss)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer_ft.apply_gradients(zip(gradients, model.trainable_variables))

    # Evaluate on validation set
    eval_scores = evaluate(tf_val)
    print(f"Evaluation metrics after epoch {epoch+1}: {eval_scores}")


In [None]:
# Evaluate on XSum test set
# Runs the notebook's `evaluate` helper over `tf_test` and prints the scores it returns.
eval_scores = evaluate(tf_test)
print(f"Final evaluation scores on XSum test set: {eval_scores}")


In [None]:
# Save the model and the tokenizer into two separate local directories.
# The tokenizer directory is spelled "text_summariztion_tf" (sic); the
# copy-to-Drive and reload cells below use the same spelling, so any rename
# must be applied to all of them together.
model.save_pretrained("text_summarization")
tokenizer.save_pretrained("text_summariztion_tf")

('text_summariztion_tf/tokenizer_config.json',
 'text_summariztion_tf/special_tokens_map.json',
 'text_summariztion_tf/vocab.json',
 'text_summariztion_tf/merges.txt',
 'text_summariztion_tf/added_tokens.json',
 'text_summariztion_tf/tokenizer.json')

In [None]:
# Save text_summarization and text_summariztion_tf to Drive
import shutil

# dirs_exist_ok=True lets this cell be re-run: without it copytree raises
# FileExistsError as soon as the destination directory already exists.
shutil.copytree('/content/text_summarization', '/content/drive/MyDrive/text_summarization', dirs_exist_ok=True)
shutil.copytree('/content/text_summariztion_tf', '/content/drive/MyDrive/text_summariztion_tf', dirs_exist_ok=True)

'/content/drive/MyDrive/text_summariztion_tf'

In [None]:
def create_summarization_pipeline(model_path, tokenizer_path):
    """Load a saved tokenizer and seq2seq model and wrap them in a summarization pipeline.

    Args:
        model_path: Location passed to `TFAutoModelForSeq2SeqLM.from_pretrained`.
        tokenizer_path: Location passed to `AutoTokenizer.from_pretrained`.

    Returns:
        The object returned by `pipeline("summarization", ...)`.
    """
    # Tokenizer is loaded first, then the model, matching the save layout
    # where the two live in separate directories.
    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    loaded_model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)
    return pipeline(
        "summarization",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
    )

# Google Drive locations of the saved model and tokenizer directories.
model_path = "/content/drive/MyDrive/text_summarization"
tokenizer_path = "/content/drive/MyDrive/text_summariztion_tf"

# Build the summarization pipeline from the saved artifacts.
summarizer = create_summarization_pipeline(model_path, tokenizer_path)


# Sample news article used as the input to summarize.
text = "A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years. Craig Eccleston-Todd, 27, was driving home from a night at a pub when he received a text message. As he was reading or replying to it, he veered across the road while driving round a bend and smashed into Rachel Titley’s car coming the other way. Craig Eccleston-Todd, 27 (left) was using his mobile phone when he crashed head-on into the car being driven by Rachel Titley, 28 (right). She died later from her injuries . The head-on crash took place in October 2013. Mr Eccleston-Todd's car was barely recognisable (pictured) Police said Eccleston-Todd had drunk at least three or four pints of beer before getting behind the wheel. He was found guilty of causing death by dangerous driving at Portsmouth Crown Court yesterday."

# Sampling is disabled; min_length / max_length bound the generated summary length.
generated_summary = summarizer(text, max_length=34, min_length=10, do_sample=False)


All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at /content/drive/MyDrive/text_summarization.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.
Device set to use 0


In [None]:
# Show the source text followed by blank separator lines, then the pipeline output.
print('Original Dialogue:\n', text, '\n' * 2, sep='\n')

print('Model-generated Summary:\n', generated_summary, sep='\n')

Original Dialogue:

A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years. Craig Eccleston-Todd, 27, was driving home from a night at a pub when he received a text message. As he was reading or replying to it, he veered across the road while driving round a bend and smashed into Rachel Titley’s car coming the other way. Craig Eccleston-Todd, 27 (left) was using his mobile phone when he crashed head-on into the car being driven by Rachel Titley, 28 (right). She died later from her injuries . The head-on crash took place in October 2013. Mr Eccleston-Todd's car was barely recognisable (pictured) Police said Eccleston-Todd had drunk at least three or four pints of beer before getting behind the wheel. He was found guilty of causing death by dangerous driving at Portsmouth Crown Court yesterday.



Model-generated Summary:

[{'summary_text': 'A drunk driver who crashed head-on into a car being driven by a woman who died lat