In [13]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/cnn_train.csv",nrows=1000)

In [15]:
data.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [16]:
word_embeddings = {}
f = open('/content/drive/MyDrive/Colab Notebooks/glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [17]:
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def generate_summary(article):
    sentences = sent_tokenize(article)
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ",regex=False)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((300,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((300,))
        sentence_vectors.append(v)

    sim_mat = np.zeros([len(sentences), len(sentences)])
    from sklearn.metrics.pairwise import cosine_similarity
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,300), sentence_vectors[j].reshape(1,300))[0,0]

    import networkx as nx
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    summarize_text = []
    for i in range(3):
        summarize_text.append(ranked_sentences[i][1])
    return '. '.join(summarize_text)


In [19]:
# generate summary for each article
for i in range(len(data)):
    article = data['article'][i]
    summary = generate_summary(article)
    print('Article', i+1, 'Summary:')
    print(summary)
    print('') # blank line between summaries

Article 1 Summary:
UPDATED: .. PUBLISHED: .. By .

Article 2 Summary:
Mata has worked for the Miami-Dade Police Department since 1992, including directing investigations in Miami Gardens and working as a lieutenant in the K-9 unit at Miami International Airport, according to the complaint.. A criminal complaint unsealed in U.S. District Court in New Jersey Tuesday accuses Mata, also known as "The Milk Man," of using his role as a police officer to help the drug trafficking organization in exchange for money and gifts, including a Rolex watch.. "Ultimately, the (organization) decided not to move forward with the murder plot, but Mata still received a payment for setting up the meetings," federal prosecutors said in a statement.

Article 3 Summary:
Eccleston-Todd was found guilty of causing death by dangerous driving following a trial at Portsmouth Crown Court (pictured) He added: 'Mr Eccleston-Todd will now spend six years behind bars, but Rachel's family have lost her forever.. 'The an

In [20]:
!pip install rouge
from rouge import Rouge
rouge = Rouge()
for index, row in data.iterrows():
    article = row['article']
    highlights = row['highlights']
    scores = rouge.get_scores(summary, highlights)
    print("Article", index+1)
    print("ROUGE-1:", scores[0]['rouge-1'])
    print("ROUGE-2:", scores[0]['rouge-2'])
    print("ROUGE-L:", scores[0]['rouge-l'])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Article 1
ROUGE-1: {'r': 0.18181818181818182, 'p': 0.13953488372093023, 'f': 0.1578947319286705}
ROUGE-2: {'r': 0.030303030303030304, 'p': 0.0196078431372549, 'f': 0.023809519039116604}
ROUGE-L: {'r': 0.15151515151515152, 'p': 0.11627906976744186, 'f': 0.13157894245498633}
Article 2
ROUGE-1: {'r': 0.06060606060606061, 'p': 0.046511627906976744, 'f': 0.05263157403393398}
ROUGE-2: {'r': 0.0, 'p': 0.0, 'f': 0.0}
ROUGE-L: {'r': 0.06060606060606061, 'p': 0.046511627906976744, 'f': 0.05263157403393398}
Article 3
ROUGE-1: {'r': 0.12280701754385964, 'p': 0.16279069767441862, 'f': 0.13999999509800015}
ROUGE-2: {'r': 0.031746031746031744, 'p': 0.0392156862745098, 'f': 0.03508771435364797}
ROUGE-L: {'r': 0.12280701754385964, 'p': 0.16279069767441862, 'f': 0.13999999509800015}
Article 4
ROUGE-1: {'r': 0.11363636363636363, 'p': 0.11627906976744186, 'f': 0.11494252373629299}
ROUGE-2: {'r': 0.0212765957

In [21]:
# Calculate average ROUGE scores across all articles
rouge_1_scores = [s['rouge-1']['f'] for s in scores]
rouge_2_scores = [s['rouge-2']['f'] for s in scores]
rouge_l_scores = [s['rouge-l']['f'] for s in scores]

overall_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
overall_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
overall_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print("Overall ROUGE-1: {:.2f}".format(overall_rouge_1))
print("Overall ROUGE-2: {:.2f}".format(overall_rouge_2))
print("Overall ROUGE-L: {:.2f}".format(overall_rouge_l))

Overall ROUGE-1: 0.16
Overall ROUGE-2: 0.02
Overall ROUGE-L: 0.16


In [30]:
text = "As Australian Fashion Week comes to a close, a new damning report has named and shamed some of the worst clothing brands sold in Australia and their companies, for the ongoing exploitation of their overseas workers. Lowes, Industrie, Best & Less and the Just Group - which includes Just Jeans, Portmans and Dotti - were identified as some of the worst performing companies by The 2015 Australian Fashion Report. Amongst the best performers were Etiko, Audrey Blue, Cotton On, H&M and Zara. The report assessed the labour rights management systems of 59 companies and 219 brands operating in Australia. The 2015 Australian Fashion Report has named and shamed some of the worst Aussie clothing brands and companies for their ongoing exploitation of overseas workers . Amongst the best performers were Etiko, Audrey Blue, Cotton On, H&M and Zara . It found that only two of the companies could prove they were paying a full living wage to the workers in two of the three production stages of their clothing. None of the 59 companies could prove the workers at their raw material suppliers were paid a living wage. Unlike a country's legally set minimum wage, a living wage ensures that an employee has enough money to cover the necessities - like food, water, electricity and shelter - and still has a little left over for themselves and their dependants. In some countries like Bangladesh, where the minimum wage is as little as US$68 a month and a living wage is US$104, the difference can be made by paying each worker just an additional 30c per t-shirt. Lowes, Industrie, Best & Less and the Just Group - which includes Just Jeans, Portmans and Dotti - were identified as some of the worst performers . 'The whole point in our reporting scorecard is if these companies don't have rigours systems in place to mitigate against those risks then you can't be sure that there is no forced labour or child labour in their supply chain,' Gershon Nimbalker, an advocacy manager at Baptist World Aid, said . 'A mere 12 per cent of companies could demonstrate any action towards paying wages above the legal minimum, and even then, only for part of their supply chain,' the report states. 'Furthermore, 91 per cent of companies still don't know where all their cotton comes from and 75 per cent don't know the source of all their fabrics and inputs. 'If companies don't know how and where their products are made, then there's no way for them to ensure that their workers are protected.' Uzbekistan for instance, the world's fifth largest exporter of cotton, was notorious for its  child labour policies which saw children as young as 10 forced to work in the fields until the government recently  improved conditions by renouncing the use of child labour 'on a systematic basis'. 'Furthermore, 91 per cent of companies still don't know where all their cotton comes from and 75 per cent don't know the source of all their fabrics and inputs,' the report stated . Gershon Nimbalker, an advocacy manager at Baptist World Aid told Daily Mail Australia that part of the motivation behind the report was to shed light on how many of the world's 165 million children involved in child labour were employed by the fashion industry. 'The whole point in our reporting scorecard is if these companies don't have rigours systems in place to mitigate against those risks then you can't be sure that there is no forced labour or child labour in their supply chain,' he said. 'There were 61 assessment criteria that we used to grade the companies that were put together with lots of collaboration with international labour rights organisations. 'We found all the public information available on the companies - public statements, anything online - and compiled and assessed it before sending a copy to the company and asking for feedback or asking them to tell us what we missed. The report comes almost two years after over 1,100 Bangladeshi garment workers died when the Rana Plaza factory collapsed in Bangladesh due to building safety problems . 'We found all the public information available on the companies - public statements, anything online - and compiled and assessed it before sending a copy to the company and asking for feedback or asking them to tell us what we missed. 'The worst grades basically mean that they have very little public information available about what they're doing to protect workers and on top of that that they haven't engaged with our research process. About 75 per cent did engage, but one quarter didn't.' However the report also noted some progress in the industry with companies like Kmart and Cotton On improving their transparency by identifying their suppliers, and H&M, Zara, Country Road and the Sussan Group showing attempts to improve their international worker's pay."
summary_text = generate_summary(text)

In [31]:
print(summary_text)

'The worst grades basically mean that they have very little public information available about what they're doing to protect workers and on top of that that they haven't engaged with our research process.. It found that only two of the companies could prove they were paying a full living wage to the workers in two of the three production stages of their clothing.. 'A mere 12 per cent of companies could demonstrate any action towards paying wages above the legal minimum, and even then, only for part of their supply chain,' the report states.


In [32]:
original_summary = "Australian Fashion Report revealed the Australian-sold brands and companies that ignore the exploitation of their overseas workers .Lowes, Industrie, Best & Less and the Just Group - which includes Just Jeans, Portmans and Dotti - were  some of the worst performers.Etiko, Audrey Blue, Cotton On, H&M and Zara had some of the best scores.75 per cent of companies don't know the source of all their fabrics and inputs ."
scores = rouge.get_scores(summary_text, original_summary)
print("ROUGE-1:", scores[0]['rouge-1'])
print("ROUGE-2:", scores[0]['rouge-2'])
print("ROUGE-L:", scores[0]['rouge-l'])

ROUGE-1: {'r': 0.21153846153846154, 'p': 0.15492957746478872, 'f': 0.1788617837371936}
ROUGE-2: {'r': 0.07462686567164178, 'p': 0.056818181818181816, 'f': 0.06451612412403783}
ROUGE-L: {'r': 0.19230769230769232, 'p': 0.14084507042253522, 'f': 0.16260162113556761}
