In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **1. Data Loading and Inspection**

In [3]:
file = pd.read_csv("/content/drive/MyDrive/GENAI/Week6/Day5/tennis_articles.csv", encoding='ISO-8859-1')
df = pd.DataFrame(file)
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP)  Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   article_id     8 non-null      int64 
 1   article_title  8 non-null      object
 2   article_text   8 non-null      object
 3   source         8 non-null      object
dtypes: int64(1), object(3)
memory usage: 388.0+ bytes


In [5]:
df.drop(columns=['article_title'], inplace=True)
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP)  Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


# **2. Sentence Tokenization**

In [6]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')
sentences = df['article_text'].apply(sent_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [7]:
all_sentences = [sentence for article in sentences for sentence in article]

In [8]:
all_sentences[:20]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.",
 "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl.",
 "I say my hellos, but I'm not sending any players flowers as well.",
 "Uhm, I'm not really friendly or close to many players.",
 "I have not a lot of friends away from the courts.'",
 'When she said she is not really close to a lot of players, is that something strategic that she is doing?',
 "Is it different on the men's tour than the women's tour?",
 "'No, not at

# **3. Download and Load GloVe Word Embeddings**

In [9]:
!unzip "/content/drive/MyDrive/GENAI/Week6/Day5/Nouveau dossier.zip"

Archive:  /content/drive/MyDrive/GENAI/Week6/Day5/Nouveau dossier.zip
   creating: Nouveau dossier/
  inflating: Nouveau dossier/glove.6B.100d.txt  


In [10]:
glove_path = '/content/Nouveau dossier/glove.6B.100d.txt'

glove_embeddings = dict()

with open(glove_path, encoding='utf8') as f:
  for line in f:
    values = line.strip().split()
    word = values[0]
    vector = [float(val) for val in values[1:]]  # les 100 dimensions
    glove_embeddings[word] = vector

# **4. Text Cleaning and Normalization**

In [11]:
import re
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  text = text.lower()
  words = text.split()
  words = [word for word in words if word not in stop_words]
  return ' '.join(words)

cleaned_sentences = [clean_text(sentence) for sentence in all_sentences]
cleaned_sentences[0:5]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['maria sharapova basically friends tennis players wta tour',
 'russian player problems openly speaking recent interview said dont really hide feelings much',
 'think everyone knows job',
 'im courts im court playing im competitor want beat every single person whether theyre locker room across net',
 'im one strike conversation weather know next minutes go try win tennis match']

# **5. Sentence Vectorization**

In [15]:
embedding_dim = 100

sentence_vectors = list()

for sentence in cleaned_sentences:
  words = sentence.split()
  if len(words) == 0:
        sentence_vectors.append(np.zeros(embedding_dim))
        continue
  word_vectors = list()
  for word in words:
    if word in glove_embeddings:
      word_vectors.append(glove_embeddings[word])
    else:
      word_vectors.append(np.zeros(embedding_dim))
  sentence_vector = np.mean(word_vectors, axis=0)
  sentence_vectors.append(sentence_vector)

# **6. Similarity Matrix Construction**

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

sentence_matrix = np.array(sentence_vectors)
similarity_matrix = cosine_similarity(sentence_matrix)

In [18]:
similarity_matrix

array([[1.        , 0.64269711, 0.59156992, ..., 0.83326988, 0.67561618,
        0.56470363],
       [0.64269711, 1.        , 0.85573618, ..., 0.72950413, 0.83842342,
        0.71960634],
       [0.59156992, 0.85573618, 1.        , ..., 0.67785676, 0.83229937,
        0.6637378 ],
       ...,
       [0.83326988, 0.72950413, 0.67785676, ..., 1.        , 0.7565927 ,
        0.64456643],
       [0.67561618, 0.83842342, 0.83229937, ..., 0.7565927 , 1.        ,
        0.74560147],
       [0.56470363, 0.71960634, 0.6637378 , ..., 0.64456643, 0.74560147,
        1.        ]])

# **7. Graph Construction and Sentence Ranking**

In [19]:
import networkx as nx

graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(graph)

# **8. Summarization**

In [23]:
ranked_sentences = sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [24]:
ranked_sentences[:10]

[(119, 0.008623834694010695),
 (39, 0.008549519110496923),
 (4, 0.008547759925602826),
 (66, 0.008525451494626936),
 (69, 0.008508264840387606),
 (32, 0.008506149859986748),
 (106, 0.008480298549092737),
 (56, 0.00841355671195386),
 (40, 0.008407758276268449),
 (102, 0.00839972099564333)]

In [26]:
top_sentence_indices = [idx for idx, _ in ranked_sentences[:10]]

In [27]:
top_sentence_indices

[119, 39, 4, 66, 69, 32, 106, 56, 40, 102]

In [29]:
print("Résumé :\n")
for idx in top_sentence_indices:
    print(f"- {all_sentences[idx]}")

Résumé :

- I was on a nice trajectorythen, Reid recalled.If I hadnt got sick, I think I could have started pushing towards the second week at the slams and then who knows. Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
- Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
- So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
- I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.
- I just felt like it really kind of changed where people were a little bit, definitely in the '90s, a lot more quiet, into themselves, and then it started to become better. Meanwhile, Federer is h