In [None]:
# This notebook is for the collection of data for the Deepseek topic news corpus

In [3]:
from pygooglenews import GoogleNews
import pandas as pd
from datetime import datetime, timedelta
import time


In [None]:

def get_news(search, days=21):
    """ we decided to use the pygooglenews library to get the news data, although with a limit of 100 articles per call"""
    stories = []
    
    end_date = datetime.today()
    start_date = end_date - timedelta(days=days)
    date_list = pd.date_range(start=start_date, end=end_date).tolist()

    for date in date_list[:-1]:
        from_date = date.strftime('%Y-%m-%d')
        to_date = (date + timedelta(days=1)).strftime('%Y-%m-%d')

        query = f"{search} after:{from_date} before:{to_date}"
        
        print(f"Fetching news from {from_date} to {to_date}...")

        try:
            search_results = gn.search(query)  
            newsitems = search_results.get('entries', [])

            for item in newsitems:
                story = {
                    'title': item.title,
                    'link': item.link,
                    'published': item.published
                }
                stories.append(story)
            
            print(f"Found {len(newsitems)} articles for {from_date}")
        except Exception as e:
            print(f"Error on {from_date}: {e}")

        time.sleep(1)  # avoid rate limit

    return stories

In [None]:
# Initialize the GoogleNews object, keyword is DeepSeek
# we set the date range to 21 days, to cover the period since the release of the DeepSeek R1(which made DS went viral) till the day of the research
news_results = get_news("DeepSeek", days=21)


In [None]:
# we save the results to a text file

with open("../corpus/pygooglenews_results.txt", "w", encoding="utf-8") as f:
    for news in news_results:
        f.write(f"{news['title']}\n{news['link']}\nPublished: {news['published']}\n\n")

print(f"Saved {len(news_results)} articles'")

In [None]:
# now, with the date, title and link(redirection) of the news articles, we can proceed to the next step : scraping the content of the articles

filtered_news = []
with open("../corpus/pygooglenews_results.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()


# we only keep the articles that contain "DeepSeek" in the title
i = 0
while i < len(lines):
    title = lines[i].strip().lower()  
    if "deepseek" in title:
        filtered_news.append(lines[i])    
        filtered_news.append(lines[i+1])  
        filtered_news.append(lines[i+2])  
        filtered_news.append("\n")        
    i += 4  # 跳到下一条新闻

# save the filtered news to a new file, still in the same structure
with open("../corpus/filtered_deepseek_news.txt", "w", encoding="utf-8") as f:
    f.writelines(filtered_news)



In [None]:
# all the urls we got from the pygooglenews library are false because they redirect to the real news page

# We tried to use the newspaper3k library to scrape the content of the articles, but it was not working properly
# We tried NewsAPI as well as Google News API, but encountered daily limit issues

# so we did it partially manually, by automatically opening the links in the browser, then gathering all the tabs' URLs by a browser extension called OneTab.'
# then we saved the URLs to a text file called "final_manuel.txt"

import webbrowser
import time

with open("../corpus/filtered_deepseek_news.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f.readlines() if line.startswith("http")]

batch_size = 100  # open 100 links at a time
total_urls = len(lines)

for start in range(0, total_urls, batch_size):
    end = min(start + batch_size, total_urls)  
    
    for i in range(start, end):
        webbrowser.open(lines[i])  
        time.sleep(2)  
    
    if end < total_urls:
        input("\n Press Enter to open the next batch of links...")
    



---

In [4]:
# now we have the real URLs saved in the file "onetab_all_urls.txt", we organize them by deleting duplicates and saving them to a dataframe


real_urls_data = []
real_urls_file = "onetab_all_urls.txt"

with open(real_urls_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(" | ")  
        if len(parts) >= 2:  
            real_urls_data.append([parts[0], parts[1]])


# 
real_urls_df = pd.DataFrame(real_urls_data, columns=["URL", "Title"]).drop_duplicates(subset="Title")


In [5]:
real_urls_df

Unnamed: 0,URL,Title
0,https://www.techtarget.com/whatis/feature/Deep...,DeepSeek explained: Everything you need to know
1,https://venturebeat.com/ai/open-source-deepsee...,Open-source DeepSeek-R1 uses pure reinforcemen...
2,https://siliconangle.com/2025/01/20/deepseek-o...,DeepSeek open-sources its R1 reasoning model s...
3,https://medium.com/data-science-in-your-pocket...,DeepSeek-R1 vs DeepSeek-R1-Zero. DeepSeek’s ne...
4,https://analyticsindiamag.com/ai-news-updates/...,DeepSeek Crushes OpenAI o1 with an MIT-License...
...,...,...
1504,https://newscentral.africa/us-lawmakers-seek-d...,US Lawmakers Seek DeepSeek Ban on Government D...
1505,https://ipdefenseforum.com/2025/02/prcs-ai-ass...,"PRC’s AI assistant, DeepSeek, censors informat..."
1506,https://www.baltimoresun.com/2025/02/07/lawmak...,baltimoresun.com/2025/02/07/lawmakers-push-to-...
1507,https://www.androidheadlines.com/2025/02/deeps...,DeepSeek and other Chinese AIs might be banned...


In [4]:
real_urls_df.to_csv("real_urls.csv", index=False)

In [6]:
# now we extract the Title, URL and Date from the filtered_deepseek_news.txt file, and save them to a dataframe so we can merge the date with the real URLs
original_file = "filtered_deepseek_news.txt"
with open(original_file, "r", encoding="utf-8") as f:
    news_blocks = f.read().strip().split("\n\n")  

# 
news_data = []
for news in news_blocks:
    lines = news.split("\n")
    if len(lines) >= 3:
        title = lines[0].strip()
        url = lines[1].strip()
        date = lines[2].replace("Published: ", "").strip()
        news_data.append([title, url, date])

news_df = pd.DataFrame(news_data, columns=["Title", "URL", "Date"])

In [7]:
news_df

Unnamed: 0,Title,URL,Date
0,DeepSeek Faces Global Scrutiny as Governments ...,https://news.google.com/rss/articles/CBMioAFBV...,"Sun, 19 Jan 2025 08:00:00 GMT"
1,DeepSeek explained: Everything you need to kno...,https://news.google.com/rss/articles/CBMikgFBV...,"Mon, 20 Jan 2025 08:00:00 GMT"
2,Open-source DeepSeek-R1 uses pure reinforcemen...,https://news.google.com/rss/articles/CBMiuwFBV...,"Mon, 20 Jan 2025 08:00:00 GMT"
3,DeepSeek open-sources its R1 reasoning model s...,https://news.google.com/rss/articles/CBMijAFBV...,"Mon, 20 Jan 2025 08:00:00 GMT"
4,DeepSeek-R1 vs DeepSeek-R1-Zero - Medium,https://news.google.com/rss/articles/CBMilgFBV...,"Mon, 20 Jan 2025 08:00:00 GMT"
...,...,...,...
1554,"PRC’s AI assistant, DeepSeek, censors informat...",https://news.google.com/rss/articles/CBMinAFBV...,"Fri, 07 Feb 2025 17:19:06 GMT"
1555,Lawmakers push to ban DeepSeek on government d...,https://news.google.com/rss/articles/CBMiuwFBV...,"Fri, 07 Feb 2025 21:20:47 GMT"
1556,DeepSeek and other Chinese AIs might be banned...,https://news.google.com/rss/articles/CBMipAFBV...,"Fri, 07 Feb 2025 16:35:32 GMT"
1557,Lawmakers push to ban DeepSeek on government d...,https://news.google.com/rss/articles/CBMi9wFBV...,"Fri, 07 Feb 2025 20:46:41 GMT"


In [8]:

from collections import Counter
from rapidfuzz import process, fuzz


In [10]:
assert isinstance(news_df, pd.DataFrame)
assert isinstance(real_urls_df, pd.DataFrame)


# Link the real URLs with the news data by matching the titles
news_titles = news_df["Title"].tolist()

def fuzzy_match_title(title, choices, threshold=70):

    match = process.extractOne(title, choices, scorer=fuzz.partial_ratio)
    if match and match[1] >= threshold:  
        return match[0]  
    return None

# perform the fuzzy matching
real_urls_df["Matched_Title"] = real_urls_df["Title"].apply(lambda x: fuzzy_match_title(x, news_titles))

# THE most important step, merge the real URLs with the news data which contains the date
real_urls_df = real_urls_df.merge(news_df[['Title', 'Date']], left_on='Matched_Title', right_on='Title', how='left')



# drop the unnecessary columns
real_urls_df.drop(columns=['Matched_Title', 'Title_y'], inplace=True)
real_urls_df.rename(columns={'Title_x': 'Title'}, inplace=True)

In [12]:
real_urls_df

Unnamed: 0,URL,Title,Date
0,https://www.techtarget.com/whatis/feature/Deep...,DeepSeek explained: Everything you need to know,"Mon, 20 Jan 2025 08:00:00 GMT"
1,https://www.techtarget.com/whatis/feature/Deep...,DeepSeek explained: Everything you need to know,"Mon, 20 Jan 2025 08:00:00 GMT"
2,https://venturebeat.com/ai/open-source-deepsee...,Open-source DeepSeek-R1 uses pure reinforcemen...,"Mon, 20 Jan 2025 08:00:00 GMT"
3,https://venturebeat.com/ai/open-source-deepsee...,Open-source DeepSeek-R1 uses pure reinforcemen...,"Mon, 20 Jan 2025 08:00:00 GMT"
4,https://siliconangle.com/2025/01/20/deepseek-o...,DeepSeek open-sources its R1 reasoning model s...,"Mon, 20 Jan 2025 08:00:00 GMT"
...,...,...,...
1505,https://newscentral.africa/us-lawmakers-seek-d...,US Lawmakers Seek DeepSeek Ban on Government D...,"Fri, 07 Feb 2025 17:51:23 GMT"
1506,https://ipdefenseforum.com/2025/02/prcs-ai-ass...,"PRC’s AI assistant, DeepSeek, censors informat...","Fri, 07 Feb 2025 17:19:06 GMT"
1507,https://www.baltimoresun.com/2025/02/07/lawmak...,baltimoresun.com/2025/02/07/lawmakers-push-to-...,"Fri, 07 Feb 2025 20:47:53 GMT"
1508,https://www.androidheadlines.com/2025/02/deeps...,DeepSeek and other Chinese AIs might be banned...,"Fri, 07 Feb 2025 16:35:32 GMT"


In [14]:
# drop the duplicates
real_urls_df = real_urls_df.drop_duplicates(subset=["URL"]).drop_duplicates(subset=["Title"])


In [17]:
real_urls_df

Unnamed: 0,URL,Title,Date
0,https://www.techtarget.com/whatis/feature/Deep...,DeepSeek explained: Everything you need to know,2025-01-20
2,https://venturebeat.com/ai/open-source-deepsee...,Open-source DeepSeek-R1 uses pure reinforcemen...,2025-01-20
4,https://siliconangle.com/2025/01/20/deepseek-o...,DeepSeek open-sources its R1 reasoning model s...,2025-01-20
6,https://medium.com/data-science-in-your-pocket...,DeepSeek-R1 vs DeepSeek-R1-Zero. DeepSeek’s ne...,2025-01-20
8,https://analyticsindiamag.com/ai-news-updates/...,DeepSeek Crushes OpenAI o1 with an MIT-License...,2025-01-20
...,...,...,...
1505,https://newscentral.africa/us-lawmakers-seek-d...,US Lawmakers Seek DeepSeek Ban on Government D...,2025-02-07
1506,https://ipdefenseforum.com/2025/02/prcs-ai-ass...,"PRC’s AI assistant, DeepSeek, censors informat...",2025-02-07
1507,https://www.baltimoresun.com/2025/02/07/lawmak...,baltimoresun.com/2025/02/07/lawmakers-push-to-...,2025-02-07
1508,https://www.androidheadlines.com/2025/02/deeps...,DeepSeek and other Chinese AIs might be banned...,2025-02-07


In [19]:
# convert the date to a standard format
real_urls_df["Date"] = pd.to_datetime(real_urls_df["Date"]).dt.strftime("%Y-%m-%d")

# drop the rows with missing dates
real_urls_df = real_urls_df.dropna(subset=["Date"])

In [20]:
real_urls_df.to_csv("final_data_no_content.csv", index=False)

Now we scrap

In [25]:
from tqdm import tqdm

def fetch_web_content_with_status(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        status_code = response.status_code  
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = soup.find_all("p")  
            content = "\n".join([p.get_text() for p in paragraphs if p.get_text().strip()])
            return content if content else "No content found", status_code
        else:
            return f"Error: {response.status_code}", status_code

    except requests.RequestException as e:
        return f"Error: {str(e)}", None


In [36]:
df = real_urls_df.copy().reset_index(drop=True)

df["Content"] = ""
df["Status_Code"] = None

for i in tqdm(range(len(df)), desc="Fetching Web Content", unit="page"):
    url = df.loc[i, "URL"]
    content, status = fetch_web_content_with_status(url)
    df.loc[i, "Content"] = content
    df.loc[i, "Status_Code"] = status

Fetching Web Content: 100%|██████████| 1150/1150 [08:49<00:00,  2.17page/s]


In [27]:

df["Date"] = pd.to_datetime(df["Date"], errors="coerce")


df = df.sort_values(by="Date", ascending=True)


output_filename = "deepseek_news_corpus.csv"
df.to_csv(output_filename, index=False, encoding="utf-8")


Fetching Web Content (Test Run): 100%|██████████| 10/10 [00:01<00:00,  6.22page/s]


In [46]:
df = df.drop(columns=["Validation", "Validation_Flag"], errors="ignore")

Validation of the dataset

In [52]:
# we add a column to check the quality of the data
df["Validation"] = 1


df.loc[df["Content"].str.len() < 100, "Validation"] = 0
df.loc[df["Status_Code"] != 200, "Validation"] = 0
df.loc[~df["Content"].str.contains("DeepSeek", case=False, na=False), "Validation"] = 0


df = df[df["Validation"] == 1].reset_index(drop=True)

In [56]:
df.to_csv("deepseek_news_corpus.csv", index=False, encoding="utf-8")

In [57]:
corpus = pd.read_csv("deepseek_news_corpus.csv")

In [58]:
corpus

Unnamed: 0,URL,Title,Date,Content,Status_Code,Validation
0,https://www.techtarget.com/whatis/feature/Deep...,DeepSeek explained: Everything you need to know,2025-01-20,"In the world of AI, there has been a prevailin...",200,1
1,https://venturebeat.com/ai/open-source-deepsee...,Open-source DeepSeek-R1 uses pure reinforcemen...,2025-01-20,Join our daily and weekly newsletters for the ...,200,1
2,https://siliconangle.com/2025/01/20/deepseek-o...,DeepSeek open-sources its R1 reasoning model s...,2025-01-20,\nUPDATED 17:12 EST / JANUARY 20 2025\n\n\n\n\...,200,1
3,https://medium.com/data-science-in-your-pocket...,DeepSeek-R1 vs DeepSeek-R1-Zero. DeepSeek’s ne...,2025-01-20,Sign up\nSign in\nSign up\nSign in\nMehul Gupt...,200,1
4,https://analyticsindiamag.com/ai-news-updates/...,DeepSeek Crushes OpenAI o1 with an MIT-License...,2025-01-20,"DeepSeek, a Chinese AI research lab backed by ...",200,1
...,...,...,...,...,...,...
832,https://www.techloy.com/gemini-2-0-flash-is-go...,Gemini 2.0 Flash is Google's latest response t...,2025-02-07,\n\n\nSuccess! Now Check Your Email\n\nTo comp...,200,1
833,https://newscentral.africa/us-lawmakers-seek-d...,US Lawmakers Seek DeepSeek Ban on Government D...,2025-02-07,A new bill to be introduced in the U.S. Congre...,200,1
834,https://ipdefenseforum.com/2025/02/prcs-ai-ass...,"PRC’s AI assistant, DeepSeek, censors informat...",2025-02-07,Voice of America\nUsers of the People’s Republ...,200,1
835,https://www.androidheadlines.com/2025/02/deeps...,DeepSeek and other Chinese AIs might be banned...,2025-02-07,Sign Up! \n\nenvelope_alt\n\n\n\nGet the lates...,200,1
