In [None]:
import os
import time
from datetime import datetime

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup, Comment
from google.colab import drive

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Wait until the Drive folder is visible. Poll with a short sleep instead of a
# tight empty loop (which pins a CPU core), and give up after one minute so a
# failed mount produces an error instead of hanging the notebook forever.
mount_deadline = time.monotonic() + 60
while not os.path.exists('/content/drive/My Drive/'):
    if time.monotonic() > mount_deadline:
        raise TimeoutError("Google Drive did not appear at '/content/drive/My Drive/' within 60 seconds")
    time.sleep(0.5)

Mounted at /content/drive


In [None]:
# Google Drive folder where the CSV files of this notebook are written and read back.
# The value ends with '/' because several cells build paths as `folder_path + file_name`.
folder_path = '/content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/Mailing Lists/data/'

# The folder is expected to exist already; uncomment the next line to create it when missing.
#os.makedirs(folder_path, exist_ok=True)

In [None]:
# Helper to save a DataFrame as a timestamped CSV file in Google Drive
def save_dataframe_to_csv(dataframe, folder_path, file_name, **to_csv_kwargs):
    """
    Save a DataFrame to a CSV file with a timestamped file name.

    Args:
    - dataframe: pandas DataFrame to be saved
    - folder_path: path to the folder where the CSV file will be saved
      (the folder is not created here, it must already exist)
    - file_name: base name of the file without extension; the timestamp
      (Europe/Paris time, 'YYYY_MM_DD_HH_MM_SS') and '.csv' are appended to it
    - **to_csv_kwargs: optional keyword arguments forwarded to DataFrame.to_csv
      (for example `escapechar` or `sep`); `index` defaults to False

    Returns:
    - file_path: full path to the saved CSV file
    """
    # Ask directly for the current time as an aware Europe/Paris datetime rather
    # than converting a naive local datetime afterwards
    current_time_local = datetime.now(pytz.timezone('Europe/Paris'))

    # Format the timestamp with date, hour, minute, and seconds
    timestamp = current_time_local.strftime('%Y_%m_%d_%H_%M_%S')

    # Generate the timestamped file name and its full path
    file_path = os.path.join(folder_path, f'{file_name}_{timestamp}.csv')

    # Keep the historical default of not writing the index unless the caller asks for it
    to_csv_kwargs.setdefault('index', False)
    dataframe.to_csv(file_path, **to_csv_kwargs)

    return file_path

In [None]:
# Function to collect the monthly archive links of a mailing list
def scrape_debian_package_archives(url, first_year=2003, last_year=2023, timeout=30):
    """
    Collect the per-month archive links listed on a mailing list index page.

    Args:
    - url: index page of the mailing list; the relative links found on the page
      are appended to this value, so it should end with '/'
    - first_year, last_year: inclusive range of years to keep
    - timeout: seconds to wait for the HTTP response

    Returns:
    - list of dicts with the keys 'Year', 'Month' and 'Link'; in each link
      'threads.html' is replaced by 'maillist.html' (the by-date index)

    Raises:
    - requests.HTTPError: the index page answered with an error status
    - ValueError: the page has no <div id="content"> to read the links from
    """
    table_data = []

    # Fetch the index page; fail loudly on an HTTP error instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    content_div = soup.find('div', id='content')
    if content_div is None:
        raise ValueError(f'No <div id="content"> found at {url}')

    # Each div with class="index_include" holds one <ul> whose <li> items are years
    for include_div in content_div.find_all('div', class_='index_include'):
        ul_tag = include_div.find('ul')
        if ul_tag is None:
            continue

        for li in ul_tag.find_all('li'):
            # The year is the first 4 characters of the item text
            year_text = li.get_text(strip=True)[:4]

            # Decide once per year (not once per link) whether the item is wanted;
            # items that do not start with a year are ignored
            if not year_text.isdecimal() or not first_year <= int(year_text) <= last_year:
                continue

            # Every <a> inside the item is one month of that year
            for a in li.find_all('a'):
                link = a.get('href')
                if not link:
                    continue
                month = a.get_text(strip=True)
                link_by_date = link.replace("threads.html", "maillist.html")
                table_data.append({'Year': year_text, 'Month': month, 'Link': url + link_by_date})

    return table_data

In [None]:
# Function to collect the list of emails of a mailing list: visits every monthly link and collects date, sender, subject and the link to the content
def extract_maillist(link_of_mailing_lists, timeout=30):
    """
    Collect the e-mails listed on every monthly archive page.

    Args:
    - link_of_mailing_lists: DataFrame with the columns 'Link' (URL of a monthly
      'maillist.html' page) and 'Year'
    - timeout: seconds to wait for each HTTP response

    Returns:
    - DataFrame with the columns 'Date', 'Sender', 'Topic' and 'Link', one row
      per e-mail; it is empty (but has these columns) when nothing was found
    """
    records = []

    for _, row in link_of_mailing_lists.iterrows():
        url = row['Link']
        year = row['Year']
        clean_url = url.replace("maillist.html", "")

        response = requests.get(url, timeout=timeout)
        if not response.ok:
            print(f"Could not fetch URL: {url} (HTTP {response.status_code})")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Walk the lists once and give each one the date of the nearest <p> before it.
        # Pairing every <p> with *all* following <ul> tags would repeat each e-mail
        # once for every earlier date paragraph on the page.
        for ul_tag in soup.find_all('ul'):
            # Items of nested lists are already reached through the outermost list
            if ul_tag.find_parent('ul') is not None:
                continue

            # Lists that appear before the first <p> carry no date and are not e-mail lists
            p_tag = ul_tag.find_previous('p')
            if p_tag is None:
                continue

            date = p_tag.get_text(strip=True)[:6]
            full_date = date + ' ' + str(year)

            for li_tag in ul_tag.find_all('li'):
                strong_tag = li_tag.find('strong')
                a_tag = strong_tag.find('a') if strong_tag is not None else None
                em_tag = li_tag.find('em')

                # Skip items that do not have the subject-link / sender structure
                if a_tag is None or em_tag is None or not a_tag.get('href'):
                    continue

                records.append({
                    'Date': full_date,
                    'Sender': em_tag.get_text(strip=True),
                    'Topic': strong_tag.get_text(strip=True),
                    'Link': clean_url + a_tag['href'],
                })

    # Build the result in one step; the explicit columns keep the shape stable when empty
    return pd.DataFrame(records, columns=['Date', 'Sender', 'Topic', 'Link'])

In [None]:
# Function to collect the content of the emails: receives the DataFrame of subjects and follows the link of each row
def extract_email_content(mailing_list, start_index, end_index, timeout=30):
    """
    Download a slice of the listed e-mails and extract their headers and body.

    Args:
    - mailing_list: DataFrame with a 'Link' column holding the URL of each e-mail page
    - start_index, end_index: positional row range to process (end exclusive, as in iloc)
    - timeout: seconds to wait for each HTTP response

    Returns:
    - DataFrame with one row per e-mail that could be parsed: one column per header
      item found on the page (named after its <em> label), 'Content' with the text
      of the first <pre> tag (when there is one) and 'Link' with the page URL.
      Pages that cannot be fetched or parsed are reported with print and skipped.
    """
    records = []

    # Slice the DataFrame to the specified range of rows
    mailing_list = mailing_list.iloc[start_index:end_index]

    for _, row in mailing_list.iterrows():
        url = row['Link']

        # A single network failure must not throw away everything collected so far
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            print(f"Request failed for URL: {url} ({error})")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the comments that delimit the header section. The start marker text is a
        # prefix of the end marker text, so the end marker is excluded explicitly.
        head_of_message = soup.find(
            string=lambda text: isinstance(text, Comment)
            and 'X-Head-of-Message' in text
            and 'X-Head-of-Message-End' not in text
        )
        head_of_message_end = soup.find(
            string=lambda text: isinstance(text, Comment) and 'X-Head-of-Message-End' in text
        )

        # If either comment is missing, skip this URL
        if head_of_message is None or head_of_message_end is None:
            print(f"Comments not found for URL: {url}")
            continue

        # Look for the first <ul> between the two comments. next_elements also yields
        # comments and strings, so the end marker is actually seen and stops the search.
        ul_tag = None
        for element in head_of_message.next_elements:
            if element is head_of_message_end:
                break
            if getattr(element, 'name', None) == 'ul':
                ul_tag = element
                break

        if ul_tag is None:
            print(f"UL tag not found between comments for URL: {url}")
            continue

        data = {}

        # Every <li> is one header: the <em> tag is its label, the rest is its value
        for li_tag in ul_tag.find_all('li'):
            em_tag = li_tag.find('em')
            if em_tag is None:
                continue
            column_name = em_tag.get_text(strip=True)

            # Remove only the first "<label>:" so the same text inside the value is kept
            tag_value = li_tag.get_text(strip=True)
            tag_value = tag_value.replace(column_name + ':', '', 1).strip()

            data[column_name] = tag_value

        # The body is read from the first <pre> tag of the page
        # NOTE(review): pages with several <pre> blocks only contribute the first one - confirm this is intended
        pre_tag = soup.find('pre')
        if pre_tag:
            data['Content'] = pre_tag.get_text(strip=True)

        # Add the link to the dataset
        data['Link'] = url

        records.append(data)

    # Create a DataFrame from the list of dictionaries
    result_df = pd.DataFrame(records)

    # Move 'Content' to the fourth position, or to the end when there are fewer columns
    if 'Content' in result_df.columns:
        content_column = result_df.pop('Content')
        result_df.insert(min(3, len(result_df.columns)), 'Content', content_column)

    return result_df

# Debian Mailing Lists: debian-dpkg

## Development of dpkg
##### Discussions and maintenance of the dpkg suite, the basis of the Debian packaging system.
##### This list is not moderated; posting is allowed by anyone.

Posting address: debian-dpkg@lists.debian.org

Link: https://lists.debian.org/debian-dpkg/

In [None]:
url = "https://lists.debian.org/debian-dpkg/"
debian_dpkg_data = scrape_debian_package_archives(url)

In [None]:
# Create a DataFrame
debian_dpkg_df = pd.DataFrame(debian_dpkg_data, columns=['Year', 'Month', 'Link'])

In [None]:
debian_dpkg_df

In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_dpkg_df, folder_path, 'links_of_mailing_lists_by_date_debian_dpkg')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_debian_dpkg_2024_02_20_22_48_44.csv


## Emails

#### List of emails

In [None]:
debian_dpkg_maillist_df = extract_maillist(debian_dpkg_df)

In [None]:
debian_dpkg_maillist_df

In [None]:
num_rows = len(debian_dpkg_maillist_df)
print("Number of rows:", num_rows)

Number of rows: 127141


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_dpkg_maillist_df, folder_path, 'mailing_lists_by_date_debian_dpkg')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_debian_dpkg_2024_02_19_09_50_41.csv


#### Content of emails

In [None]:
# Download the content of every listed e-mail. The result is stored under the name
# that the following cells display and save (`email_content_debian_dpkg_df`), and the
# upper bound follows the actual number of rows instead of a hard-coded count.
email_content_debian_dpkg_df = extract_email_content(debian_dpkg_maillist_df, 0, len(debian_dpkg_maillist_df))

In [None]:
email_content_debian_dpkg_df

Unnamed: 0,To,Cc,Subject,Content,From,Date,Message-id,Reply-to,References,Link,In-reply-to,Mail-followup-to
0,Thomas Dickey <dickey@herndon4.his.com>,"Jeff Sheinberg <jeffsh@localnet.com>,157093@bu...",Bug#157093: ncurses-base: xterm-mono is not us...,Thomas Dickey writes:\n> I looked at this a li...,Sven Rudolph <sr1@sax.sax.de>,01 Jan 2003 10:53:08 +0100,<[🔎]87bs31tcvv.fsf@loom.sax.de>,"Sven Rudolph <sr1@sax.sax.de>,157093@bugs.debi...",<20020824213255.GA17404@bloatware.reston01.va....,https://lists.debian.org/debian-dpkg/2003/debi...,,
1,Sven Rudolph <sr1@sax.sax.de>,"Thomas Dickey <dickey@herndon4.his.com>,\tJeff...",Bug#157093: ncurses-base: xterm-mono is not us...,"On Wed, Jan 01, 2003 at 10:53:08AM +0100, Sven...",Thomas Dickey <dickey@herndon4.his.com>,"Wed, 1 Jan 2003 07:02:01 -0500",<[🔎]20030101120201.GB347@bloatware.reston01.va...,"dickey@herndon4.his.com,157093@bugs.debian.org",<15710.55824.596655.721273@l1.bsrd.net> <20020...,https://lists.debian.org/debian-dpkg/2003/debi...,<[🔎]87bs31tcvv.fsf@loom.sax.de>,
2,submit@bugs.debian.org,,Bug#174971: dpkg(8) should point the user towa...,Package: dpkg\nVersion: 1.10.9\nSeverity: mino...,Zefram <zefram@fysh.org>,"Thu, 2 Jan 2003 00:49:18 +0000",<[🔎]20030102004917.GA9797@fysh.org>,"Zefram <zefram@fysh.org>,174971@bugs.debian.org",,https://lists.debian.org/debian-dpkg/2003/debi...,,
3,submit@bugs.debian.org,,Bug#174973: dpkg-query -W default output forma...,Package: dpkg\nVersion: 1.10.9\nTags: patch\n\...,Zefram <zefram@fysh.org>,"Thu, 2 Jan 2003 01:13:10 +0000",<[🔎]20030102011310.GA12425@fysh.org>,"Zefram <zefram@fysh.org>,174973@bugs.debian.org",,https://lists.debian.org/debian-dpkg/2003/debi...,,
4,submit@bugs.debian.org,,Bug#174976: dpkg-query --showformat is poorly ...,Package: dpkg\nVersion: 1.10.9\nTags: patch\n\...,Zefram <zefram@fysh.org>,"Thu, 2 Jan 2003 01:52:16 +0000",<[🔎]20030102015216.GA15023@fysh.org>,"Zefram <zefram@fysh.org>,174976@bugs.debian.org",,https://lists.debian.org/debian-dpkg/2003/debi...,,
...,...,...,...,...,...,...,...,...,...,...,...,...
132136,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Helmut Grohne <helmut@subdivi.de>,Re: Proper way to do setcap in maintscript,Niels Thykier:,Niels Thykier <niels@thykier.net>,"Fri, 29 Dec 2023 16:49:07 +0100",<[🔎]99c4a647-ec51-4201-bf27-507f15ddbdc0@thyki...,,<bef7166f-c3ab-4ef4-ace5-1467e9d883ca@thykier....,https://lists.debian.org/debian-dpkg/2023/debi...,<bef7166f-c3ab-4ef4-ace5-1467e9d883ca@thykier....,
132137,debian-dpkg@lists.debian.org,,Processing of dupload_2.10.5_amd64.changes,dupload_2.10.5_amd64.changes uploaded successf...,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Sat, 23 Dec 2023 00:13:18 +0000",<[🔎]E1rGpdn-00437x-Hv@usper.debian.org>,,,https://lists.debian.org/debian-dpkg/2023/debi...,,
132138,"Guillem Jover <guillem@debian.org>, Dpkg Devel...",,dupload_2.10.5_amd64.changes ACCEPTED into uns...,Thank you for your contribution to Debian.\n\n...,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Sat, 23 Dec 2023 00:20:05 +0000",<[🔎]E1rGpkL-00H2Mx-SS@fasolo.debian.org>,,,https://lists.debian.org/debian-dpkg/2023/debi...,,
132139,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Helmut Grohne <helmut@subdivi.de>,Re: Proper way to do setcap in maintscript,Niels Thykier:,Niels Thykier <niels@thykier.net>,"Fri, 29 Dec 2023 16:49:07 +0100",<[🔎]99c4a647-ec51-4201-bf27-507f15ddbdc0@thyki...,,<bef7166f-c3ab-4ef4-ace5-1467e9d883ca@thykier....,https://lists.debian.org/debian-dpkg/2023/debi...,<bef7166f-c3ab-4ef4-ace5-1467e9d883ca@thykier....,


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(email_content_debian_dpkg_df, folder_path, 'email_content_debian_dpkg_0_127141_with_duplications')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_debian_dpkg_0_127141_with_duplications_2024_02_24_00_47_56.csv


# Debian Mailing Lists: debian-glibc

## Maintaining GNU C Library packages
##### Discussion on Debian packaging of the GNU C Library, the most important library on Debian systems.
##### This list is not moderated; posting is allowed by anyone.

Posting address: debian-glibc@lists.debian.org

Link: https://lists.debian.org/debian-glibc/

In [None]:
url = "https://lists.debian.org/debian-glibc/"
debian_glibc_data = scrape_debian_package_archives(url)

In [None]:
# Create a DataFrame
debian_glibc_df = pd.DataFrame(debian_glibc_data, columns=['Year', 'Month', 'Link'])

In [None]:
debian_glibc_df

In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_glibc_df, folder_path, 'links_of_mailing_lists_by_date_debian_glibc')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_debian_glibc_2024_02_24_01_02_43.csv


## Emails

#### List of emails

In [None]:
debian_glibc_maillist_df = extract_maillist(debian_glibc_df)

In [None]:
debian_glibc_maillist_df

In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_glibc_maillist_df, folder_path, 'mailing_lists_by_date_debian_glibc_maillist')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_debian_glibc_maillist_2024_02_24_01_04_13.csv


#### Content of emails

In [None]:
# Store the data in a DataFrame
email_content_debian_glibc_df_fragment = extract_email_content(debian_glibc_maillist_df, 500000, 505000)

In [None]:
# Append the fragment downloaded above to the accumulated debian-glibc e-mails.
try:
    previous_glibc_content = email_content_debian_glibc_df
except NameError:
    # No fragment has been accumulated yet in this session
    previous_glibc_content = None

if isinstance(previous_glibc_content, pd.DataFrame):
    email_content_debian_glibc_df = pd.concat(
        [previous_glibc_content, email_content_debian_glibc_df_fragment],
        ignore_index=True,
    )
else:
    # First fragment: start the accumulator from it instead of from an empty
    # DataFrame, so the data is kept without having to run this cell twice
    email_content_debian_glibc_df = email_content_debian_glibc_df_fragment.copy()

In [None]:
len(email_content_debian_glibc_df)

143825

In [None]:
# Timezone used for the timestamp in the file name
local_timezone = pytz.timezone('Europe/Paris')

# Current time, then the same instant expressed in that timezone
current_time = datetime.now()
current_time_local = current_time.astimezone(local_timezone)

# Timestamp with date, hour, minute, and seconds
timestamp = format(current_time_local, '%Y_%m_%d_%H_%M_%S')

# File name carrying the row ranges covered and the timestamp
file_name = f'email_content_debian_glibc_0_72000_and_500000_571825_{timestamp}.csv'

# Full path inside the Drive folder
file_path = f'{folder_path}{file_name}'

# Write the CSV (index included) with a backslash as escape character
email_content_debian_glibc_df.to_csv(file_path, escapechar='\\')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_debian_glibc_0_72000_and_500000_571825_2024_02_28_01_13_33.csv


# Debian Mailing Lists: debian-kernel

## Kernel packages in Debian
##### Kernels used with Debian (Linux, Hurd, etc.), available patches and flavors, packaging issues, bug reports, porting issues, automated tools, and any other questions or patches that are kernel-related. Mostly bug reporting is done here.
##### This list is not moderated; posting is allowed by anyone.

Posting address: debian-kernel@lists.debian.org

Link: https://lists.debian.org/debian-kernel/

In [None]:
url = "https://lists.debian.org/debian-kernel/"
debian_kernel_data = scrape_debian_package_archives(url)

In [None]:
# Create a DataFrame
debian_kernel_df = pd.DataFrame(debian_kernel_data, columns=['Year', 'Month', 'Link'])

In [None]:
# Reload the archive index saved by an earlier run when that file is available;
# otherwise keep the DataFrame just built from the live scrape.
debian_kernel_links_csv = folder_path + "links_of_mailing_lists_by_date_debian_kernel_2024_02_28_01_15_28.csv"
if os.path.exists(debian_kernel_links_csv):
    debian_kernel_df = pd.read_csv(debian_kernel_links_csv)
else:
    print("Saved file not found, keeping the freshly scraped data:", debian_kernel_links_csv)

In [None]:
debian_kernel_df

Unnamed: 0,Year,Month,Link
0,2004,Mar,https://lists.debian.org/debian-kernel/2004/de...
1,2004,Apr,https://lists.debian.org/debian-kernel/2004/de...
2,2004,May,https://lists.debian.org/debian-kernel/2004/de...
3,2004,Jun,https://lists.debian.org/debian-kernel/2004/de...
4,2004,Jul,https://lists.debian.org/debian-kernel/2004/de...
...,...,...,...
233,2023,Aug,https://lists.debian.org/debian-kernel/2023/de...
234,2023,Sep,https://lists.debian.org/debian-kernel/2023/de...
235,2023,Oct,https://lists.debian.org/debian-kernel/2023/de...
236,2023,Nov,https://lists.debian.org/debian-kernel/2023/de...


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_kernel_df, folder_path, 'links_of_mailing_lists_by_date_debian_kernel')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_debian_kernel_2024_02_28_01_15_28.csv


## Emails

#### List of emails

In [None]:
debian_kernel_maillist_df = extract_maillist(debian_kernel_df)

In [None]:
# Reload the list of e-mails saved by an earlier run when that file is available;
# otherwise keep the list just collected by extract_maillist.
debian_kernel_maillist_csv = folder_path + "mailing_lists_by_date_debian_kernel_maillist_2024_02_28_01_21_02.csv"
if os.path.exists(debian_kernel_maillist_csv):
    debian_kernel_maillist_df = pd.read_csv(debian_kernel_maillist_csv)
else:
    print("Saved file not found, keeping the freshly collected data:", debian_kernel_maillist_csv)

In [None]:
debian_kernel_maillist_df


Unnamed: 0,Date,Sender,Topic,Link
0,Mar 29 2004,Joe Nahmias,Welcome to the new debian-kernel list!,https://lists.debian.org/debian-kernel/2004/de...
1,Mar 29 2004,DevilX,unsubscribe,https://lists.debian.org/debian-kernel/2004/de...
2,Mar 29 2004,Dustin Lundquist,APIC error running 2.4.18-686-smp,https://lists.debian.org/debian-kernel/2004/de...
3,Mar 29 2004,Sven Luther,Re: Bug#219826 acknowledged by developer (Welc...,https://lists.debian.org/debian-kernel/2004/de...
4,Mar 29 2004,Sven Luther,Re: APIC error running 2.4.18-686-smp,https://lists.debian.org/debian-kernel/2004/de...
...,...,...,...,...
1193901,Dec 31 2023,Cordell Bloor,Bug#1059607: linux-image-6.1.0-16-amd64: Steam...,https://lists.debian.org/debian-kernel/2023/de...
1193902,Dec 31 2023,Roland Clobus,Re: Immediate fallouts from the big linux chan...,https://lists.debian.org/debian-kernel/2023/de...
1193903,Dec 31 2023,Paul Gevers,Bug#1059765: linux: isolation-machine autopkgt...,https://lists.debian.org/debian-kernel/2023/de...
1193904,Dec 31 2023,Bastian Blank,Bug#1059765: linux: isolation-machine autopkgt...,https://lists.debian.org/debian-kernel/2023/de...


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_kernel_maillist_df, folder_path, 'mailing_lists_by_date_debian_kernel_maillist')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_debian_kernel_maillist_2024_02_28_01_21_02.csv


#### Content of emails

In [None]:
# Store the data in a DataFrame
email_content_debian_kernel_df_fragment = extract_email_content(debian_kernel_maillist_df, 1093906, 1095000)

In [None]:
# Append the fragment downloaded above to the accumulated debian-kernel e-mails.
try:
    previous_kernel_content = email_content_debian_kernel_df
except NameError:
    # No fragment has been accumulated yet in this session
    previous_kernel_content = None

if isinstance(previous_kernel_content, pd.DataFrame):
    email_content_debian_kernel_df = pd.concat(
        [previous_kernel_content, email_content_debian_kernel_df_fragment],
        ignore_index=True,
    )
else:
    # First fragment: start the accumulator from it instead of from an empty
    # DataFrame, so the data is kept without having to run this cell twice
    email_content_debian_kernel_df = email_content_debian_kernel_df_fragment.copy()

In [None]:
#email_content_debian_kernel_df = pd.read_csv(folder_path + "email_content_debian_kernel_1120000_1193906_2024_03_01_10_38_20.csv")

In [None]:
len(email_content_debian_kernel_df)

100000

In [None]:
email_content_debian_kernel_df


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(email_content_debian_kernel_df, folder_path, 'email_content_debian_kernel_1093906_1193906')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_debian_kernel_1093906_1193906_2024_03_01_17_57_55.csv


# Debian Mailing Lists: debian-release

## Coordinating Debian releases
##### Coordination of Debian releases issues such as testing migrations, transitions and removals. This list should not be considered a discussion list; discussions related to releases issues should be held on more appropriate lists such as debian-devel, debian-legal or debian-project.
##### This list is not moderated; posting is allowed by anyone.

Posting address: debian-release@lists.debian.org

Link: https://lists.debian.org/debian-release/

In [None]:
url = "https://lists.debian.org/debian-release/"
debian_release_data = scrape_debian_package_archives(url)

In [None]:
# Create a DataFrame
debian_release_df = pd.DataFrame(debian_release_data, columns=['Year', 'Month', 'Link'])

In [None]:
# Reload the archive index saved by an earlier run when that file is available;
# otherwise keep the DataFrame just built from the live scrape.
debian_release_links_csv = folder_path + "links_of_mailing_lists_by_date_debian_release_2024_03_01_18_44_25.csv"
if os.path.exists(debian_release_links_csv):
    debian_release_df = pd.read_csv(debian_release_links_csv)
else:
    print("Saved file not found, keeping the freshly scraped data:", debian_release_links_csv)

In [None]:
debian_release_df

Unnamed: 0,Year,Month,Link
0,2003,Jan,https://lists.debian.org/debian-release/2003/d...
1,2003,Feb,https://lists.debian.org/debian-release/2003/d...
2,2003,Mar,https://lists.debian.org/debian-release/2003/d...
3,2003,Apr,https://lists.debian.org/debian-release/2003/d...
4,2003,May,https://lists.debian.org/debian-release/2003/d...
...,...,...,...
247,2023,Aug,https://lists.debian.org/debian-release/2023/d...
248,2023,Sep,https://lists.debian.org/debian-release/2023/d...
249,2023,Oct,https://lists.debian.org/debian-release/2023/d...
250,2023,Nov,https://lists.debian.org/debian-release/2023/d...


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_release_df, folder_path, 'links_of_mailing_lists_by_date_debian_release')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_debian_release_2024_03_01_18_44_25.csv


## Emails

#### List of emails

In [None]:
debian_release_maillist_df = extract_maillist(debian_release_df)

In [None]:
# Reload the list of e-mails saved by an earlier run when that file is available;
# otherwise keep the list just collected by extract_maillist.
debian_release_maillist_csv = folder_path + "mailing_lists_by_date_debian_release_maillist_2024_03_01_18_55_39.csv"
if os.path.exists(debian_release_maillist_csv):
    debian_release_maillist_df = pd.read_csv(debian_release_maillist_csv)
else:
    print("Saved file not found, keeping the freshly collected data:", debian_release_maillist_csv)

In [None]:
debian_release_maillist_df

Unnamed: 0,Date,Sender,Topic,Link
0,Mar 07 2003,Adam DiCarlo,testing and sarge release goals,https://lists.debian.org/debian-release/2003/d...
1,Mar 07 2003,Anthony Towns,Assignments,https://lists.debian.org/debian-release/2003/d...
2,Mar 07 2003,Zbyszek Kurzyca,unsubscribe,https://lists.debian.org/debian-release/2003/d...
3,Mar 07 2003,Colin Watson,Re: Assignments,https://lists.debian.org/debian-release/2003/d...
4,Mar 25 2003,Anthony Towns,Assignments,https://lists.debian.org/debian-release/2003/d...
...,...,...,...,...
1092114,Dec 16 2023,Debian Bug Tracking System,Bug#1058700: marked as done (nmu: dar_2.7.13-2),https://lists.debian.org/debian-release/2023/d...
1092115,Dec 16 2023,Adrian Bunk,Re: Bug#1057755: Qt WebEngine Security Support...,https://lists.debian.org/debian-release/2023/d...
1092116,Dec 16 2023,Soren Stoutner,Re: Bug#1057755: Qt WebEngine Security Support...,https://lists.debian.org/debian-release/2023/d...
1092117,Dec 16 2023,Patrick Franz,Re: Bug#1057755: Qt WebEngine Security Support...,https://lists.debian.org/debian-release/2023/d...


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(debian_release_maillist_df, folder_path, 'mailing_lists_by_date_debian_release_maillist')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_debian_release_maillist_2024_03_01_18_55_39.csv


#### Content of emails

In [None]:
# Store the data in a DataFrame
email_content_debian_release_df_fragment = extract_email_content(debian_release_maillist_df, 992238, 1000238)

In [None]:
# Append the fragment downloaded above to the accumulated debian-release e-mails.
try:
    previous_release_content = email_content_debian_release_df
except NameError:
    # No fragment has been accumulated yet in this session
    previous_release_content = None

if isinstance(previous_release_content, pd.DataFrame):
    email_content_debian_release_df = pd.concat(
        [previous_release_content, email_content_debian_release_df_fragment],
        ignore_index=True,
    )
else:
    # First fragment: start the accumulator from it instead of from an empty
    # DataFrame, so the data is kept without having to run this cell twice
    email_content_debian_release_df = email_content_debian_release_df_fragment.copy()

In [None]:
#email_content_debian_release_df = pd.read_csv(folder_path + "email_content_debian_release_1018238_1092119_2024_03_03_00_47_15.csv")

In [None]:
len(email_content_debian_release_df)

100000

In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(email_content_debian_release_df, folder_path, 'email_content_debian_release_992238_1092119')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_debian_release_992238_1092119_2024_03_03_13_42_57.csv


# Debian Mailing Lists: deity (apt)

## APT packages maintenance
##### Debian has a friendly frontend to its package maintenance system. Its codename is deity (now known as APT) and its development is discussed here. The -digest is open to everyone.
##### This list is not moderated; posting is allowed by anyone.

Posting address: deity@lists.debian.org

Link: https://lists.debian.org/deity/

In [None]:
url = "https://lists.debian.org/deity/"
deity_apt_data = scrape_debian_package_archives(url)

In [None]:
# Create a DataFrame
deity_apt_df = pd.DataFrame(deity_apt_data, columns=['Year', 'Month', 'Link'])

In [None]:
# Reload the archive index saved by an earlier run when that file is available;
# otherwise keep the DataFrame just built from the live scrape.
deity_apt_links_csv = folder_path + "links_of_mailing_lists_by_date_deity_apt_2024_03_03_13_51_07.csv"
if os.path.exists(deity_apt_links_csv):
    deity_apt_df = pd.read_csv(deity_apt_links_csv)
else:
    print("Saved file not found, keeping the freshly scraped data:", deity_apt_links_csv)

In [None]:
deity_apt_df

Unnamed: 0,Year,Month,Link
0,2003,Jan,https://lists.debian.org/deity/2003/deity-2003...
1,2003,Feb,https://lists.debian.org/deity/2003/deity-2003...
2,2003,Mar,https://lists.debian.org/deity/2003/deity-2003...
3,2003,Apr,https://lists.debian.org/deity/2003/deity-2003...
4,2003,May,https://lists.debian.org/deity/2003/deity-2003...
...,...,...,...
247,2023,Aug,https://lists.debian.org/deity/2023/deity-2023...
248,2023,Sep,https://lists.debian.org/deity/2023/deity-2023...
249,2023,Oct,https://lists.debian.org/deity/2023/deity-2023...
250,2023,Nov,https://lists.debian.org/deity/2023/deity-2023...


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(deity_apt_df, folder_path, 'links_of_mailing_lists_by_date_deity_apt')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_deity_apt_2024_03_03_13_51_07.csv


## Emails

#### List of emails

In [None]:
deity_apt_maillist_df = extract_maillist(deity_apt_df)

In [None]:
# Reload the list of e-mails saved by an earlier run when that file is available;
# otherwise keep the list just collected by extract_maillist.
deity_apt_maillist_csv = folder_path + "mailing_lists_by_date_deity_apt_maillist_2024_03_03_13_56_08.csv"
if os.path.exists(deity_apt_maillist_csv):
    deity_apt_maillist_df = pd.read_csv(deity_apt_maillist_csv)
else:
    print("Saved file not found, keeping the freshly collected data:", deity_apt_maillist_csv)

In [None]:
deity_apt_maillist_df

Unnamed: 0,Date,Sender,Topic,Link
0,Jan 01 2003,Jerry Quinn,Bug#174931: apt: apt-get source won't fallback...,https://lists.debian.org/deity/2003/deity-2003...
1,Jan 01 2003,Tollef Fog Heen,Bug#174945: apt: installing package with lots ...,https://lists.debian.org/deity/2003/deity-2003...
2,Jan 01 2003,Hajo Noerenberg,Bug#175034: apt/dpkg ipv6 ftp problem (EPTR),https://lists.debian.org/deity/2003/deity-2003...
3,Jan 01 2003,Marco d'Itri,Bug#175055: apt-utils: libraries not correctly...,https://lists.debian.org/deity/2003/deity-2003...
4,Jan 01 2003,Steven Homolya,Bug#175121: apt-get does not honour Default-Re...,https://lists.debian.org/deity/2003/deity-2003...
...,...,...,...,...
423897,Dec 28 2023,Patrice Duroux,Bug#1059629: apt: doing something like 'apt -t...,https://lists.debian.org/deity/2023/deity-2023...
423898,Dec 28 2023,Yahaira Villanueva,Buen día,https://lists.debian.org/deity/2023/deity-2023...
423899,Dec 29 2023,Andres Lozano,información adjunta,https://lists.debian.org/deity/2023/deity-2023...
423900,Dec 29 2023,Patrice Duroux,Bug#1059629: apt: doing something like 'apt -t...,https://lists.debian.org/deity/2023/deity-2023...


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(deity_apt_maillist_df, folder_path, 'mailing_lists_by_date_deity_apt_maillist')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_deity_apt_maillist_2024_03_03_13_56_08.csv


#### Content of emails

In [None]:
# Store the data in a DataFrame
email_content_deity_apt_df_fragment = extract_email_content(deity_apt_maillist_df, 323902, 337902)

In [None]:
# Append the fragment downloaded above to the accumulated deity (apt) e-mails.
try:
    previous_deity_apt_content = email_content_deity_apt_df
except NameError:
    # No fragment has been accumulated yet in this session
    previous_deity_apt_content = None

if isinstance(previous_deity_apt_content, pd.DataFrame):
    email_content_deity_apt_df = pd.concat(
        [previous_deity_apt_content, email_content_deity_apt_df_fragment],
        ignore_index=True,
    )
else:
    # First fragment: start the accumulator from it instead of from an empty
    # DataFrame, so the data is kept without having to run this cell twice
    email_content_deity_apt_df = email_content_deity_apt_df_fragment.copy()

In [None]:
#email_content_deity_apt_df = pd.read_csv(folder_path + "email_content_deity_apt_353902_423902_2024_03_04_13_08_40.csv")

In [None]:
len(email_content_deity_apt_df)

100000

In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(email_content_deity_apt_df, folder_path, 'email_content_deity_apt_323902_423902')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_deity_apt_323902_423902_2024_03_04_22_43_09.csv


#Debian Mailing Lists: debian-gcc

##Debian GCC Maintainers
#####Discussion on Debian packaging of GCC, the GNU compiler collection: bug reports, porting issues, any kind of questions or patches.
#####This list is not moderated; posting is allowed by anyone.

Posting address: debian-gcc@lists.debian.org

Link: https://lists.debian.org/debian-gcc/

In [None]:
# Collect the archive links of the debian-gcc list from its index page
# (scrape_debian_package_archives is defined near the top of the notebook).
url = "https://lists.debian.org/debian-gcc/"
debian_gcc_data = scrape_debian_package_archives(url)

In [None]:
# Turn the scraped rows into a DataFrame with one row per archive link
debian_gcc_df = pd.DataFrame(debian_gcc_data, columns=['Year', 'Month', 'Link'])

In [None]:
# Reload a previously saved snapshot instead of re-scraping.
# NOTE: this overwrites the debian_gcc_df built from the live scrape in the cell above.
debian_gcc_df = pd.read_csv(folder_path + "links_of_mailing_lists_by_date_debian_gcc_2024_03_04_23_27_37.csv")

In [None]:
# Inspect the archive link table
debian_gcc_df

Unnamed: 0,Year,Month,Link
0,2003,Jan,https://lists.debian.org/debian-gcc/2003/debia...
1,2003,Feb,https://lists.debian.org/debian-gcc/2003/debia...
2,2003,Mar,https://lists.debian.org/debian-gcc/2003/debia...
3,2003,Apr,https://lists.debian.org/debian-gcc/2003/debia...
4,2003,May,https://lists.debian.org/debian-gcc/2003/debia...
...,...,...,...
247,2023,Aug,https://lists.debian.org/debian-gcc/2023/debia...
248,2023,Sep,https://lists.debian.org/debian-gcc/2023/debia...
249,2023,Oct,https://lists.debian.org/debian-gcc/2023/debia...
250,2023,Nov,https://lists.debian.org/debian-gcc/2023/debia...


In [None]:
# Persist the debian-gcc archive links to Drive; the helper appends a
# timestamp to the base name and returns the full path of the new file.
file_path = save_dataframe_to_csv(debian_gcc_df, folder_path, 'links_of_mailing_lists_by_date_debian_gcc')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_debian_gcc_2024_03_04_23_27_37.csv


##Emails

####List of emails

In [None]:
# Build the list of emails from the archive links
# (extract_maillist is defined earlier in the notebook).
debian_gcc_maillist_df = extract_maillist(debian_gcc_df)

In [None]:
# Reload a previously saved snapshot instead of re-scraping.
# NOTE: this overwrites the debian_gcc_maillist_df produced by the cell above.
debian_gcc_maillist_df = pd.read_csv(folder_path + "mailing_lists_by_date_debian_gcc_maillist_2024_03_04_23_31_18.csv")

In [None]:
# Inspect the email list
debian_gcc_maillist_df

Unnamed: 0,Date,Sender,Topic,Link
0,Jan 01 2003,Bdale Garbee,Bug#174906: gcc-3.2_1:3.2.2ds3-0pre3(unstable/...,https://lists.debian.org/debian-gcc/2003/debia...
1,Jan 01 2003,Matthias Klose,Bug#174906: gcc-3.2_1:3.2.2ds3-0pre3(unstable/...,https://lists.debian.org/debian-gcc/2003/debia...
2,Jan 01 2003,Debian Bug Tracking System,Bug#174906: marked as done (gcc-3.2_1:3.2.2ds3...,https://lists.debian.org/debian-gcc/2003/debia...
3,Jan 01 2003,Matthias Klose,help needed with logwatcher in 3.2.2ds3,https://lists.debian.org/debian-gcc/2003/debia...
4,Jan 01 2003,Daniel Jacobowitz,Re: help needed with logwatcher in 3.2.2ds3,https://lists.debian.org/debian-gcc/2003/debia...
...,...,...,...,...
769381,Dec 30 2023,Debian Bug Tracking System,Processed: Re: gcc-13: Please build gcc with -...,https://lists.debian.org/debian-gcc/2023/debia...
769382,Dec 30 2023,Debian Bug Tracking System,Processed: Re: gcc-13: Please build gcc with -...,https://lists.debian.org/debian-gcc/2023/debia...
769383,Dec 31 2023,Matthias Klose,Bug#1057469: gcc-13: Please build gcc with -mb...,https://lists.debian.org/debian-gcc/2023/debia...
769384,Dec 31 2023,Debian Bug Tracking System,Processed: Re: gcc-13: Please build gcc with -...,https://lists.debian.org/debian-gcc/2023/debia...


In [None]:
# Persist the debian-gcc email list to Drive; the helper appends a
# timestamp to the base name and returns the full path of the new file.
file_path = save_dataframe_to_csv(debian_gcc_maillist_df, folder_path, 'mailing_lists_by_date_debian_gcc_maillist')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_debian_gcc_maillist_2024_03_04_23_31_18.csv


####Content of emails

In [None]:
# Scrape one slice of email bodies into a "fragment" DataFrame.
# NOTE(review): the two numbers are presumably the start/end row positions within
# debian_gcc_maillist_df (extract_email_content is defined earlier in the notebook) —
# they are edited by hand for each run.
email_content_debian_gcc_df_fragment = extract_email_content(debian_gcc_maillist_df, 669386, 689386)

In [None]:
# Fold the freshly scraped fragment into the running debian-gcc email DataFrame.
# NOTE: this cell is not idempotent — re-running it without scraping a new
# fragment appends the same rows a second time.
try:
    if isinstance(email_content_debian_gcc_df, pd.DataFrame):
        # An accumulator already exists: append the new fragment to it
        email_content_debian_gcc_df = pd.concat(
            [email_content_debian_gcc_df, email_content_debian_gcc_df_fragment],
            ignore_index=True,
        )
    else:
        # The name is bound to something that is not a DataFrame: start over from the fragment
        email_content_debian_gcc_df = email_content_debian_gcc_df_fragment.copy()
except NameError:
    # First run on a fresh kernel: the accumulator does not exist yet, so the
    # fragment becomes its initial content (instead of being silently dropped)
    email_content_debian_gcc_df = email_content_debian_gcc_df_fragment.copy()

In [None]:
# Reload a previously saved snapshot of the email contents.
# NOTE: this overwrites whatever the accumulation cell above stored in email_content_debian_gcc_df.
email_content_debian_gcc_df = pd.read_csv(folder_path + "email_content_debian_gcc_669386_769386_2024_03_05_12_09_14.csv")

In [None]:
# Preview the first two emails to check the column layout
email_content_debian_gcc_df.head(2)

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Link,Reply-to,References,Cc,In-reply-to,Mail-followup-to
0,cvise@packages.debian.org,cvise is marked for autoremoval from testing,Debian testing autoremoval watch <noreply@rele...,cvise 2.8.0-1 is marked for autoremoval from t...,"Sat, 15 Jul 2023 04:39:08 +0000",<[🔎]E1qKX3k-007RYp-7y@respighi.debian.org>,https://lists.debian.org/debian-gcc/2023/debia...,,,,,
1,Matthias Klose <doko@debian.org>,Bug#1037615: marked as done (cvise: ftbfs with...,"""Debian Bug Tracking System"" <owner@bugs.debia...","Your message dated Sat, 15 Jul 2023 09:34:29 +...","Sat, 15 Jul 2023 07:39:03 +0000",<[🔎]handler.1037615.D1037615.16894064751331671...,https://lists.debian.org/debian-gcc/2023/debia...,1037615@bugs.debian.org,<e6791048-9a33-3bbf-9230-5e7fa0c59ad7@debian.o...,,,


In [None]:
# Persist the debian-gcc email contents to Drive.
# The row range in the base name is typed by hand — keep it in sync with the
# slices that were actually scraped. The helper appends a timestamp to the name.
file_path = save_dataframe_to_csv(email_content_debian_gcc_df, folder_path, 'email_content_debian_gcc_669386_769386')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_debian_gcc_669386_769386_2024_03_05_12_09_14.csv


#Debian Mailing Lists: pkg-systemd-maintainers


Posting address: pkg-systemd-maintainers@alioth-lists.debian.net

Link: https://alioth-lists.debian.net/pipermail/pkg-systemd-maintainers/

Alternative Link: https://wiki.debian.org/AdditionalDebianMailingLists

In [None]:
# Download the pipermail index page of pkg-systemd-maintainers and parse it.
# `response` and `soup` are notebook-level names reused by the following cells.
url_systemd = 'https://alioth-lists.debian.net/pipermail/pkg-systemd-maintainers/'
response = requests.get(url_systemd)
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Take the first <table> of the index page; the next cell reads one archive period per row from it
table = soup.find('table')

In [None]:
# Build one [year, month, link] entry per row of the archive index table
pkg_systemd_maintainers_data = []
for row in table.find_all('tr')[1:]:  # skip the first <tr> (presumably the header row)
    cols = row.find_all('td')

    # The first cell is parsed as "<month> <year>:": the month is the text before
    # the space, the year is the text after it up to the ':'
    period_parts = cols[0].text.strip().split(' ')
    month = period_parts[0]
    year = period_parts[1].split(':')[0]

    # First link of the second column. It is stored unchanged because
    # extract_pkg_systemd_maintainers_maillist strips 'thread.html' from it
    # to build the per-message URLs.
    link = cols[1].find('a')['href']

    # Keep only the periods of interest
    if 2003 <= int(year) <= 2023:
        pkg_systemd_maintainers_data.append([year, month, url_systemd + link])

In [None]:
def extract_pkg_systemd_maintainers_maillist(existing_df):
    """
    Collect one row per linked <li> entry found on each archive index page.

    Args:
    - existing_df: pandas DataFrame with 'Year', 'Month' and 'Link' columns;
      'Link' is the URL of an index page (its 'thread.html' suffix is stripped
      to build the per-message URLs). 'Year' may be a string or an integer.

    Returns:
    - new_data: pandas DataFrame with columns 'Date', 'Sender', 'Topic', 'Link'.
      Every <li> that contains a link is collected, so page-navigation entries
      (e.g. "Messages sorted by:") are included and must be filtered by the caller.
      An empty input yields an empty DataFrame with the same columns.
    """
    columns = ['Date', 'Sender', 'Topic', 'Link']
    records = []

    # Iterate through each row in the existing DataFrame
    for _, row in existing_df.iterrows():
        # Get the URL from the 'Link' column
        url = row['Link']

        # Send a GET request to the URL and parse the HTML content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Formatting (rather than '+') keeps this working when 'Year' is an
        # integer, which is the case once the links table is reloaded from CSV
        date_label = f"{row['Month']} {row['Year']}"

        # Base URL of the period: message links are relative to it
        clean_link = url.replace('thread.html', '')

        for li_tag in soup.find_all('li'):
            # The first <a> of the entry carries the topic text and the message link
            a_tag = li_tag.find('a')
            if a_tag is None or a_tag.get('href') is None:
                # Nothing to link to: this entry cannot be an email
                continue

            # Sender: text of the <i> tag when the entry has one, otherwise the
            # whole text of the entry. (The original filtered the text nodes on
            # `child.name`, which never excluded anything; that no-op filter is
            # dropped here and the resulting text is the same.)
            sender = " ".join(text.strip() for text in li_tag.find_all(string=True)).strip()
            if sender:
                i_tag = li_tag.find('i')
                if i_tag is not None:
                    sender = i_tag.text.strip()

            records.append({
                'Date': date_label,
                'Sender': sender,
                'Topic': a_tag.get_text(strip=True),
                'Link': clean_link + a_tag['href'],
            })

    # Build the DataFrame once; explicit columns keep the schema stable even
    # when no entry was collected
    new_data = pd.DataFrame(records, columns=columns)

    return new_data


In [None]:
# Create a DataFrame with one row per archive period
pkg_systemd_maintainers_df = pd.DataFrame(pkg_systemd_maintainers_data, columns=['Year', 'Month', 'Link'])
# Sort by year only: rows of the same year are not put in calendar order
pkg_systemd_maintainers_df = pkg_systemd_maintainers_df.sort_values(by='Year')

In [None]:
# Reload a previously saved snapshot instead of re-scraping.
# NOTE: this overwrites the pkg_systemd_maintainers_df built in the cell above.
# NOTE(review): after a CSV round-trip 'Year' is presumably numeric rather than a string — confirm.
pkg_systemd_maintainers_df = pd.read_csv(folder_path + "links_of_mailing_lists_by_date_pkg_systemd_maintainers_2024_03_05_12_13_02.csv")

In [None]:
# Inspect the archive link table
pkg_systemd_maintainers_df

Unnamed: 0,Year,Month,Link
0,2013,March,https://alioth-lists.debian.net/pipermail/pkg-...
1,2013,December,https://alioth-lists.debian.net/pipermail/pkg-...
2,2013,November,https://alioth-lists.debian.net/pipermail/pkg-...
3,2013,October,https://alioth-lists.debian.net/pipermail/pkg-...
4,2013,September,https://alioth-lists.debian.net/pipermail/pkg-...
...,...,...,...
125,2023,March,https://alioth-lists.debian.net/pipermail/pkg-...
126,2023,February,https://alioth-lists.debian.net/pipermail/pkg-...
127,2023,January,https://alioth-lists.debian.net/pipermail/pkg-...
128,2023,June,https://alioth-lists.debian.net/pipermail/pkg-...


In [None]:
# Persist the pkg-systemd-maintainers archive links to Drive; the helper appends
# a timestamp to the base name and returns the full path of the new file.
file_path = save_dataframe_to_csv(pkg_systemd_maintainers_df, folder_path, 'links_of_mailing_lists_by_date_pkg_systemd_maintainers')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/links_of_mailing_lists_by_date_pkg_systemd_maintainers_2024_03_05_12_13_02.csv


##Emails

####List of emails

In [None]:
# Build the raw list of entries (emails plus page-navigation links) for every archive period
pkg_systemd_maintainers_maillist_df = extract_pkg_systemd_maintainers_maillist(pkg_systemd_maintainers_df)

In [None]:
# Inspect the raw entries before filtering
pkg_systemd_maintainers_maillist_df

Unnamed: 0,Date,Sender,Topic,Link
0,March 2013,Messages sorted by: [ subject ] [ author ] ...,[ subject ],https://alioth-lists.debian.net/pipermail/pkg-...
1,March 2013,More info on this list...,More info on this list...,https://alioth-lists.debian.net/pipermail/pkg-...
2,March 2013,Michael Biebl,[Pkg-systemd-maintainers] test,https://alioth-lists.debian.net/pipermail/pkg-...
3,March 2013,Michael Stapelberg,[Pkg-systemd-maintainers] Bug#703571: sysvinit...,https://alioth-lists.debian.net/pipermail/pkg-...
4,March 2013,Michael Stapelberg,[Pkg-systemd-maintainers] Bug#704197: Please r...,https://alioth-lists.debian.net/pipermail/pkg-...
...,...,...,...,...
27841,December 2023,Debian Bug Tracking System,"Processed: unarchiving 1056135, fixed 1056135 ...",https://alioth-lists.debian.net/pipermail/pkg-...
27842,December 2023,Debian Bug Tracking System,Processed: tagging 1058880,https://alioth-lists.debian.net/pipermail/pkg-...
27843,December 2023,Debian Bug Tracking System,Processed: archiving 1056135,https://alioth-lists.debian.net/pipermail/pkg-...
27844,December 2023,Messages sorted by: [ subject ] [ author ] ...,[ subject ],https://alioth-lists.debian.net/pipermail/pkg-...


In [None]:
# Remove garbage rows: entries that are not emails but were collected because
# they use the same HTML tag (<li>) as the one that structures the real data.
# First pass: drop the "Messages sorted by:" navigation entries.
filtered_pkg_systemd_maintainers_maillist_df = pkg_systemd_maintainers_maillist_df.loc[
    ~pkg_systemd_maintainers_maillist_df['Sender'].str.contains("Messages sorted by:")
]
# Second pass: drop the "More info on this list" entries from what is left.
filtered_pkg_systemd_maintainers_maillist_df = filtered_pkg_systemd_maintainers_maillist_df.loc[
    ~filtered_pkg_systemd_maintainers_maillist_df['Sender'].str.contains("More info on this list")
]

In [None]:
# Reload a previously saved snapshot of the filtered list.
# NOTE: this overwrites the filtered DataFrame computed in the cell above.
filtered_pkg_systemd_maintainers_maillist_df = pd.read_csv(folder_path + "mailing_lists_by_date_filtered_pkg_systemd_maintainers_maillist_2024_03_05_12_14_23.csv")

In [None]:
# Inspect the filtered email list
filtered_pkg_systemd_maintainers_maillist_df

Unnamed: 0,Date,Sender,Topic,Link
0,March 2013,Michael Biebl,[Pkg-systemd-maintainers] test,https://alioth-lists.debian.net/pipermail/pkg-...
1,March 2013,Michael Stapelberg,[Pkg-systemd-maintainers] Bug#703571: sysvinit...,https://alioth-lists.debian.net/pipermail/pkg-...
2,March 2013,Michael Stapelberg,[Pkg-systemd-maintainers] Bug#704197: Please r...,https://alioth-lists.debian.net/pipermail/pkg-...
3,March 2013,Michael Biebl,[Pkg-systemd-maintainers] VIO on sparc / udev ...,https://alioth-lists.debian.net/pipermail/pkg-...
4,December 2013,gustavo panizzo <gfa>,[Pkg-systemd-maintainers] Bug#729272: systemd ...,https://alioth-lists.debian.net/pipermail/pkg-...
...,...,...,...,...
27321,December 2023,Debian Bug Tracking System,Bug#1051843: marked as done (systemd: Restart=...,https://alioth-lists.debian.net/pipermail/pkg-...
27322,December 2023,Debian Bug Tracking System,Bug#1053872: marked as done (systemd with high...,https://alioth-lists.debian.net/pipermail/pkg-...
27323,December 2023,Debian Bug Tracking System,"Processed: unarchiving 1056135, fixed 1056135 ...",https://alioth-lists.debian.net/pipermail/pkg-...
27324,December 2023,Debian Bug Tracking System,Processed: tagging 1058880,https://alioth-lists.debian.net/pipermail/pkg-...


In [None]:
# Persist the filtered pkg-systemd-maintainers email list to Drive; the helper
# appends a timestamp to the base name and returns the full path of the new file.
file_path = save_dataframe_to_csv(filtered_pkg_systemd_maintainers_maillist_df, folder_path, 'mailing_lists_by_date_filtered_pkg_systemd_maintainers_maillist')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/mailing_lists_by_date_filtered_pkg_systemd_maintainers_maillist_2024_03_05_12_14_23.csv


####Content of emails

In [None]:
# Function to collect the content of the email, receives as parameter the dataframe of subjects and uses the link included
def _first_tag_before(start_tag, tag_name, stop_tag):
    """Return the first element named `tag_name` found after `start_tag`, or None
    when `stop_tag` (or the end of the document) is reached first."""
    current_tag = start_tag.find_next()
    while current_tag and current_tag != stop_tag:
        if current_tag.name == tag_name:
            return current_tag
        current_tag = current_tag.find_next()
    return None


def extract_email_content_pkg(mailing_list, start_index, end_index):
    """
    Download the message pages listed in `mailing_list` and extract their fields.

    Args:
    - mailing_list: pandas DataFrame with a 'Link' column holding one message URL per row
    - start_index: position of the first row to process
    - end_index: position just past the last row to process (slice semantics)

    Returns:
    - result_df: pandas DataFrame with columns 'To', 'Content', 'Cc', 'Subject',
      'From', 'Date', 'Message-id', 'Reply-to', 'References', 'Link',
      'In-reply-to', 'Mail-followup-to'. Fields that are not extracted from the
      page are filled with the literal string "NaN". Pages lacking the expected
      tags are reported with a message and skipped.
    """
    records = []

    # Slice the DataFrame to the specified range of rows
    mailing_list = mailing_list.iloc[start_index:end_index]

    # Progress countdown: starts at end_index and drops by one per stored email
    step = end_index
    print(step)

    # Iterate through each row in the mailing list DataFrame
    for index, row in mailing_list.iterrows():
        # Get the URL from the 'Link' column
        url = row['Link']

        # Send a GET request to the URL and parse the HTML content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # The subject is read from the first <h1>, the date from the first <i>
        title_tag = soup.find('h1')
        date_tag = soup.find('i')
        if title_tag is None or date_tag is None:
            # Without these anchors the page cannot be parsed; skip it rather
            # than abort the whole run with an AttributeError
            print(f"H1 tag with the subject or I tag with the date not found for URL: {url}")
            continue

        # Sender name: first <b> between the title and the date
        sender_tag = _first_tag_before(title_tag, 'b', date_tag)
        if sender_tag is None:
            print(f"B tag with the title not found for URL: {url}")
            continue  # If no <b> tag found, skip this URL

        # Sender address: first <a> between the sender name and the date
        sender_email_tag = _first_tag_before(sender_tag, 'a', date_tag)
        if sender_email_tag is None:
            print(f"A tag with the email of the sender not found for URL: {url}")
            continue  # If no <a> tag found, skip this URL

        # Dictionary holding the data of the current URL
        data = {}

        # The message body is the text of the first <pre>; left missing when absent
        pre_tag = soup.find('pre')
        if pre_tag:
            data['Content'] = pre_tag.get_text(strip=True)

        data['Link'] = url
        data['Subject'] = title_tag.get_text(strip=True)
        data['Date'] = date_tag.get_text(strip=True)
        # The address text uses " at " in place of "@"; restore it
        data['From'] = sender_tag.get_text(strip=True) + " <" + sender_email_tag.get_text(strip=True).replace(" at ", "@") + ">"
        # Fields not extracted from the page: literal "NaN" placeholder strings
        data['To'] = "NaN"
        data['Cc'] = "NaN"
        data['Message-id'] = "NaN"
        data['Reply-to'] = "NaN"
        data['References'] = "NaN"
        data['In-reply-to'] = "NaN"
        data['Mail-followup-to'] = "NaN"

        # Append the dictionary to the list
        records.append(data)

        step = step - 1
        print(step)

    # Create the DataFrame with 'Content' directly in second position
    result_df = pd.DataFrame(
        records,
        columns=['To', 'Content', 'Cc', 'Subject', 'From', 'Date', 'Message-id',
                 'Reply-to', 'References', 'Link', 'In-reply-to', 'Mail-followup-to'],
    )

    return result_df


In [None]:
# Scrape one slice of email bodies (rows 0 up to, but not including, 3000) into a
# "fragment" DataFrame; the bounds are edited by hand for each run.
# The function prints a countdown line per stored email, hence the long output below.
email_content_pkg_systemd_maintainers_df_fragment = extract_email_content_pkg(filtered_pkg_systemd_maintainers_maillist_df, 0, 3000)

3000
2999
2998
2997
2996
2995
2994
2993
2992
2991
2990
2989
2988
2987
2986
2985
2984
2983
2982
2981
2980
2979
2978
2977
2976
2975
2974
2973
2972
2971
2970
2969
2968
2967
2966
2965
2964
2963
2962
2961
2960
2959
2958
2957
2956
2955
2954
2953
2952
2951
2950
2949
2948
2947
2946
2945
2944
2943
2942
2941
2940
2939
2938
2937
2936
2935
2934
2933
2932
2931
2930
2929
2928
2927
2926
2925
2924
2923
2922
2921
2920
2919
2918
2917
2916
2915
2914
2913
2912
2911
2910
2909
2908
2907
2906
2905
2904
2903
2902
2901
2900
2899
2898
2897
2896
2895
2894
2893
2892
2891
2890
2889
2888
2887
2886
2885
2884
2883
2882
2881
2880
2879
2878
2877
2876
2875
2874
2873
2872
2871
2870
2869
2868
2867
2866
2865
2864
2863
2862
2861
2860
2859
2858
2857
2856
2855
2854
2853
2852
2851
2850
2849
2848
2847
2846
2845
2844
2843
2842
2841
2840
2839
2838
2837
2836
2835
2834
2833
2832
2831
2830
2829
2828
2827
2826
2825
2824
2823
2822
2821
2820
2819
2818
2817
2816
2815
2814
2813
2812
2811
2810
2809
2808
2807
2806
2805
2804
2803
2802
2801


In [None]:
# Fold the freshly scraped fragment into the running pkg-systemd-maintainers email DataFrame.
# NOTE: this cell is not idempotent — re-running it without scraping a new
# fragment appends the same rows a second time.
try:
    if isinstance(email_content_pkg_systemd_maintainers_df, pd.DataFrame):
        # An accumulator already exists: append the new fragment to it
        email_content_pkg_systemd_maintainers_df = pd.concat(
            [email_content_pkg_systemd_maintainers_df, email_content_pkg_systemd_maintainers_df_fragment],
            ignore_index=True,
        )
    else:
        # The name is bound to something that is not a DataFrame: start over from the fragment
        email_content_pkg_systemd_maintainers_df = email_content_pkg_systemd_maintainers_df_fragment.copy()
except NameError:
    # First run on a fresh kernel: the accumulator does not exist yet, so the
    # fragment becomes its initial content (instead of being silently dropped)
    email_content_pkg_systemd_maintainers_df = email_content_pkg_systemd_maintainers_df_fragment.copy()

In [None]:
#email_content_pkg_systemd_maintainers_df = pd.read_csv(folder_path + "email_content_pkg_systemd_maintainers_3000_27326_2024_03_06_14_03_15.csv")

In [None]:
# Sanity check: number of email rows accumulated so far
len(email_content_pkg_systemd_maintainers_df)

22323

In [None]:
# Persist the accumulated pkg-systemd-maintainers email contents to Drive.
# The row range in the base name is typed by hand — keep it in sync with the
# slices that were actually scraped. The helper appends a timestamp to the name.
file_path = save_dataframe_to_csv(email_content_pkg_systemd_maintainers_df, folder_path, 'email_content_pkg_systemd_maintainers_0_27326')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/email_content_pkg_systemd_maintainers_0_27326_2024_03_06_19_17_22.csv
