In [None]:
import os
import time
from datetime import datetime

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from google.colab import drive

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Wait until the Drive folder is visible. Sleep between checks instead of
# spinning the CPU, and give up after a bounded time instead of looping forever
# if the mount never appears.
mount_deadline = time.monotonic() + 60
while not os.path.exists('/content/drive/My Drive/'):
    if time.monotonic() > mount_deadline:
        raise TimeoutError("Google Drive did not appear at '/content/drive/My Drive/' within 60 seconds")
    time.sleep(0.5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder to save the file.
# Every CSV produced by this notebook is written into this folder.
folder_path = '/content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/Mailing Lists/data/'

# Create the folder if it doesn't exist.
# NOTE(review): the call below is disabled, so the folder is presumably expected
# to exist already — confirm before running on a fresh Drive.
#os.makedirs(folder_path, exist_ok=True)

In [None]:
# Helper to save a DataFrame as a timestamped CSV file (e.g. into Google Drive)
def save_dataframe_to_csv(dataframe, folder_path, file_name, tz='Europe/Paris'):
    """
    Save a DataFrame to a CSV file with a timestamped file name.

    The file is written as ``<file_name>_<YYYY_MM_DD_HH_MM_SS>.csv`` inside
    ``folder_path``, without the DataFrame index. The folder is not created
    by this function.

    Args:
    - dataframe: pandas DataFrame to be saved
    - folder_path: path to the folder where the CSV file will be saved
    - file_name: base name of the file, without timestamp or extension
    - tz: time zone used for the timestamp; either a zone name resolved
      through pytz (default 'Europe/Paris') or a ready-made tzinfo object

    Returns:
    - file_path: full path to the saved CSV file
    """
    # Zone names are resolved through pytz; tzinfo objects are used as given
    zone = pytz.timezone(tz) if isinstance(tz, str) else tz

    # Take the current time directly in the target zone, down to the second,
    # so that successive exports get distinct file names
    timestamp = datetime.now(zone).strftime('%Y_%m_%d_%H_%M_%S')

    # Build the timestamped file name and its full path
    file_path = os.path.join(folder_path, f'{file_name}_{timestamp}.csv')

    # Export the data to a CSV file with the generated file name
    dataframe.to_csv(file_path, index=False)

    return file_path

#LIST ALL CONTRIBUTORS

In [None]:
# Fetch the contributors listing page.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get('https://contributors.debian.org/', timeout=30)
# Stop here on an HTTP error rather than parsing an error page as if it were data
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Find the table containing the contributor data
table = soup.find('table')
# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in a later cell
if table is None:
    raise ValueError("No <table> element found on the contributors page")

In [None]:
# Extract data from the table (the first row is skipped as the header)
contributor_data = []
for row in table.find_all('tr')[1:]:
    cols = row.find_all('td')

    # Find the link in the first column
    link_tag = cols[0].find('a') if cols else None

    # Skip rows that are not contributor entries: too few cells, or no profile
    # link to derive the account id from
    if len(cols) < 3 or link_tag is None or not link_tag.get('href'):
        continue

    person = cols[0].text.strip()
    since = cols[1].text.strip()
    until = cols[2].text.strip()

    # Reduce the href to the bare account id by dropping the path prefix and slashes
    link = link_tag['href']
    clean_link = link.replace("/contributor/", "").replace("/", "")

    contributor_data.append([person, since, until, clean_link])

In [None]:
# Build the contributors DataFrame from the scraped rows
contributor_df = pd.DataFrame(
    data=contributor_data,
    columns=['Contributor', 'Since', 'Until', 'Account Id'],
)

In [None]:
# Write the contributors table to a timestamped CSV and report where it went
file_path = save_dataframe_to_csv(
    dataframe=contributor_df,
    folder_path=folder_path,
    file_name='contributors_data_with_links',
)
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/contributors_data_with_links_2024_02_06_23_42_14.csv


In [None]:
# Display the contributors DataFrame as this cell's output
contributor_df

Unnamed: 0,Contributor,Since,Until,Account Id
0,sudip,October 2019,January 2024,sudip
1,abhijith,June 2015,February 2024,abhijith
2,sur5r,July 2007,February 2024,sur5r
3,georg,July 2011,January 2024,georg
4,Wookey,December 1999,February 2024,wookey
...,...,...,...,...
1494,suman rajan,June 2017,September 2023,sumanrajan-guest@alioth
1495,tomás zerolo,July 2008,February 2024,tzerolo-guest@alioth
1496,victory .deb,October 2009,January 2024,victory-guest@alioth
1497,xiao sheng wen(肖盛文),June 2005,February 2024,atzlinux


#LIST OF OFFICIAL CONTRIBUTORS OF THE DEBIAN PROJECT

In [None]:
# Fetch the Debian members listing page.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get('https://nm.debian.org/members/', timeout=30)
# Stop here on an HTTP error rather than parsing an error page as if it were data
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Find the table containing the member data
table = soup.find('table')
# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in a later cell
if table is None:
    raise ValueError("No <table> element found on the members page")

In [None]:
# Extract data from the table (the first row is skipped as the header)
oficial_debian_members_data = []
for row in table.find_all('tr')[1:]:
    cols = row.find_all('td')

    # Skip rows that do not carry the three expected cells instead of
    # failing with an IndexError
    if len(cols) < 3:
        continue

    person = cols[0].text.strip()
    account_id = cols[1].text.strip()
    since = cols[2].text.strip()

    oficial_debian_members_data.append([person, account_id, since])

In [None]:
# Build the members DataFrame from the scraped rows
oficial_debian_members_df = pd.DataFrame(
    data=oficial_debian_members_data,
    columns=['Person', 'Account Name', 'Since'],
)

In [None]:
# Write the members table to a timestamped CSV and report where it went
file_path = save_dataframe_to_csv(
    dataframe=oficial_debian_members_df,
    folder_path=folder_path,
    file_name='oficial_debian_members_data',
)
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/oficial_debian_members_data_2024_02_06_22_54_55.csv


#LIST CONTRIBUTIONS AND TEAMS FOR EACH CONTRIBUTOR

In [None]:
# Collect one record per contribution row found on each contributor's page
contributions_data = []

# Reuse a single HTTP session for the whole crawl rather than opening a fresh
# connection for every contributor page
with requests.Session() as session:
    # Visit the detail page of every account listed in contributor_df
    for account_id in contributor_df['Account Id']:

        # Fetch data from the contributor's link; the timeout keeps one stalled
        # request from blocking the whole crawl
        response = session.get(
            'https://contributors.debian.org/contributor/' + account_id + '/',
            timeout=30,
        )
        if not response.ok:
            # Keep the crawl going when a single page cannot be retrieved
            print(f"Skipping {account_id}: HTTP {response.status_code}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        # The contributor's name is the <i> tag inside the <p class="lead"> paragraph
        lead_paragraph = soup.find('p', class_='lead')
        italic_tag = lead_paragraph.find('i') if lead_paragraph is not None else None

        # Find the table containing the contributor's data
        table = soup.find('table')

        # Pages without the expected name or table are skipped instead of
        # aborting the run with an AttributeError
        if italic_tag is None or table is None:
            print(f"Skipping {account_id}: unexpected page layout")
            continue

        # Name as shown on the contributor's own page
        person = italic_tag.text

        # Extract data from the table (the first row is skipped as the header).
        # The loop variable is named apart from the outer loop's to avoid shadowing.
        for detail_row in table.find_all('tr')[1:]:
            cols = detail_row.find_all('td')

            # Columns 1-4 are read below, so a row needs at least five cells
            if len(cols) < 5:
                continue

            team = cols[1].text.strip()
            contribution = cols[2].text.strip()
            since = cols[3].text.strip()
            until = cols[4].text.strip()

            # Append the data to the contributions_data list
            contributions_data.append([person, team, contribution, since, until])

In [None]:
# Build the per-team contributions DataFrame from the collected rows
contributor_details_df = pd.DataFrame(
    data=contributions_data,
    columns=['Contributor', 'Team', 'Contribution', 'Since', 'Until'],
)

In [None]:
# Write the contributions table to a timestamped CSV and report where it went
file_path = save_dataframe_to_csv(
    dataframe=contributor_details_df,
    folder_path=folder_path,
    file_name='contributions_data_by_team',
)
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/contributions_data_by_team_2024_02_07_00_00_31.csv


In [None]:
# Display the per-team contributions DataFrame as this cell's output
contributor_details_df

Unnamed: 0,Contributor,Team,Contribution,Since,Until
0,sudip,wiki.debian.org,wiki editor\n(extra info),January 2020,November 2023
1,sudip,mentors.debian.net,mentors.d.n package commenter\n(extra info),February 2020,February 2020
2,sudip,mentors.debian.net,mentors.d.n package uploader\n(extra info),May 2020,June 2020
3,sudip,bugs.debian.org,bts correspondents\n(extra info),October 2019,January 2024
4,sudip,lists.debian.org,poster,October 2019,January 2024
...,...,...,...,...,...
19472,atzlinux,lists.debian.org,poster,October 2017,December 2020
19473,atzlinux,bugs.debian.org,bts correspondents\n(extra info),September 2019,December 2021
19474,znoteer-guest@alioth,lists.debian.org,poster,December 2018,December 2023
19475,znoteer-guest@alioth,wiki.debian.org,wiki editor\n(extra info),April 2017,June 2023
