In [1]:
from google.colab import drive
import os
import pandas as pd
from collections import defaultdict
import re
import matplotlib.pyplot as plt

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Wait until Google Drive is mounted
while not os.path.exists('/content/drive/My Drive/'):
    pass

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Drive folder to save the file
debian_path = '/content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/Mailing Lists/data/final/'
ubuntu_path = '/content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/ubuntu.com/data/'

# Create the folder if it doesn't exist
#os.makedirs(folder_path, exist_ok=True)

In [4]:
def extract_author_and_id(from_string):
    if isinstance(from_string, str):  # Verifica se o valor é uma string
        # Expressão regular para extrair o nome e o e-mail
        match = re.match(r'(.*) <(.*)@(.*)>', from_string)
        if match:
            name = match.group(1).strip()
            email = match.group(2).strip()
            return name, email
    return None, None

In [5]:
def create_author_id_df(df):
    # Aplicar a função extract_author_and_id na coluna 'From'
    df[['author', 'id']] = df['From'].apply(lambda x: pd.Series(extract_author_and_id(x)))
    # Selecionar apenas as colunas 'author' e 'id'
    return df[['author', 'id']]

## Data Source

In [6]:
# Define the path to the CSV file for repository
csv_files = {
    "email_content_deity_apt.csv": "apt",
    "email_content_pkg_systemd_maintainers.csv": "systemd",
    "email_content_debian_glibc.csv": "glibc",
    "email_content_debian_dpkg.csv": "dpkg"
}

# Lista para armazenar os DataFrames
dfs = []

for file_name, source in csv_files.items():
    file_path = debian_path + file_name
    df = pd.read_csv(file_path)
    df['source'] = source
    dfs.append(df)

all_ubuntu = ubuntu_path + "ubuntu-devel-discuss.csv"

# Read commit history from the CSV file
debian_df = pd.concat(dfs, ignore_index=True)
ubuntu_df = pd.read_csv(all_ubuntu)

In [7]:
debian_df.head()

Unnamed: 0.6,To,Subject,From,Content,Date,Message-id,Reply-to,Link,References,In-reply-to,Cc,Mail-followup-to,source,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0
0,Debian Bug Tracking System <submit@bugs.debian...,"Bug#1041732: ""N: Missing Signed-By in the sour...",Jörn Heissler <debbugs2023-07@wulf.eu.org>,Package: apt\nVersion: 2.7.2\nSeverity: minor\...,"Sat, 22 Jul 2023 18:58:45 +0200",<[🔎]169004512558.1067093.13380889301189885061....,"Jörn Heissler <debbugs2023-07@wulf.eu.org>,104...",https://lists.debian.org/deity/2023/deity-2023...,,,,,apt,,,,,,
1,David Kalnischkies <david@kalnischkies.de>,Bug#1041708: marked as done (apt: Manpages hav...,"""Debian Bug Tracking System"" <owner@bugs.debia...","Your message dated Sat, 22 Jul 2023 22:34:40 +...","Sat, 22 Jul 2023 20:39:05 +0000",<[🔎]handler.1041708.D1041708.16900580992998101...,1041708@bugs.debian.org,https://lists.debian.org/deity/2023/deity-2023...,<20230722203440.mdhuy2vepxalrush@crossbow> <[🔎...,,,,apt,,,,,,
2,David Kalnischkies <david@kalnischkies.de>,"Bug#1041732: marked as done (""N: Missing Signe...","""Debian Bug Tracking System"" <owner@bugs.debia...","Your message dated Sat, 22 Jul 2023 22:36:41 +...","Sat, 22 Jul 2023 20:39:07 +0000",<[🔎]handler.1041732.D1041732.16900582062998628...,1041732@bugs.debian.org,https://lists.debian.org/deity/2023/deity-2023...,<20230722203641.qrr5rauoi3dutq5b@crossbow> <[🔎...,,,,apt,,,,,,
3,Debian Bug Tracking System <submit@bugs.debian...,Bug#1041750: apt-get changelog nvidia-driver f...,Allan Wind <allan@yaxto.com>,Package: apt\nVersion: 2.6.1\nSeverity: normal...,"Sat, 22 Jul 2023 23:14:43 -0400",<[🔎]ZLybI5ytFDdQQZXw@lifeintegrity.com>,"Allan Wind <allan@yaxto.com>,1041750@bugs.debi...",https://lists.debian.org/deity/2023/deity-2023...,,,,,apt,,,,,,
4,"Allan Wind <allan@yaxto.com>,1041750@bugs.debi...",Bug#1041750: apt-get changelog nvidia-driver f...,David Kalnischkies <david@kalnischkies.de>,"On Sat, Jul 22, 2023 at 11:14:43PM -0400, Alla...","Sun, 23 Jul 2023 13:43:02 +0200",<[🔎]20230723114302.o44uszplvfx3saaf@crossbow>,"David Kalnischkies <david@kalnischkies.de>,104...",https://lists.debian.org/deity/2023/deity-2023...,<[🔎]ZLybI5ytFDdQQZXw@lifeintegrity.com> <[🔎]ZL...,<[🔎]ZLybI5ytFDdQQZXw@lifeintegrity.com>,,,apt,,,,,,


In [8]:
ubuntu_df.head()

Unnamed: 0,From,To,Date,Subject,Body
0,Matthew Kuiken <matt.kuiken@verizon.net>,ubuntu-devel-discuss@lists.ubuntu.com,"Sat, 09 Dec 2006 08:28:13 -0800",gmane request.,I have requested that this list be added to gm...
1,John Richard Moser <nigelenki@comcast.net>,,"Sat, 09 Dec 2006 15:25:50 -0500",Re: NX bit broken on 32-bit,-----BEGIN PGP SIGNED MESSAGE-----\nHash: SHA1...
2,Colin Watson <cjwatson@ubuntu.com>,"ubuntu-devel@lists.ubuntu.com, ubuntu-devel-di...","Mon, 11 Dec 2006 13:13:31 +0000",Re: howto add a new vcs to /dev/,[Redirecting to ubuntu-devel-discuss; please r...
3,Jan Claeys <lists@janc.be>,Evan Hazlett <ejhazlett@gmail.com>,"Mon, 11 Dec 2006 17:41:21 +0100",Re: Concerns,"[Sent to the new ubuntu-devel-discuss list, to..."
4,"""Ernst Persson"" <ernstp@gmail.com>",ubuntu-devel-discuss@lists.ubuntu.com,"Tue, 12 Dec 2006 09:54:16 +0100",Re: New feature: mount local file systems in G...,"(Oh, there's a new mailing list!)\nHi Martin,\..."


## Extract Author & ID

In [9]:
debian_author_id_df = create_author_id_df(debian_df)
ubuntu_author_id_df = create_author_id_df(ubuntu_df)

In [10]:
debian_author_id_df.head()

Unnamed: 0,author,id
0,Jörn Heissler,debbugs2023-07
1,"""Debian Bug Tracking System""",owner
2,"""Debian Bug Tracking System""",owner
3,Allan Wind,allan
4,David Kalnischkies,david


In [11]:
ubuntu_author_id_df.head()

Unnamed: 0,author,id
0,Matthew Kuiken,matt.kuiken
1,John Richard Moser,nigelenki
2,Colin Watson,cjwatson
3,Jan Claeys,lists
4,"""Ernst Persson""",ernstp


## Comparison by Contributor Name (Full Match)

In [12]:
# Extraindo os autores únicos de cada dataframe
authors_debian = set(debian_author_id_df['author'].unique())
authors_ubuntu = set(ubuntu_author_id_df['author'].unique())

# Encontrando os autores comuns
common_authors = authors_debian.intersection(authors_ubuntu)

# Criando o novo dataframe com os autores comuns
df_common_authors = pd.DataFrame({
    'contributor_debian': list(common_authors),
    'contributor_ubuntu': list(common_authors)
})

# Exibindo o novo dataframe
df_common_authors

Unnamed: 0,contributor_debian,contributor_ubuntu
0,Mehdi Dogguy,Mehdi Dogguy
1,John Johansen,John Johansen
2,David Paleino,David Paleino
3,Ade Malsasa Akbar,Ade Malsasa Akbar
4,Chuan-kai Lin,Chuan-kai Lin
...,...,...
210,Sam Hartman,Sam Hartman
211,Dmitry Shachnev,Dmitry Shachnev
212,Ritesh Raj Sarraf,Ritesh Raj Sarraf
213,Jon,Jon


## Comparison by Contributor Email (User ID)

In [13]:
# Extraindo os nomes únicos de cada dataframe
users_debian = set(debian_author_id_df['id'].unique())
users_ubuntu = set(ubuntu_author_id_df['id'].unique())

# Encontrando os nomes comuns
common_users = users_debian.intersection(users_ubuntu)

# Criando o novo dataframe com os nomes comuns
df_common_users = pd.DataFrame({
    'contributor_debian': list(common_users),
    'contributor_ubuntu': list(common_users)
})

# Exibindo o novo dataframe
df_common_users

Unnamed: 0,contributor_debian,contributor_ubuntu
0,giuseppe,giuseppe
1,kirkland,kirkland
2,stefan,stefan
3,steve.langasek,steve.langasek
4,jesse,jesse
...,...,...
295,psusi,psusi
296,alexandre.detiste,alexandre.detiste
297,joshua,joshua
298,alexander,alexander
