In [None]:
import os
import re
import time
from datetime import datetime

import pandas as pd
import pytz
from google.colab import drive

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Wait until Google Drive is mounted.
# Poll with a short sleep instead of a tight `pass` loop (which pins a CPU core),
# and stop after a bounded time so a failed mount surfaces as a clear error
# instead of hanging the notebook forever.
mount_deadline = time.monotonic() + 60  # seconds
while not os.path.exists('/content/drive/My Drive/'):
    if time.monotonic() > mount_deadline:
        raise TimeoutError(
            "Google Drive did not appear at '/content/drive/My Drive/' within 60 seconds"
        )
    time.sleep(0.5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder that holds the input CSVs read below and receives the CSVs this
# notebook saves. The trailing slash matters: later cells build file paths by
# plain concatenation (`folder_path + "<name>.csv"`).
folder_path = '/content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/Mailing Lists/data/final/'

# Create the folder if it doesn't exist.
# Left disabled: the listing cell further down calls os.listdir(folder_path),
# which requires the folder to exist already.
#os.makedirs(folder_path, exist_ok=True)

***Methods***

In [None]:
# Second-level labels that belong to the public suffix under a two-letter
# country TLD (e.g. "or" in debian.or.jp, "co" in example.co.uk). They are
# never the organisation name, so the label before them is used instead.
_GENERIC_SECOND_LEVEL = frozenset(
    {'ac', 'co', 'com', 'edu', 'go', 'gov', 'ne', 'net', 'or', 'org'}
)


# Function to extract the company (organisation) name from an email header value
def extract_company(email):
    """
    Derive an organisation name from the domain of the first email address in a string.

    The value may be a bare address, a "Name <user@host>" form, an address followed
    by a comment such as "user@host(Some Name)", or a comma-separated list of
    recipients; only the first address found is used.

    The organisation is the label directly before the top-level domain
    ("ftp-master.debian.org" -> "Debian"). When the host ends in a generic
    second-level label under a two-letter country code ("debian.or.jp"),
    the label before that is used instead ("Debian", not "Or").

    Args:
    - email: header value to inspect; non-string values (e.g. NaN) are accepted

    Returns:
    - the organisation label with its first letter capitalised, or None when the
      value is not a string or contains no "@host.tld" part
    """
    if not isinstance(email, str):
        return None

    # Match only host-name characters after the '@' so that display names,
    # comments or further recipients containing dots cannot leak into the domain.
    match = re.search(r'@([\w-]+(?:\.[\w-]+)+)', email)
    if not match:
        return None

    labels = match.group(1).lower().split('.')  # always at least [name, tld]
    company = labels[-2]
    if len(labels) > 2 and len(labels[-1]) == 2 and company in _GENERIC_SECOND_LEVEL:
        # e.g. ['debian', 'or', 'jp'] -> 'debian'
        company = labels[-3]
    return company.capitalize()

In [None]:
#method to save a dataset to csv into Google Drive
def save_dataframe_to_csv(dataframe, folder_path, file_name):
    """
    Save a DataFrame as a CSV file (without the index column) inside a folder.

    Args:
    - dataframe: pandas DataFrame to be saved
    - folder_path: path to the folder where the CSV file will be saved;
      the folder must already exist
    - file_name: name of the file; the ".csv" extension is appended unless the
      name already ends with it

    Returns:
    - file_path: full path to the saved CSV file
    """
    # Append the extension only when it is missing, so 'report' and 'report.csv'
    # both end up as 'report.csv' rather than 'report.csv.csv'
    if not file_name.lower().endswith('.csv'):
        file_name = f'{file_name}.csv'
    file_path = os.path.join(folder_path, file_name)

    # index=False keeps the row index out of the file, so reading it back does
    # not produce an extra 'Unnamed: 0' column
    dataframe.to_csv(file_path, index=False)

    return file_path

In [None]:
def get_top_100000(df, n=100000):
    """
    Return the n most recent rows of a DataFrame, ordered newest first.

    The 'Date' column is parsed into timezone-aware UTC datetimes; values that
    cannot be parsed become NaT and are sorted after all valid dates. The input
    frame is left untouched: the parsed column only exists in the returned frame.

    Args:
    - df: pandas DataFrame with a 'Date' column
    - n: maximum number of rows to return (default 100,000)

    Returns:
    - a new DataFrame with at most n rows, sorted by 'Date' descending
    """
    # NOTE(review): the raw data mixes date formats ("01 Jan 2003 ..." and
    # "Wed, 1 Jan 2003 ..."). On pandas >= 2.0, to_datetime infers one format and
    # coerces non-matching values to NaT unless format='mixed' is passed - confirm
    # the pandas version before relying on the parsed dates.
    parsed_dates = pd.to_datetime(df['Date'], errors='coerce', utc=True)

    # assign() builds a new frame instead of overwriting the caller's column
    newest_first = df.assign(Date=parsed_dates).sort_values(by='Date', ascending=False)

    return newest_first.head(n)

## Loading Data

In [None]:
# List files in the folder to check which input CSVs are available.
# os.listdir returns the entries in arbitrary (unsorted) order and includes
# sub-folders as well as files.
files = os.listdir(folder_path)

for file in files:
    print(file)

data info.txt
email_content_debian_gcc.csv
email_content_deity_apt.csv
email_content_pkg_systemd_maintainers.csv
email_content_debian_release.csv
email_content_debian_kernel.csv
email_content_debian_glibc.csv
email_content_debian_dpkg.csv
contributions_data_by_team.csv
oficial_debian_members_data.csv
email_content_dpkg_company_contributor.csv
email_content_glibc_company_contributor.csv
email_content_systemd_company_contributor.csv
email_content_apt_company_contributor.csv
email_content_gcc_company_contributor.csv
img_contributors_analysis
img_discussion_space_analysis


## Cleaning Data

***OFFICIAL DEBIAN MEMBERS***

In [None]:
oficial_debian_members_df = pd.read_csv(folder_path + "oficial_debian_members_data.csv")

In [None]:
oficial_debian_members_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Person        1006 non-null   object
 1   Account Name  1006 non-null   object
 2   Since         1006 non-null   object
dtypes: object(3)
memory usage: 23.7+ KB


In [None]:
oficial_debian_members_df.head(5)

Unnamed: 0,Person,Account Name,Since
0,Aaron M. Ucko,ucko,2001-05-21
1,Abhijith PA,abhijith,2018-11-20
2,Adam D. Barratt,adsb,2008-10-14
3,Adam Majer,adamm,2003-09-18
4,Adam Powell,hazelsct,2000-11-05


In [None]:
oficial_debian_members_df.describe()

Unnamed: 0,Person,Account Name,Since
count,1006,1006,1006
unique,1006,1006,371
top,Aaron M. Ucko,ucko,(unknown)
freq,1,1,102


In [None]:
missing_values = oficial_debian_members_df.isnull().sum()
missing_values

Person          0
Account Name    0
Since           0
dtype: int64

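*Dataset already clean: no column contains null values. Note that `Since` holds the placeholder string "(unknown)" for 102 members, which `isnull()` does not count as missing.*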

***CONTRIBUTORS TEAM***

In [None]:
contributions_team_df = pd.read_csv(folder_path + "contributions_data_by_team.csv")

In [None]:
contributions_team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19477 entries, 0 to 19476
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Contributor   19477 non-null  object
 1   Team          19477 non-null  object
 2   Contribution  19477 non-null  object
 3   Since         19477 non-null  object
 4   Until         19477 non-null  object
dtypes: object(5)
memory usage: 760.9+ KB


In [None]:
contributions_team_df.head(5)

Unnamed: 0,Contributor,Team,Contribution,Since,Until
0,sudip,wiki.debian.org,wiki editor\n(extra info),January 2020,November 2023
1,sudip,mentors.debian.net,mentors.d.n package commenter\n(extra info),February 2020,February 2020
2,sudip,mentors.debian.net,mentors.d.n package uploader\n(extra info),May 2020,June 2020
3,sudip,bugs.debian.org,bts correspondents\n(extra info),October 2019,January 2024
4,sudip,lists.debian.org,poster,October 2019,January 2024


In [None]:
contributions_team_df.describe()

Unnamed: 0,Contributor,Team,Contribution,Since,Until
count,19477,19477,19477,19477,19477
unique,1499,27,58,337,296
top,rhonda,bugs.debian.org,poster,November 2007,January 2024
freq,67,4581,3337,1162,1104


In [None]:
missing_values = contributions_team_df.isnull().sum()
missing_values

Contributor     0
Team            0
Contribution    0
Since           0
Until           0
dtype: int64

*Dataset already clean: no column contains null values.*

***DPKG***

In [None]:
dpkg_df = pd.read_csv(folder_path + "email_content_debian_dpkg.csv")

In [None]:
dpkg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132141 entries, 0 to 132140
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   To                132113 non-null  object
 1   Cc                63625 non-null   object
 2   Subject           132007 non-null  object
 3   Content           127923 non-null  object
 4   From              132139 non-null  object
 5   Date              132123 non-null  object
 6   Message-id        132141 non-null  object
 7   Reply-to          28713 non-null   object
 8   References        86699 non-null   object
 9   Link              132141 non-null  object
 10  In-reply-to       88400 non-null   object
 11  Mail-followup-to  23614 non-null   object
dtypes: object(12)
memory usage: 12.1+ MB


In [None]:
dpkg_df.describe()

Unnamed: 0,To,Cc,Subject,Content,From,Date,Message-id,Reply-to,References,Link,In-reply-to,Mail-followup-to
count,132113,63625,132007,127923,132139,132123,132141,28713,86699,132141,88400,23614
unique,2785,2486,7002,11280,2122,11646,11877,1617,6915,11877,6732,974
top,debian-dpkg@lists.debian.org,debian-dpkg@lists.debian.org,Re: Triggers status?,This e-mail has been sent due to an upload to ...,owner@bugs.debian.org(Debian Bug Tracking System),"Sun, 25 Apr 2004 11:33:11 -0700",<[🔎]87ipsstaht.fsf@frosties.localnet>,"Robert Millan <rmh@debian.org>,291939@bugs.deb...",<[🔎]20091017104643.GA6645@progeny.tock>,https://lists.debian.org/debian-dpkg/2011/debi...,<[🔎]20091017104643.GA6645@progeny.tock>,debian-dpkg@lists.debian.org
freq,30320,9691,1022,645,20636,125,44,302,252,44,252,4989


In [None]:
dpkg_df['From'].value_counts()

owner@bugs.debian.org(Debian Bug Tracking System)            20636
Guillem Jover <guillem@debian.org>                            9238
Raphael Hertzog <hertzog@debian.org>                          6167
Scott James Remnant <scott@netsplit.com>                      3903
Debian FTP Masters <ftpmaster@ftp-master.debian.org>          3694
                                                             ...  
"Oration T. Dumbbell" <Woodlands|woodlands@earthlink.com>        1
"John Smith" <mailtail@swissinfo.org>                            1
Matthijs Mohlmann <matthijs@cacholong.nl>                        1
"Mercedes Swanson" <gjacob@mayfield.com>                         1
Sven Rudolph <sr1@sax.sax.de>                                    1
Name: From, Length: 2122, dtype: int64

In [None]:
missing_values = dpkg_df.isnull().sum()
missing_values

To                      28
Cc                   68516
Subject                134
Content               4218
From                     2
Date                    18
Message-id               0
Reply-to            103428
References           45442
Link                     0
In-reply-to          43741
Mail-followup-to    108527
dtype: int64

Remove all rows where the content is empty, as this suggests that no significant exchange took place in them.

*They could be auto-generated emails, missent emails, or spam.*

In [None]:
# Filter rows where the Content column is null
dpkg_df[dpkg_df['Content'].isnull()]

Unnamed: 0,To,Cc,Subject,Content,From,Date,Message-id,Reply-to,References,Link,In-reply-to,Mail-followup-to
105,Private@murphy.debian.org,,Bug#115553: you're really cute,,<rachel_cruz_algk@excite.com>,"Mon, 27 Jan 2003 18:23:09 +1100",<[🔎]000000c0ea70$abc48501$72444258@ccdpbnc.cys>,"<rachel_cruz_algk@excite.com>,115553@bugs.debi...",,https://lists.debian.org/debian-dpkg/2003/debi...,,
106,Randy@murphy.debian.org,,Bug#115553: you're really cute,,<rachel_cruz_ivip@excite.com>,"Tue, 28 Jan 2003 03:18:43 +0400",<[🔎]000301c6da34$bcb75482$66802417@lxpfdws.vly>,"<rachel_cruz_ivip@excite.com>,115553@bugs.debi...",,https://lists.debian.org/debian-dpkg/2003/debi...,,
223,Private@murphy.debian.org,,Bug#115553: you're really cute,,<rachel_cruz_algk@excite.com>,"Mon, 27 Jan 2003 18:23:09 +1100",<[🔎]000000c0ea70$abc48501$72444258@ccdpbnc.cys>,"<rachel_cruz_algk@excite.com>,115553@bugs.debi...",,https://lists.debian.org/debian-dpkg/2003/debi...,,
224,Randy@murphy.debian.org,,Bug#115553: you're really cute,,<rachel_cruz_ivip@excite.com>,"Tue, 28 Jan 2003 03:18:43 +0400",<[🔎]000301c6da34$bcb75482$66802417@lxpfdws.vly>,"<rachel_cruz_ivip@excite.com>,115553@bugs.debi...",,https://lists.debian.org/debian-dpkg/2003/debi...,,
332,Private@murphy.debian.org,,Bug#115553: you're really cute,,<rachel_cruz_algk@excite.com>,"Mon, 27 Jan 2003 18:23:09 +1100",<[🔎]000000c0ea70$abc48501$72444258@ccdpbnc.cys>,"<rachel_cruz_algk@excite.com>,115553@bugs.debi...",,https://lists.debian.org/debian-dpkg/2003/debi...,,
...,...,...,...,...,...,...,...,...,...,...,...,...
131942,debian-dpkg@lists.debian.org,,Re: Debian-dpkg,,Kimberly <oleggegadov7577@gmail.com>,"Fri, 17 Nov 2023 19:25:26 +0000",<[🔎]CAN3_tsZ8GxuN16mAU2+zu+-peAqTp0mLLFgrBD6Ht...,,,https://lists.debian.org/debian-dpkg/2023/debi...,,
131948,Michael Hudson-Doyle <michael.hudson@canonical...,,Re: Architecture variants for Debian / Ubuntu,,Michael Hudson-Doyle <michael.hudson@canonical...,"Thu, 23 Nov 2023 17:25:05 +1300",<[🔎]CAJ8wqtc79H2uvwwUVjPcxXU=Lvf6AJtALvxRk8eBg...,,<CAJ8wqtcBp-afTkXT+w-3gO_Kz2QRtpJH7hL9PAeiOiY2...,https://lists.debian.org/debian-dpkg/2023/debi...,<[🔎]ZUTvbT3iml-JS9v9@thunder.hadrons.org>,
131968,Michael Hudson-Doyle <michael.hudson@canonical...,,Re: Architecture variants for Debian / Ubuntu,,Michael Hudson-Doyle <michael.hudson@canonical...,"Thu, 23 Nov 2023 17:25:05 +1300",<[🔎]CAJ8wqtc79H2uvwwUVjPcxXU=Lvf6AJtALvxRk8eBg...,,<CAJ8wqtcBp-afTkXT+w-3gO_Kz2QRtpJH7hL9PAeiOiY2...,https://lists.debian.org/debian-dpkg/2023/debi...,<[🔎]ZUTvbT3iml-JS9v9@thunder.hadrons.org>,
131985,Michael Hudson-Doyle <michael.hudson@canonical...,,Re: Architecture variants for Debian / Ubuntu,,Michael Hudson-Doyle <michael.hudson@canonical...,"Thu, 23 Nov 2023 17:25:05 +1300",<[🔎]CAJ8wqtc79H2uvwwUVjPcxXU=Lvf6AJtALvxRk8eBg...,,<CAJ8wqtcBp-afTkXT+w-3gO_Kz2QRtpJH7hL9PAeiOiY2...,https://lists.debian.org/debian-dpkg/2023/debi...,<[🔎]ZUTvbT3iml-JS9v9@thunder.hadrons.org>,


In [None]:
# Filter rows where the From column is null
dpkg_df[dpkg_df['From'].isnull()]

Unnamed: 0,To,Cc,Subject,Content,From,Date,Message-id,Reply-to,References,Link,In-reply-to,Mail-followup-to
4477,172290@bugs.debian.org,,Bug#172290: fixed in 1.10.10,dannf@krebs:~$ dpkg -c\ndpkg-deb: --contents t...,,"Sat, 3 May 2003 14:40:47 -0600",<[🔎]20030503204047.GA433@hp.com>,"dannf@hp.com,172290@bugs.debian.org",,https://lists.debian.org/debian-dpkg/2003/debi...,,
4543,172290@bugs.debian.org,,Bug#172290: fixed in 1.10.10,dannf@krebs:~$ dpkg -c\ndpkg-deb: --contents t...,,"Sat, 3 May 2003 14:40:47 -0600",<[🔎]20030503204047.GA433@hp.com>,"dannf@hp.com,172290@bugs.debian.org",,https://lists.debian.org/debian-dpkg/2003/debi...,,


In [None]:
# Remove rows where the Content column is null
dpkg_df = dpkg_df.dropna(subset=['Content'])

In [None]:
# Remove rows where the From column is null (the sender is needed later to derive 'Company')
dpkg_df = dpkg_df.dropna(subset=['From'])

In [None]:
missing_values = dpkg_df.isnull().sum()
missing_values

To                      28
Cc                   64913
Subject                 92
Content                  0
From                     0
Date                     0
Message-id               0
Reply-to            100453
References           41788
Link                     0
In-reply-to          40070
Mail-followup-to    104307
dtype: int64

In [None]:
# Keep meaningful columns, listed in the order they should appear
columns = ['Date', 'From','To', 'Subject', 'Content', 'Link']

# Select and reorder the columns in one step; every other column is dropped.
# Note: reindex does not raise on a name missing from the frame - it adds that
# column filled with NaN instead.
dpkg_df = dpkg_df.reindex(columns=columns)

In [None]:
dpkg_df.head(5)

Unnamed: 0,Date,From,To,Subject,Content,Link
0,01 Jan 2003 10:53:08 +0100,Sven Rudolph <sr1@sax.sax.de>,Thomas Dickey <dickey@herndon4.his.com>,Bug#157093: ncurses-base: xterm-mono is not us...,Thomas Dickey writes:\n> I looked at this a li...,https://lists.debian.org/debian-dpkg/2003/debi...
1,"Wed, 1 Jan 2003 07:02:01 -0500",Thomas Dickey <dickey@herndon4.his.com>,Sven Rudolph <sr1@sax.sax.de>,Bug#157093: ncurses-base: xterm-mono is not us...,"On Wed, Jan 01, 2003 at 10:53:08AM +0100, Sven...",https://lists.debian.org/debian-dpkg/2003/debi...
2,"Thu, 2 Jan 2003 00:49:18 +0000",Zefram <zefram@fysh.org>,submit@bugs.debian.org,Bug#174971: dpkg(8) should point the user towa...,Package: dpkg\nVersion: 1.10.9\nSeverity: mino...,https://lists.debian.org/debian-dpkg/2003/debi...
3,"Thu, 2 Jan 2003 01:13:10 +0000",Zefram <zefram@fysh.org>,submit@bugs.debian.org,Bug#174973: dpkg-query -W default output forma...,Package: dpkg\nVersion: 1.10.9\nTags: patch\n\...,https://lists.debian.org/debian-dpkg/2003/debi...
4,"Thu, 2 Jan 2003 01:52:16 +0000",Zefram <zefram@fysh.org>,submit@bugs.debian.org,Bug#174976: dpkg-query --showformat is poorly ...,Package: dpkg\nVersion: 1.10.9\nTags: patch\n\...,https://lists.debian.org/debian-dpkg/2003/debi...


In [None]:
# Derive the sender's organisation ('Company') from 'From' and the recipient's
# organisation ('Receiver') from 'To'. 'To' may contain nulls, so it is cast to
# str first and anything without a recognisable address ends up as 'Unknown'.
dpkg_df['Company'] = dpkg_df['From'].apply(extract_company)
dpkg_df['Receiver'] = dpkg_df['To'].astype(str).apply(extract_company).fillna('Unknown')

In [None]:
# Keep only the 100,000 most recent emails, newest first
# (this also converts 'Date' to UTC datetimes)
dpkg_df = get_top_100000(dpkg_df)

dpkg_df



Unnamed: 0,Date,From,To,Subject,Content,Link,Company,Receiver
132140,2023-12-29 15:49:07+00:00,Niels Thykier <niels@thykier.net>,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Re: Proper way to do setcap in maintscript,Niels Thykier:,https://lists.debian.org/debian-dpkg/2023/debi...,Thykier,Debian
132122,2023-12-29 15:49:07+00:00,Niels Thykier <niels@thykier.net>,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Re: Proper way to do setcap in maintscript,Niels Thykier:,https://lists.debian.org/debian-dpkg/2023/debi...,Thykier,Debian
132139,2023-12-29 15:49:07+00:00,Niels Thykier <niels@thykier.net>,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Re: Proper way to do setcap in maintscript,Niels Thykier:,https://lists.debian.org/debian-dpkg/2023/debi...,Thykier,Debian
132070,2023-12-29 15:49:07+00:00,Niels Thykier <niels@thykier.net>,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Re: Proper way to do setcap in maintscript,Niels Thykier:,https://lists.debian.org/debian-dpkg/2023/debi...,Thykier,Debian
132136,2023-12-29 15:49:07+00:00,Niels Thykier <niels@thykier.net>,Dpkg-Maintainers <debian-dpkg@lists.debian.org>,Re: Proper way to do setcap in maintscript,Niels Thykier:,https://lists.debian.org/debian-dpkg/2023/debi...,Thykier,Debian
...,...,...,...,...,...,...,...,...
26431,2004-03-23 02:16:55+00:00,"""Craig Saldana"" <TVYOHTF@linuxmail.org>","45291@bugs.debian.org,47404@bugs.debian.org,48...",Bug#47404: Victory at last,Today is a new day for your residence. With le...,https://lists.debian.org/debian-dpkg/2004/debi...,Linuxmail,Debian
28869,2004-03-23 02:16:55+00:00,"""Craig Saldana"" <TVYOHTF@linuxmail.org>","45291@bugs.debian.org,47404@bugs.debian.org,48...",Bug#47404: Victory at last,Today is a new day for your residence. With le...,https://lists.debian.org/debian-dpkg/2004/debi...,Linuxmail,Debian
29301,2004-03-23 02:16:55+00:00,"""Craig Saldana"" <TVYOHTF@linuxmail.org>","45291@bugs.debian.org,47404@bugs.debian.org,48...",Bug#47404: Victory at last,Today is a new day for your residence. With le...,https://lists.debian.org/debian-dpkg/2004/debi...,Linuxmail,Debian
29264,2004-03-22 23:34:34+00:00,Steve Greenland <steveg@lsli.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#239489: dpkg: rejecting --slave for --remo...,Package: dpkg\nVersion: 1.10.19\nSeverity: nor...,https://lists.debian.org/debian-dpkg/2004/debi...,Lsli,Debian


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(dpkg_df, folder_path, 'email_content_dpkg_company_contributor')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/final/email_content_dpkg_company_contributor.csv


***GLIBC***

In [None]:
glibc_df = pd.read_csv(folder_path + "email_content_debian_glibc.csv")

In [None]:
glibc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143825 entries, 0 to 143824
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0.5      143825 non-null  int64  
 1   Unnamed: 0.4      128825 non-null  float64
 2   Unnamed: 0.3      118825 non-null  float64
 3   Unnamed: 0.2      76825 non-null   float64
 4   Unnamed: 0.1      70000 non-null   float64
 5   Unnamed: 0        50000 non-null   float64
 6   To                143811 non-null  object 
 7   Subject           143801 non-null  object 
 8   From              143825 non-null  object 
 9   Content           136562 non-null  object 
 10  Date              143825 non-null  object 
 11  Message-id        143825 non-null  object 
 12  Reply-to          94234 non-null   object 
 13  Link              143825 non-null  object 
 14  Cc                53167 non-null   object 
 15  In-reply-to       60878 non-null   object 
 16  References        77

In [None]:
# Collect the columns with 'Unnamed' in their names ('Unnamed: 0', 'Unnamed: 0.1', ...).
# NOTE(review): despite its name, `columns_to_keep` holds the columns to DROP.
# Presumably these are row indexes written by earlier to_csv calls and re-read
# several times - confirm against the script that produced the file.
columns_to_keep = glibc_df.filter(like='Unnamed').columns

# Drop the filtered columns
glibc_df = glibc_df.drop(columns=columns_to_keep)

In [None]:
glibc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143825 entries, 0 to 143824
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   To                143811 non-null  object
 1   Subject           143801 non-null  object
 2   From              143825 non-null  object
 3   Content           136562 non-null  object
 4   Date              143825 non-null  object
 5   Message-id        143825 non-null  object
 6   Reply-to          94234 non-null   object
 7   Link              143825 non-null  object
 8   Cc                53167 non-null   object
 9   In-reply-to       60878 non-null   object
 10  References        77263 non-null   object
 11  Mail-followup-to  17252 non-null   object
dtypes: object(12)
memory usage: 13.2+ MB


In [None]:
glibc_df.describe()

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Reply-to,Link,Cc,In-reply-to,References,Mail-followup-to
count,143811,143801,143825,136562,143825,143825,94234,143825,53167,60878,77263,17252
unique,2601,7795,1619,11202,11404,12164,3431,12165,1884,4434,6150,233
top,debian-glibc@lists.debian.org,cvs commit to glibc-package/debian by gotom,owner@bugs.debian.org(Debian Bug Tracking System),debian/patches/git-updates.diff: update from u...,"Sun, 31 Dec 2017 17:57:14 +0000",<[🔎]20030131074224.GR25075@umnh.utah.edu>,debian-glibc@lists.debian.org,https://lists.debian.org/debian-glibc/2003/deb...,debian-glibc@lists.debian.org,<E19rlSN-0002La-00@auric.debian.org>,<20170826140101.22147.82848@moszumanska.debian...,debian-glibc@lists.debian.org
freq,45416,2078,15732,358,150,31,12783,31,8735,182,171,13261


In [None]:
glibc_df['From'].value_counts()

owner@bugs.debian.org(Debian Bug Tracking System)       15732
"Debian Bug Tracking System" <owner@bugs.debian.org>    13830
Debian GLibc CVS Master <bcollins@debian.org>           12762
GOTO Masanori <gotom@debian.or.jp>                       8597
Debian FTP Masters <ftpmaster@ftp-master.debian.org>     7708
                                                        ...  
"Vera Engel" <NNQWHU@alinto.com>                            1
Mikael Sennerholm <mikan@debian.org>                        1
postmaster2@spb.de                                          1
Theodore Lytras <thlytras@gmail.com>                        1
<a00885tbgg@hotmail.com>                                    1
Name: From, Length: 1619, dtype: int64

In [None]:
missing_values = glibc_df.isnull().sum()
missing_values

To                      14
Subject                 24
From                     0
Content               7263
Date                     0
Message-id               0
Reply-to             49591
Link                     0
Cc                   90658
In-reply-to          82947
References           66562
Mail-followup-to    126573
dtype: int64

In [None]:
# Filter rows where the Content column is null
glibc_df[glibc_df['Content'].isnull()]

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Reply-to,Link,Cc,In-reply-to,References,Mail-followup-to
72,libc6-dev@packages.debian.org,你的家是“绿色”的吗？,tianxin@tianxinfur.com,,"Sat, 4 Jan 2003 20:34:08 +0800",<[🔎]E18UnRr-0004ad-00@gluck.debian.org>,tianxin@tianxinfur.com,https://lists.debian.org/debian-glibc/2003/deb...,,,,
358,164766@bugs.debian.org,Bug #164766 C3 no cmov fix does not work,Scott Ashcroft <scott.ashcroft@talk21.com>,,"Sat, 25 Jan 2003 14:47:54 +0000",<[🔎]3E32A39A.40101@talk21.com>,,https://lists.debian.org/debian-glibc/2003/deb...,debian-glibc@lists.debian.org,,,
366,Confidential@murphy.debian.org,Bug#147343: I'm still thinking about you!,<htis@tesser.com>,,"Mon, 27 Jan 2003 07:48:10 +0400",<[🔎]001611e3bc85$cbd11004$88383446@keedheu.cqc>,"<htis@tesser.com>,147343@bugs.debian.org",https://lists.debian.org/debian-glibc/2003/deb...,,,,
392,Scott@murphy.debian.org,Bug#108619: you're really cute,<rachel_cruz_ijvd@excite.com>,,"Tue, 28 Jan 2003 14:00:33 -0900",<[🔎]000301e4dd44$dda17353$34485412@vhudost.lex>,"<rachel_cruz_ijvd@excite.com>,108619@bugs.debi...",https://lists.debian.org/debian-glibc/2003/deb...,,,,
393,Private@murphy.debian.org,Bug#108619: you're really cute,<rachel_cruz_cetp@excite.com>,,"Tue, 28 Jan 2003 07:57:39 -0100",<[🔎]001711a5ec81$eae21480$28122645@wjmyjqv.jhg>,"<rachel_cruz_cetp@excite.com>,108619@bugs.debi...",https://lists.debian.org/debian-glibc/2003/deb...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
143638,debian-glibc@lists.debian.org,Inspiring Muharram Saatnya Indonesia Peduli Pe...,Yatim Mandiri <dakwah@humasyatimmandiri.org>,,"Wed, 04 Oct 2017 12:49:32 +0700",<[🔎]c61ad8963ca8117f3b89fd7ebcba9e54@swift.gen...,Yatim Mandiri <dakwah@humasyatimmandiri.org>,https://lists.debian.org/debian-glibc/2017/deb...,,,,
143643,878071@bugs.debian.org,Bug#878071: (no subject),Breno Leitao <brenohl@br.ibm.com>,,"Mon, 9 Oct 2017 12:01:23 -0300",<[🔎]3041f5f7-ba08-b175-36f6-ec629155940f@br.ib...,"Breno Leitao <brenohl@br.ibm.com>,878071@bugs....",https://lists.debian.org/debian-glibc/2017/deb...,,,<[🔎]150755756157.1660.15216069467522326921.rep...,
143711,debian-glibc@lists.debian.org,Inspiring Muharram Saatnya Indonesia Peduli Pe...,Yatim Mandiri <dakwah@humasyatimmandiri.org>,,"Wed, 04 Oct 2017 12:49:32 +0700",<[🔎]c61ad8963ca8117f3b89fd7ebcba9e54@swift.gen...,Yatim Mandiri <dakwah@humasyatimmandiri.org>,https://lists.debian.org/debian-glibc/2017/deb...,,,,
143716,878071@bugs.debian.org,Bug#878071: (no subject),Breno Leitao <brenohl@br.ibm.com>,,"Mon, 9 Oct 2017 12:01:23 -0300",<[🔎]3041f5f7-ba08-b175-36f6-ec629155940f@br.ib...,"Breno Leitao <brenohl@br.ibm.com>,878071@bugs....",https://lists.debian.org/debian-glibc/2017/deb...,,,<[🔎]150755756157.1660.15216069467522326921.rep...,


In [None]:
# Filter rows where the From column is null
glibc_df[glibc_df['From'].isnull()]

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Reply-to,Link,Cc,In-reply-to,References,Mail-followup-to


In [None]:
# Remove rows where the Content column is null
glibc_df = glibc_df.dropna(subset=['Content'])

In [None]:
# Remove rows where the From column is null (the sender is needed later to derive 'Company').
# The missing-value count above shows no null 'From' in this dataset, so this
# leaves the frame unchanged here.
glibc_df = glibc_df.dropna(subset=['From'])

In [None]:
missing_values = glibc_df.isnull().sum()
missing_values

To                       0
Subject                 22
From                     0
Content                  0
Date                     0
Message-id               0
Reply-to             48217
Link                     0
Cc                   83877
In-reply-to          76420
References           60282
Mail-followup-to    119315
dtype: int64

In [None]:
# Keep meaningful columns, listed in the order they should appear
columns = ['Date', 'From','To', 'Subject', 'Content', 'Link']

# Select and reorder the columns in one step; every other column is dropped.
# Note: reindex does not raise on a name missing from the frame - it adds that
# column filled with NaN instead.
glibc_df = glibc_df.reindex(columns=columns)

In [None]:
glibc_df.head(5)

Unnamed: 0,Date,From,To,Subject,Content,Link
0,"Wed, 01 Jan 2003 05:42:08 +0600",<a00885tbgg@hotmail.com>,Swellas@murphy.debian.org,Did I say that,"Hi !..\n\nPlease, please write again, hope you...",https://lists.debian.org/debian-glibc/2003/deb...
1,"Wed, 01 Jan 2003 12:33:10 -0600",owner@bugs.debian.org(Debian Bug Tracking System),GOTO Masanori <gotom@debian.or.jp>,Processed: Re: libc6: Broken __moddi3 implemen...,Processing commands for control@bugs.debian.or...,https://lists.debian.org/debian-glibc/2003/deb...
2,"Thu, 02 Jan 2003 03:17:50 +0900",GOTO Masanori <gotom@debian.or.jp>,156473@bugs.debian.org,Bug#156473: localede_DE@eurobroken/missing,"In glibc 2.3.1-8, it seems being fixed:\n\n\tg...",https://lists.debian.org/debian-glibc/2003/deb...
3,"Thu, 02 Jan 2003 03:11:35 +0900",GOTO Masanori <gotom@debian.or.jp>,167409@bugs.debian.org,Bug#167409: XEmacs 21.4.10 crashes with glibc ...,Is this bug fixed in 2.3.1-8?\n\n-- gotom,https://lists.debian.org/debian-glibc/2003/deb...
4,"Wed, 01 Jan 2003 12:48:09 -0600",owner@bugs.debian.org(Debian Bug Tracking System),GOTO Masanori <gotom@debian.or.jp>,Processed: Re: Bug#172439: Bug#156386: patch t...,Processing commands for control@bugs.debian.or...,https://lists.debian.org/debian-glibc/2003/deb...


In [None]:
# Derive the sender's organisation ('Company') from 'From' and the recipient's
# organisation ('Receiver') from 'To'. 'To' is cast to str first, and anything
# without a recognisable address ends up as 'Unknown'.
glibc_df['Company'] = glibc_df['From'].apply(extract_company)
glibc_df['Receiver'] = glibc_df['To'].astype(str).apply(extract_company).fillna('Unknown')

In [None]:
# Keep only the 100,000 most recent emails, newest first
# (this also converts 'Date' to UTC datetimes)
glibc_df = get_top_100000(glibc_df)

glibc_df



Unnamed: 0,Date,From,To,Subject,Content,Link,Company,Receiver
74781,2023-12-30 11:36:57+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Aurelien Jarno <aurel32@debian.org>, GNU Libc ...",glibc_2.38-5_source.changes ACCEPTED into expe...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-glibc/2023/deb...,Debian,Debian
74573,2023-12-30 11:36:57+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Aurelien Jarno <aurel32@debian.org>, GNU Libc ...",glibc_2.38-5_source.changes ACCEPTED into expe...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-glibc/2023/deb...,Debian,Debian
74762,2023-12-30 11:36:57+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Aurelien Jarno <aurel32@debian.org>, GNU Libc ...",glibc_2.38-5_source.changes ACCEPTED into expe...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-glibc/2023/deb...,Debian,Debian
74647,2023-12-30 11:36:57+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Aurelien Jarno <aurel32@debian.org>, GNU Libc ...",glibc_2.38-5_source.changes ACCEPTED into expe...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-glibc/2023/deb...,Debian,Debian
74810,2023-12-30 11:36:57+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,"Aurelien Jarno <aurel32@debian.org>, GNU Libc ...",glibc_2.38-5_source.changes ACCEPTED into expe...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-glibc/2023/deb...,Debian,Debian
...,...,...,...,...,...,...,...,...
40516,2003-08-21 08:52:32+00:00,GOTO Masanori <gotom@debian.or.jp>,"debian-devel@lists.debian.org,debian-glibc@lis...",Re: Bits from the RM,"At Thu, 21 Aug 2003 00:17:27 +1000,\nAnthony T...",https://lists.debian.org/debian-glibc/2003/deb...,Or,Debian
35247,2003-08-21 08:52:32+00:00,GOTO Masanori <gotom@debian.or.jp>,"debian-devel@lists.debian.org,debian-glibc@lis...",Re: Bits from the RM,"At Thu, 21 Aug 2003 00:17:27 +1000,\nAnthony T...",https://lists.debian.org/debian-glibc/2003/deb...,Or,Debian
35717,2003-08-21 08:52:32+00:00,GOTO Masanori <gotom@debian.or.jp>,"debian-devel@lists.debian.org,debian-glibc@lis...",Re: Bits from the RM,"At Thu, 21 Aug 2003 00:17:27 +1000,\nAnthony T...",https://lists.debian.org/debian-glibc/2003/deb...,Or,Debian
33813,2003-08-21 08:52:32+00:00,GOTO Masanori <gotom@debian.or.jp>,"debian-devel@lists.debian.org,debian-glibc@lis...",Re: Bits from the RM,"At Thu, 21 Aug 2003 00:17:27 +1000,\nAnthony T...",https://lists.debian.org/debian-glibc/2003/deb...,Or,Debian


In [None]:
# Save the DataFrame to a CSV file
file_path = save_dataframe_to_csv(glibc_df, folder_path, 'email_content_glibc_company_contributor')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/final/email_content_glibc_company_contributor.csv


***SYSTEMD***

In [None]:
systemd_df = pd.read_csv(folder_path + "email_content_pkg_systemd_maintainers.csv")

In [None]:
systemd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22323 entries, 0 to 22322
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   To                0 non-null      float64
 1   Content           22300 non-null  object 
 2   Cc                0 non-null      float64
 3   Subject           22322 non-null  object 
 4   From              22323 non-null  object 
 5   Date              22323 non-null  object 
 6   Message-id        0 non-null      float64
 7   Reply-to          0 non-null      float64
 8   References        0 non-null      float64
 9   Link              22323 non-null  object 
 10  In-reply-to       0 non-null      float64
 11  Mail-followup-to  0 non-null      float64
dtypes: float64(7), object(5)
memory usage: 2.0+ MB


In [None]:
systemd_df.describe()

Unnamed: 0,To,Cc,Message-id,Reply-to,References,In-reply-to,Mail-followup-to
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,
std,,,,,,,
min,,,,,,,
25%,,,,,,,
50%,,,,,,,
75%,,,,,,,
max,,,,,,,


In [None]:
systemd_df['From'].value_counts()

Debian Bug Tracking System <owner@bugs.debian.org>      5749
Michael Biebl <biebl@debian.org>                        4037
Debian FTP Masters <ftpmaster@ftp-master.debian.org>    1426
Michael Stapelberg <stapelberg@debian.org>               350
Felipe Sateler <fsateler@debian.org>                     318
                                                        ... 
ENGIK <accpayables@kr-wontech.autos>                       1
File Received <sales@enterprise-mail1.asia>                1
Vallo Kallaste <m1na1se@posteo.ee>                         1
Sadi <sadiyumusak@gmail.com>                               1
Thomas Goirand <thomas@goirand.fr>                         1
Name: From, Length: 3275, dtype: int64

In [None]:
# Per-column tally of missing (NaN) cells in the raw systemd frame
missing_values = systemd_df.isna().sum()
missing_values

To                  22323
Content                23
Cc                  22323
Subject                 1
From                    0
Date                    0
Message-id          22323
Reply-to            22323
References          22323
Link                    0
In-reply-to         22323
Mail-followup-to    22323
dtype: int64

In [None]:
# Show the messages that have no body text (Content is NaN)
systemd_df.loc[systemd_df['Content'].isna()]

Unnamed: 0,To,Content,Cc,Subject,From,Date,Message-id,Reply-to,References,Link,In-reply-to,Mail-followup-to
1804,,,,"My dear friend,",Rustico Banku <lawyerbanku08@gmail.com>,Fri Aug 28 18:20:51 BST 2020,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
7660,,,,We offer 3% Loan respond to apply today,APPLY HERE FOR QUICK LOAN <jrios@aristas.co>,Fri Sep 2 08:19:47 BST 2016,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
7785,,,,Bug#789796: (no subject),Martin von Wittich <martin.von.wittich@iserv.eu>,Mon Sep 19 10:34:11 BST 2016,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
8864,,,,View Attached File For More Details,Dr. John Maloney <web4131@gmail.com>,Tue Apr 12 09:54:01 BST 2016,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
10389,,,,柠市重娦巳觉掎 来吧 332418点C0M邀您注冊嶺⑤8赢5⒏0提+K服...,谭萤 <370016066@qq.com>,Sun Dec 24 05:37:05 GMT 2017,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
10397,,,,rEwuTUAcUnbkJf ao门永利334378点C0Ｍ邀您...,习永蓓 <370016066@qq.com>,Tue Dec 26 05:25:44 GMT 2017,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
10808,,,,★★★★★≮澳門銀河116948點com－註冊即送38－釦釦：330081906 群號：51...,崔瑶芹 <2735158319@qq.com>,Tue Sep 5 17:00:05 BST 2017,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
10809,,,,★★★★★≮期待您的加入×澳門銀河116948點com－註冊即送38－釦釦：33008190...,习芷光 <2735158319@qq.com>,Wed Sep 6 07:24:33 BST 2017,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
10811,,,,★★★★★≤澳門新葡京116498點com精準資料＜一碼中特＞註冊即送38｜釦釦：78334...,颜瑜 <2735158319@qq.com>,Wed Sep 6 21:19:15 BST 2017,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,
10812,,,,★★★★★≤澳門銀河116948點com精準資料＜一碼中特＞註冊即送38｜釦釦：330081...,袁若翠 <2735158319@qq.com>,Thu Sep 7 10:40:32 BST 2017,,,,https://alioth-lists.debian.net/pipermail/pkg-...,,


In [None]:
# Sanity check: list any messages that have no sender (From is NaN)
systemd_df.loc[systemd_df['From'].isna()]

Unnamed: 0,To,Content,Cc,Subject,From,Date,Message-id,Reply-to,References,Link,In-reply-to,Mail-followup-to


In [None]:
# Drop the messages that have no body text (Content is NaN)
systemd_df = systemd_df.dropna(subset=['Content'])

In [None]:
# Remove rows where the From column is null
# (defensive: the missing-value counts show no null From values in this data)
systemd_df = systemd_df.dropna(subset=['From'])

In [None]:
# Re-count missing cells per column after dropping the incomplete rows
missing_values = systemd_df.isna().sum()
missing_values

To                  22300
Content                 0
Cc                  22300
Subject                 1
From                    0
Date                    0
Message-id          22300
Reply-to            22300
References          22300
Link                    0
In-reply-to         22300
Mail-followup-to    22300
dtype: int64

In [None]:
# Columns retained for the analysis, in the order they should appear
columns = ['Date', 'From','To', 'Subject', 'Content', 'Link']

# reindex(columns=...) both reorders the frame and drops every column that is
# not listed above (Cc, Message-id, Reply-to, ...)
systemd_df = systemd_df.reindex(columns=columns)

In [None]:
# Preview the first five rows of the trimmed frame
systemd_df.head(5)

Unnamed: 0,Date,From,To,Subject,Content,Link
0,Sun Mar 3 10:57:08 GMT 2019,Debian Bug Tracking System <owner@bugs.debian....,,Processed: Bug #923081 in systemd marked as pe...,Processing control commands:\n\n>tag -1 pendin...,https://alioth-lists.debian.net/pipermail/pkg-...
1,Sun Mar 3 16:20:40 GMT 2019,Santiago Vila <sanvila@debian.org>,,Bug#923674: systemd: FTBFS (failing tests),Package: src:systemd\nVersion: 240-2\nSeverity...,https://alioth-lists.debian.net/pipermail/pkg-...
2,Mon Mar 4 11:55:33 GMT 2019,Felipe Sateler <fsateler@debian.org>,,Bug#923674: systemd: FTBFS (failing tests),Control: severity -1 important.\n\nResetting s...,https://alioth-lists.debian.net/pipermail/pkg-...
3,Sat Mar 9 19:32:10 GMT 2019,Michael Biebl <biebl@debian.org>,,Bug#923674: systemd: FTBFS (failing tests),Control: tags -1 moreinfo unreproducible\n\n\n...,https://alioth-lists.debian.net/pipermail/pkg-...
4,Sat Mar 9 19:33:06 GMT 2019,Debian Bug Tracking System <owner@bugs.debian....,,Processed: Re: Bug#923674: systemd: FTBFS (fai...,Processing control commands:\n\n>tags -1 morei...,https://alioth-lists.debian.net/pipermail/pkg-...


In [None]:
# Derive the sender's organisation ('Company') from From and the recipient's
# organisation ('Receiver') from To using extract_company.
# - astype(str) turns a missing To into the string 'nan'; it contains no '@',
#   so extract_company returns None and fillna maps it to 'Unknown'. The
#   missing-value counts show To is empty for every row of this list, so
#   Receiver is 'Unknown' throughout.
# - NOTE(review): extract_company picks a label near the end of the domain, so
#   addresses under multi-part suffixes (e.g. 'debian.or.jp') come out as 'Or'
#   rather than 'Debian' -- consider fixing the helper.
systemd_df['Company'] = systemd_df['From'].apply(extract_company)
systemd_df['Receiver'] = (
    systemd_df['To']
    .astype(str)
    .apply(extract_company)
    .fillna('Unknown')
)

In [None]:
# Post-process the frame with get_top_100000 and display the result.
# NOTE(review): get_top_100000 is defined elsewhere in this notebook. The
# recorded output shows Date as UTC timestamps in descending order, so it
# presumably parses the dates and sorts newest-first -- confirm against its
# definition.
systemd_df = get_top_100000(systemd_df)

systemd_df



Unnamed: 0,Date,From,To,Subject,Content,Link,Company,Receiver
6996,2023-12-31 17:57:10+00:00,Debian Bug Tracking System <owner@bugs.debian....,,Processed: archiving 1056135,Processing commands forcontrol at bugs.debian....,https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
6995,2023-12-31 17:54:04+00:00,Debian Bug Tracking System <owner@bugs.debian....,,Processed: tagging 1058880,Processing commands forcontrol at bugs.debian....,https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
6994,2023-12-31 17:54:03+00:00,Debian Bug Tracking System <owner@bugs.debian....,,"Processed: unarchiving 1056135, fixed 1056135 ...",Processing commands forcontrol at bugs.debian....,https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
6993,2023-12-31 17:18:05+00:00,Debian Bug Tracking System <owner@bugs.debian....,,Bug#1053872: marked as done (systemd with high...,"Your message dated Sun, 31 Dec 2023 18:14:47 +...",https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
6992,2023-12-31 17:06:03+00:00,Debian Bug Tracking System <owner@bugs.debian....,,Bug#1051843: marked as done (systemd: Restart=...,"Your message dated Sun, 31 Dec 2023 18:03:15 +...",https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
...,...,...,...,...,...,...,...,...
19873,2013-04-07 19:48:03+00:00,Michael Stapelberg <stapelberg@debian.org>,,[Pkg-systemd-maintainers] Bug#704923: systemd:...,Package: sysv-rc\nVersion: 2.88dsf-34\nSeverit...,https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
19326,2013-03-29 21:16:18+00:00,Michael Biebl <biebl@debian.org>,,[Pkg-systemd-maintainers] VIO on sparc / udev ...,"Hi Jurij,\n\nafaics you are one of the SPARC p...",https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
19325,2013-03-29 10:11:21+00:00,Michael Stapelberg <stapelberg@debian.org>,,[Pkg-systemd-maintainers] Bug#704197: Please r...,Package: lintian\nVersion: 2.5.10.4\nSeverity:...,https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown
19324,2013-03-20 23:57:09+00:00,Michael Stapelberg <stapelberg@debian.org>,,[Pkg-systemd-maintainers] Bug#703571: sysvinit...,Package: sysv-rc\nVersion: 2.88dsf-22.1\nSever...,https://alioth-lists.debian.net/pipermail/pkg-...,Debian,Unknown


In [None]:
# Write systemd_df to a CSV file in the Drive folder; per its docstring,
# save_dataframe_to_csv returns the full path of the file it wrote.
file_path = save_dataframe_to_csv(systemd_df, folder_path, 'email_content_systemd_company_contributor')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/final/email_content_systemd_company_contributor.csv


***APT***

In [None]:
# Load the deity (APT) mailing-list export from the Drive folder.
# os.path.join is used instead of string concatenation so the path stays valid
# whether or not folder_path ends with a slash.
apt_df = pd.read_csv(os.path.join(folder_path, "email_content_deity_apt.csv"))

In [None]:
# Column overview: dtype and non-null count for every column
apt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   To                99935 non-null   object
 1   Subject           99941 non-null   object
 2   From              100000 non-null  object
 3   Content           92222 non-null   object
 4   Date              100000 non-null  object
 5   Message-id        100000 non-null  object
 6   Reply-to          50491 non-null   object
 7   Link              100000 non-null  object
 8   References        62354 non-null   object
 9   In-reply-to       32529 non-null   object
 10  Cc                30152 non-null   object
 11  Mail-followup-to  10043 non-null   object
dtypes: object(12)
memory usage: 9.2+ MB


In [None]:
# All columns are object dtype here, so describe() reports count / unique /
# top / freq per column.
# NOTE(review): the recorded output shows only 8,856 unique Message-id values
# across 100,000 rows, i.e. the export contains many duplicated messages --
# consider de-duplicating before any per-sender or per-company counting.
apt_df.describe()

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Reply-to,Link,References,In-reply-to,Cc,Mail-followup-to
count,99935,99941,100000,92222,100000,100000,50491,100000,62354,32529,30152,10043
unique,2499,6214,2080,8081,8826,8856,3425,8859,5218,2499,951,600
top,Julian Andres Klode <jak@debian.org>,Re: non-essential adduser poses problems to pu...,owner@bugs.debian.org(Debian Bug Tracking System),"Hello,","Sat, 26 Nov 2016 00:06:07 +0000",<[🔎]1451561693.3976.2.camel@gmail.com>,contact@ontdek.debesteleverancier.be,https://lists.debian.org/deity/2015/deity-2015...,<[🔎]20170627194055.GA8639@debian.org>,<[🔎]20170627194055.GA8639@debian.org>,deity@lists.debian.org,deity@lists.debian.org
freq,12403,337,18873,116,46,31,296,31,80,80,12807,441


In [None]:
# Number of messages per distinct From header, most frequent sender first.
# NOTE(review): counts are per raw header string, so one sender written in two
# formats (e.g. the Debian Bug Tracking System address in the recorded output)
# is counted as two separate entries.
apt_df['From'].value_counts()

owner@bugs.debian.org(Debian Bug Tracking System)       18873
"Debian Bug Tracking System" <owner@bugs.debian.org>     8922
Julian Andres Klode <jak@debian.org>                     8237
Debian FTP Masters <ftpmaster@ftp-master.debian.org>     7582
David Kalnischkies <david@kalnischkies.de>               5799
                                                        ...  
MR BALOYI <victorbaloyi17@gmail.com>                        1
matlink <matlink@matlink.fr>                                1
Min kim kang <Minkimkang01@gmail.com>                       1
Трофимов Даниил <rubin@googlemail.com>                      1
Miles & More <stadtarchiv@soest.de>                         1
Name: From, Length: 2080, dtype: int64

In [None]:
# Per-column tally of missing (NaN) cells in the raw apt frame
missing_values = apt_df.isna().sum()
missing_values

To                     65
Subject                59
From                    0
Content              7778
Date                    0
Message-id              0
Reply-to            49509
Link                    0
References          37646
In-reply-to         67471
Cc                  69848
Mail-followup-to    89957
dtype: int64

In [None]:
# Show the messages that have no body text (Content is NaN)
apt_df.loc[apt_df['Content'].isna()]

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Reply-to,Link,References,In-reply-to,Cc,Mail-followup-to
15,Brian Thompson <brian@hashvault.io>,Re: First Packaging Con 2021,Wolf Vollprecht <w.vollprecht@gmail.com>,,"Wed, 26 Jul 2023 16:54:16 +0200",<[🔎]CADDUUuCQzxfUQ1qQXgDHyybkq5zy9cpmM2bg8Lmv+...,,https://lists.debian.org/deity/2023/deity-2023...,<CADDUUuBFozHnYU_2+-WJ57u6wk_BC_jO_tActvT7q9Gd...,<2f5e9ae00394e39e9a07ad746801f4e816a203c0.came...,deity@lists.debian.org,
29,deity@lists.debian.org,Pozdravy z Londýna,Andrew Light <light012@ozkpaj.com>,,"Sun, 16 Jul 2023 19:53:20 +0000",<[🔎]AM6PR02MB4787E0DF3A03F0526E7DC050FD3AA@AM6...,,https://lists.debian.org/deity/2023/deity-2023...,,,,
46,Brian Thompson <brian@hashvault.io>,Re: First Packaging Con 2021,Wolf Vollprecht <w.vollprecht@gmail.com>,,"Wed, 26 Jul 2023 16:54:16 +0200",<[🔎]CADDUUuCQzxfUQ1qQXgDHyybkq5zy9cpmM2bg8Lmv+...,,https://lists.debian.org/deity/2023/deity-2023...,<CADDUUuBFozHnYU_2+-WJ57u6wk_BC_jO_tActvT7q9Gd...,<2f5e9ae00394e39e9a07ad746801f4e816a203c0.came...,deity@lists.debian.org,
76,Brian Thompson <brian@hashvault.io>,Re: First Packaging Con 2021,Wolf Vollprecht <w.vollprecht@gmail.com>,,"Wed, 26 Jul 2023 16:54:16 +0200",<[🔎]CADDUUuCQzxfUQ1qQXgDHyybkq5zy9cpmM2bg8Lmv+...,,https://lists.debian.org/deity/2023/deity-2023...,<CADDUUuBFozHnYU_2+-WJ57u6wk_BC_jO_tActvT7q9Gd...,<2f5e9ae00394e39e9a07ad746801f4e816a203c0.came...,deity@lists.debian.org,
102,Brian Thompson <brian@hashvault.io>,Re: First Packaging Con 2021,Wolf Vollprecht <w.vollprecht@gmail.com>,,"Wed, 26 Jul 2023 16:54:16 +0200",<[🔎]CADDUUuCQzxfUQ1qQXgDHyybkq5zy9cpmM2bg8Lmv+...,,https://lists.debian.org/deity/2023/deity-2023...,<CADDUUuBFozHnYU_2+-WJ57u6wk_BC_jO_tActvT7q9Gd...,<2f5e9ae00394e39e9a07ad746801f4e816a203c0.came...,deity@lists.debian.org,
...,...,...,...,...,...,...,...,...,...,...,...,...
99961,<apt@packages.debian.org>,Avis de paiement,"""JOSE L. MIGUEL"" <f4ewzaesezz@btconnect.com>",,"Mon, 20 Jul 2015 04:59:37 +0100",<[🔎]5752747745562@37.234.24.185.rdns.servebyte...,<joseluismail@anwaltbumail.com>,https://lists.debian.org/deity/2015/deity-2015...,,,,
99962,deity@lists.debian.org,apt-get and ntlm proxy,Юра Фролов <prostovrn@gmail.com>,,"Thu, 23 Jul 2015 11:04:29 +0300",<[🔎]CADxcNEO6bb42aXDzY+hAOL_JJYoj+dW=Mmp3Ye7fT...,,https://lists.debian.org/deity/2015/deity-2015...,,,,
99992,<deity@lists.debian.org>,Avis de paiement,"""JOSE L. MIGUEL"" <f3ewzapqptc@btconnect.com>",,"Mon, 20 Jul 2015 02:39:32 +0100",<[🔎]4224739326140@37.234.24.185.rdns.servebyte...,<joseluismail@anwaltbumail.com>,https://lists.debian.org/deity/2015/deity-2015...,,,,
99993,<apt@packages.debian.org>,Avis de paiement,"""JOSE L. MIGUEL"" <f4ewzaesezz@btconnect.com>",,"Mon, 20 Jul 2015 04:59:37 +0100",<[🔎]5752747745562@37.234.24.185.rdns.servebyte...,<joseluismail@anwaltbumail.com>,https://lists.debian.org/deity/2015/deity-2015...,,,,


In [None]:
# Sanity check: list any messages that have no sender (From is NaN)
apt_df.loc[apt_df['From'].isna()]

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Reply-to,Link,References,In-reply-to,Cc,Mail-followup-to


In [None]:
# Drop the messages that have no body text (Content is NaN)
apt_df = apt_df.dropna(subset=['Content'])

In [None]:
# Remove rows where the From column is null
# (defensive: the missing-value counts show no null From values in this data)
apt_df = apt_df.dropna(subset=['From'])

In [None]:
# Re-count missing cells per column after dropping the incomplete rows
missing_values = apt_df.isna().sum()
missing_values

To                     58
Subject                16
From                    0
Content                 0
Date                    0
Message-id              0
Reply-to            45617
Link                    0
References          31536
In-reply-to         61002
Cc                  62495
Mail-followup-to    82179
dtype: int64

In [None]:
# Columns retained for the analysis, in the order they should appear
columns = ['Date', 'From','To', 'Subject', 'Content', 'Link']

# reindex(columns=...) both reorders the frame and drops every column that is
# not listed above (Cc, Message-id, Reply-to, ...)
apt_df = apt_df.reindex(columns=columns)

In [None]:
# Preview the first five rows of the trimmed frame
apt_df.head(5)

Unnamed: 0,Date,From,To,Subject,Content,Link
0,"Sat, 22 Jul 2023 18:58:45 +0200",Jörn Heissler <debbugs2023-07@wulf.eu.org>,Debian Bug Tracking System <submit@bugs.debian...,"Bug#1041732: ""N: Missing Signed-By in the sour...",Package: apt\nVersion: 2.7.2\nSeverity: minor\...,https://lists.debian.org/deity/2023/deity-2023...
1,"Sat, 22 Jul 2023 20:39:05 +0000","""Debian Bug Tracking System"" <owner@bugs.debia...",David Kalnischkies <david@kalnischkies.de>,Bug#1041708: marked as done (apt: Manpages hav...,"Your message dated Sat, 22 Jul 2023 22:34:40 +...",https://lists.debian.org/deity/2023/deity-2023...
2,"Sat, 22 Jul 2023 20:39:07 +0000","""Debian Bug Tracking System"" <owner@bugs.debia...",David Kalnischkies <david@kalnischkies.de>,"Bug#1041732: marked as done (""N: Missing Signe...","Your message dated Sat, 22 Jul 2023 22:36:41 +...",https://lists.debian.org/deity/2023/deity-2023...
3,"Sat, 22 Jul 2023 23:14:43 -0400",Allan Wind <allan@yaxto.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#1041750: apt-get changelog nvidia-driver f...,Package: apt\nVersion: 2.6.1\nSeverity: normal...,https://lists.debian.org/deity/2023/deity-2023...
4,"Sun, 23 Jul 2023 13:43:02 +0200",David Kalnischkies <david@kalnischkies.de>,"Allan Wind <allan@yaxto.com>,1041750@bugs.debi...",Bug#1041750: apt-get changelog nvidia-driver f...,"On Sat, Jul 22, 2023 at 11:14:43PM -0400, Alla...",https://lists.debian.org/deity/2023/deity-2023...


In [None]:
# Derive the sender's organisation ('Company') from From and the recipient's
# organisation ('Receiver') from To using extract_company.
# - astype(str) turns a missing To into the string 'nan'; it contains no '@',
#   so extract_company returns None and fillna maps it to 'Unknown'.
# - extract_company uses re.search, so when To lists several recipients only
#   the first address is considered.
# - NOTE(review): extract_company picks a label near the end of the domain, so
#   addresses under multi-part suffixes (e.g. 'debian.or.jp') come out as 'Or'
#   rather than 'Debian' -- consider fixing the helper.
apt_df['Company'] = apt_df['From'].apply(extract_company)
apt_df['Receiver'] = (
    apt_df['To']
    .astype(str)
    .apply(extract_company)
    .fillna('Unknown')
)

In [None]:
# Post-process the frame with get_top_100000 and display the result.
# NOTE(review): get_top_100000 is defined elsewhere in this notebook. The
# recorded output shows Date as UTC timestamps in descending order, so it
# presumably parses the dates and sorts newest-first -- confirm against its
# definition.
apt_df = get_top_100000(apt_df)

apt_df

Unnamed: 0,Date,From,To,Subject,Content,Link,Company,Receiver
2536,2023-12-29 14:18:30+00:00,Patrice Duroux <patrice.duroux@gmail.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#1059629: apt: doing something like 'apt -t...,Package: apt\nVersion: 2.7.7\nSeverity: wishli...,https://lists.debian.org/deity/2023/deity-2023...,Gmail,Debian
2474,2023-12-29 14:18:30+00:00,Patrice Duroux <patrice.duroux@gmail.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#1059629: apt: doing something like 'apt -t...,Package: apt\nVersion: 2.7.7\nSeverity: wishli...,https://lists.debian.org/deity/2023/deity-2023...,Gmail,Debian
2949,2023-12-29 14:18:30+00:00,Patrice Duroux <patrice.duroux@gmail.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#1059629: apt: doing something like 'apt -t...,Package: apt\nVersion: 2.7.7\nSeverity: wishli...,https://lists.debian.org/deity/2023/deity-2023...,Gmail,Debian
2652,2023-12-29 14:18:30+00:00,Patrice Duroux <patrice.duroux@gmail.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#1059629: apt: doing something like 'apt -t...,Package: apt\nVersion: 2.7.7\nSeverity: wishli...,https://lists.debian.org/deity/2023/deity-2023...,Gmail,Debian
2704,2023-12-29 14:18:30+00:00,Patrice Duroux <patrice.duroux@gmail.com>,Debian Bug Tracking System <submit@bugs.debian...,Bug#1059629: apt: doing something like 'apt -t...,Package: apt\nVersion: 2.7.7\nSeverity: wishli...,https://lists.debian.org/deity/2023/deity-2023...,Gmail,Debian
...,...,...,...,...,...,...,...,...
86188,2014-09-07 12:47:03+00:00,merlin <phil-deb1.merlin@laposte.net>,Debian Bug Tracking System <submit@bugs.debian...,Bug#760739: /usr/bin/apt-get: autoremove propo...,Package: apt\nVersion: 1.0.7\nSeverity: normal...,https://lists.debian.org/deity/2014/deity-2014...,Laposte,Debian
86059,2014-09-07 12:47:03+00:00,merlin <phil-deb1.merlin@laposte.net>,Debian Bug Tracking System <submit@bugs.debian...,Bug#760739: /usr/bin/apt-get: autoremove propo...,Package: apt\nVersion: 1.0.7\nSeverity: normal...,https://lists.debian.org/deity/2014/deity-2014...,Laposte,Debian
86058,2014-09-06 14:47:24+00:00,Antonio Terceiro <terceiro@debian.org>,David Kalnischkies <david@kalnischkies.de>,Bug#755040: apt: fix tests failing on ci.debia...,"Hi,\n\nOn Sat, Sep 06, 2014 at 12:52:01PM +020...",https://lists.debian.org/deity/2014/deity-2014...,Debian,Kalnischkies
86057,2014-09-06 10:54:09+00:00,owner@bugs.debian.org(Debian Bug Tracking System),David Kalnischkies <david@kalnischkies.de>,Processed: Re: Bug#755040: apt: fix tests fail...,Processing control commands:\n\n> retitle -1 f...,https://lists.debian.org/deity/2014/deity-2014...,Debian,Kalnischkies


In [None]:
# Write apt_df to a CSV file in the Drive folder; per its docstring,
# save_dataframe_to_csv returns the full path of the file it wrote.
file_path = save_dataframe_to_csv(apt_df, folder_path, 'email_content_apt_company_contributor')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/final/email_content_apt_company_contributor.csv


***GCC***

In [None]:
# Load the debian-gcc mailing-list export from the Drive folder.
# os.path.join is used instead of string concatenation so the path stays valid
# whether or not folder_path ends with a slash.
gcc_df = pd.read_csv(os.path.join(folder_path, "email_content_debian_gcc.csv"))

In [None]:
# Column overview: dtype and non-null count for every column
gcc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   To                100000 non-null  object
 1   Subject           100000 non-null  object
 2   From              100000 non-null  object
 3   Content           98002 non-null   object
 4   Date              100000 non-null  object
 5   Message-id        100000 non-null  object
 6   Link              100000 non-null  object
 7   Reply-to          52460 non-null   object
 8   References        31722 non-null   object
 9   Cc                41955 non-null   object
 10  In-reply-to       11606 non-null   object
 11  Mail-followup-to  181 non-null     object
dtypes: object(12)
memory usage: 9.2+ MB


In [None]:
# All columns are object dtype here, so describe() reports count / unique /
# top / freq per column.
# NOTE(review): the recorded output shows only 8,033 unique Message-id values
# across 100,000 rows, i.e. the export contains many duplicated messages --
# consider de-duplicating before any per-sender or per-company counting.
gcc_df.describe()

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Link,Reply-to,References,Cc,In-reply-to,Mail-followup-to
count,100000,100000,100000,98002,100000,100000,100000,52460,31722,41955,11606,181
unique,931,6764,607,7798,7985,8033,8033,1667,2241,546,754,15
top,gcc-testresults@gcc.gnu.org,[Bug middle-end/323] optimized code gives stra...,Debian FTP Masters <ftpmaster@ftp-master.debia...,Processing control commands:\n\n> severity 975...,"Thu, 28 Jan 2021 17:16:41 +0000",<[🔎]E1qQO4t-00FLEk-QE@usper.debian.org>,https://lists.debian.org/debian-gcc/2023/debia...,Matthias Klose <doko@debian.org>,<bug-323-5724@http.gcc.gnu.org/bugzilla/>,debian-gcc@lists.debian.org,<bug-323-5724@http.gcc.gnu.org/bugzilla/>,"Matthias Klose <doko@debian.org>,\t""debian-por..."
freq,24362,331,31182,108,75,30,30,24362,331,29447,331,24


In [None]:
# Number of messages per distinct From header, most frequent sender first
gcc_df['From'].value_counts()

Debian FTP Masters <ftpmaster@ftp-master.debian.org>             31182
Matthias Klose <doko@debian.org>                                 27978
"Debian Bug Tracking System" <owner@bugs.debian.org>             17054
debian-bts-link@lists.debian.org                                  1289
Debian testing autoremoval watch <noreply@release.debian.org>     1075
                                                                 ...  
Stephen Frost <sfrost@snowman.net>                                   1
Pavel Cernohorsky <pavel.cernohorsky@appeartv.com>                   1
Michelle Hansen <michelle@updatecomercial.com>                       1
Michał Malicki  | Sklepy Internetowe <bok@platnot.sejny.pl>          1
Michael Stone <mstone@debian.org>                                    1
Name: From, Length: 607, dtype: int64

In [None]:
# Per-column tally of missing (NaN) cells in the raw gcc frame
missing_values = gcc_df.isna().sum()
missing_values

To                      0
Subject                 0
From                    0
Content              1998
Date                    0
Message-id              0
Link                    0
Reply-to            47540
References          68278
Cc                  58045
In-reply-to         88394
Mail-followup-to    99819
dtype: int64

In [None]:
# Show the messages that have no body text (Content is NaN)
gcc_df.loc[gcc_df['Content'].isna()]

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Link,Reply-to,References,Cc,In-reply-to,Mail-followup-to
8,debian-gcc@lists.debian.org,ya sabían esto en la oficina?,Recursos Humanos <rhia@industrialsm.com>,,"Mon, 17 Jul 2023 14:13:43 -0600",<[🔎]Qesnww22CyIBlzxkJMYdDVVXJ9jHDsOFKaBaZpNV56...,https://lists.debian.org/debian-gcc/2023/debia...,,,,,
210,debian-gcc@lists.debian.org,ya sabían esto en la oficina?,Recursos Humanos <rhia@industrialsm.com>,,"Mon, 17 Jul 2023 14:13:43 -0600",<[🔎]Qesnww22CyIBlzxkJMYdDVVXJ9jHDsOFKaBaZpNV56...,https://lists.debian.org/debian-gcc/2023/debia...,,,,,
411,debian-gcc@lists.debian.org,ya sabían esto en la oficina?,Recursos Humanos <rhia@industrialsm.com>,,"Mon, 17 Jul 2023 14:13:43 -0600",<[🔎]Qesnww22CyIBlzxkJMYdDVVXJ9jHDsOFKaBaZpNV56...,https://lists.debian.org/debian-gcc/2023/debia...,,,,,
604,debian-gcc@lists.debian.org,ya sabían esto en la oficina?,Recursos Humanos <rhia@industrialsm.com>,,"Mon, 17 Jul 2023 14:13:43 -0600",<[🔎]Qesnww22CyIBlzxkJMYdDVVXJ9jHDsOFKaBaZpNV56...,https://lists.debian.org/debian-gcc/2023/debia...,,,,,
782,debian-gcc@lists.debian.org,ya sabían esto en la oficina?,Recursos Humanos <rhia@industrialsm.com>,,"Mon, 17 Jul 2023 14:13:43 -0600",<[🔎]Qesnww22CyIBlzxkJMYdDVVXJ9jHDsOFKaBaZpNV56...,https://lists.debian.org/debian-gcc/2023/debia...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
99804,"""debian-gcc@lists.debian.org"" <debian-gcc@list...","Schudnij 6kg w 2dni-bez wysiku,bez wyrzeczeń",Paweł Gach-dietetyk <bok@demps.pisz.pl>,,"Fri, 19 Jul 2019 11:22:31 +0200",<[🔎]43BQf18ixFdtqs7QpOQJbgfV98zCxcO1ZeKplqSa2r...,https://lists.debian.org/debian-gcc/2019/debia...,,,,,
99860,"""debian-gcc@lists.debian.org"" <debian-gcc@list...",Szukasz administratora FanPage? Zobacz!,Kamil Mysik - nt. Twojego Facebook’a. <office@...,,"Tue, 9 Jul 2019 10:04:27 +0200",<[🔎]K5zZ0DqCZFaV5wZU8KjxGUrcsYbjnuobzreeat22I@...,https://lists.debian.org/debian-gcc/2019/debia...,,,,,
99907,debian-gcc@lists.debian.org,mitigate libffi6-3.2.1-6/9+b1,Ilari Jääskeläinen <voidwalker6667@gmail.com>,,"Thu, 18 Jul 2019 10:40:14 +0300",<[🔎]CALfL8vzXhhnJdsCq1wA=zcG-NbRZEJQCm9vyPtdbk...,https://lists.debian.org/debian-gcc/2019/debia...,,,,,
99909,"""debian-gcc@lists.debian.org"" <debian-gcc@list...","Schudnij 6kg w 2dni-bez wysiku,bez wyrzeczeń",Paweł Gach-dietetyk <bok@demps.pisz.pl>,,"Fri, 19 Jul 2019 11:22:31 +0200",<[🔎]43BQf18ixFdtqs7QpOQJbgfV98zCxcO1ZeKplqSa2r...,https://lists.debian.org/debian-gcc/2019/debia...,,,,,


In [None]:
# Sanity check: list any messages that have no sender (From is NaN)
gcc_df.loc[gcc_df['From'].isna()]

Unnamed: 0,To,Subject,From,Content,Date,Message-id,Link,Reply-to,References,Cc,In-reply-to,Mail-followup-to


In [None]:
# Drop the messages that have no body text (Content is NaN)
gcc_df = gcc_df.dropna(subset=['Content'])

In [None]:
# Remove rows where the From column is null
# (defensive: the missing-value counts show no null From values in this data)
gcc_df = gcc_df.dropna(subset=['From'])

In [None]:
# Re-count missing cells per column after dropping the incomplete rows
missing_values = gcc_df.isna().sum()
missing_values

To                      0
Subject                 0
From                    0
Content                 0
Date                    0
Message-id              0
Link                    0
Reply-to            46358
References          66704
Cc                  56151
In-reply-to         86510
Mail-followup-to    97821
dtype: int64

In [None]:
# Columns retained for the analysis, in the order they should appear
columns = ['Date', 'From','To', 'Subject', 'Content', 'Link']

# reindex(columns=...) both reorders the frame and drops every column that is
# not listed above (Cc, Message-id, Reply-to, ...)
gcc_df = gcc_df.reindex(columns=columns)

In [None]:
# Preview the first five rows of the trimmed frame
gcc_df.head(5)

Unnamed: 0,Date,From,To,Subject,Content,Link
0,"Sat, 15 Jul 2023 04:39:08 +0000",Debian testing autoremoval watch <noreply@rele...,cvise@packages.debian.org,cvise is marked for autoremoval from testing,cvise 2.8.0-1 is marked for autoremoval from t...,https://lists.debian.org/debian-gcc/2023/debia...
1,"Sat, 15 Jul 2023 07:39:03 +0000","""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Bug#1037615: marked as done (cvise: ftbfs with...,"Your message dated Sat, 15 Jul 2023 09:34:29 +...",https://lists.debian.org/debian-gcc/2023/debia...
2,"Sat, 15 Jul 2023 08:09:16 +0000",Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of gcc-12-cross-mipsen_4+c1_source....,gcc-12-cross-mipsen_4+c1_source.changes upload...,https://lists.debian.org/debian-gcc/2023/debia...
3,"Sat, 15 Jul 2023 08:51:07 +0000",Debian FTP Masters <ftpmaster@ftp-master.debia...,Debian GCC Maintainers <debian-gcc@lists.debia...,gcc-12-cross-mipsen_4+c1_source.changes ACCEPT...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-gcc/2023/debia...
4,"Sat, 15 Jul 2023 11:00:42 +0000",Debian FTP Masters <ftpmaster@ftp-master.debia...,Debian GCC Maintainers <debian-gcc@lists.debia...,gcc-11-cross-mipsen_6+c1_arm64.changes ACCEPTE...,Thank you for your contribution to Debian.\n\n...,https://lists.debian.org/debian-gcc/2023/debia...


In [None]:
# Derive the sender's organisation ('Company') from From and the recipient's
# organisation ('Receiver') from To using extract_company.
# - astype(str) guards against non-string To values; anything without an '@'
#   makes extract_company return None, which fillna maps to 'Unknown'.
# - extract_company uses re.search, so when To lists several recipients only
#   the first address is considered.
# - NOTE(review): extract_company picks a label near the end of the domain, so
#   addresses under multi-part suffixes (e.g. 'debian.or.jp') come out as 'Or'
#   rather than 'Debian' -- consider fixing the helper.
gcc_df['Company'] = gcc_df['From'].apply(extract_company)
gcc_df['Receiver'] = (
    gcc_df['To']
    .astype(str)
    .apply(extract_company)
    .fillna('Unknown')
)

In [None]:
# Post-process the frame with get_top_100000 and display the result.
# NOTE(review): get_top_100000 is defined elsewhere in this notebook. The
# recorded output shows Date as UTC timestamps in descending order, so it
# presumably parses the dates and sorts newest-first -- confirm against its
# definition.
gcc_df = get_top_100000(gcc_df)

gcc_df

Unnamed: 0,Date,From,To,Subject,Content,Link,Company,Receiver
8810,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8985,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8945,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8924,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8969,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
...,...,...,...,...,...,...,...,...
80096,2018-09-08 15:13:37+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of gcc-snapshot_20180908-1_source.c...,gcc-snapshot_20180908-1_source.changes uploade...,https://lists.debian.org/debian-gcc/2018/debia...,Debian,Debian
80025,2018-09-08 15:13:37+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of gcc-snapshot_20180908-1_source.c...,gcc-snapshot_20180908-1_source.changes uploade...,https://lists.debian.org/debian-gcc/2018/debia...,Debian,Debian
80024,2018-09-08 13:36:06+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Bertrand Marc <bmarc@debian.org>,Processed: your mail,Processing commands for control@bugs.debian.or...,https://lists.debian.org/debian-gcc/2018/debia...,Debian,Debian
80095,2018-09-08 13:36:06+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Bertrand Marc <bmarc@debian.org>,Processed: your mail,Processing commands for control@bugs.debian.or...,https://lists.debian.org/debian-gcc/2018/debia...,Debian,Debian


In [None]:
# Look at the 100 newest rows; the recorded display is truncated by pandas.
# NOTE(review): the visible rows repeat the same message several times, which
# again points at duplicated messages in the export.
gcc_df.head(100)

Unnamed: 0,Date,From,To,Subject,Content,Link,Company,Receiver
8810,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8985,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8945,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8924,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8969,2023-12-31 09:36:04+00:00,"""Debian Bug Tracking System"" <owner@bugs.debia...",Matthias Klose <doko@debian.org>,Processed: Re: gcc-13: Please build gcc with -...,Processing control commands:\n\n> tags -1 + mo...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
...,...,...,...,...,...,...,...,...
8974,2023-12-30 07:25:16+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of python-pebble_5.0.6-1_source.cha...,python-pebble_5.0.6-1_source.changes uploaded ...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8895,2023-12-30 07:25:16+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of python-pebble_5.0.6-1_source.cha...,python-pebble_5.0.6-1_source.changes uploaded ...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8941,2023-12-30 07:25:16+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of python-pebble_5.0.6-1_source.cha...,python-pebble_5.0.6-1_source.changes uploaded ...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian
8481,2023-12-30 07:25:16+00:00,Debian FTP Masters <ftpmaster@ftp-master.debia...,debian-gcc@lists.debian.org,Processing of python-pebble_5.0.6-1_source.cha...,python-pebble_5.0.6-1_source.changes uploaded ...,https://lists.debian.org/debian-gcc/2023/debia...,Debian,Debian


In [None]:
# Write gcc_df to a CSV file in the Drive folder; per its docstring,
# save_dataframe_to_csv returns the full path of the file it wrote.
file_path = save_dataframe_to_csv(gcc_df, folder_path, 'email_content_gcc_company_contributor')
print("File saved to:", file_path)

File saved to: /content/drive/My Drive/IMT Atlantique/Semester 2/Project Complex/Coding/data/final/email_content_gcc_company_contributor.csv
