In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd
import torch

from utils import preprocess_text, labelnum

In [3]:
# Root folder: the raw fine-tuning CSV/XLS files are read from here and the
# processed splits are written to its `processed/` sub-folder.
DATA_PATH = Path('./data/')
# Presumably the folder holding the sentiment model (TODO confirm); it is not
# referenced by any of the cells visible in this part of the notebook.
SENTIMENT_MODEL_PATH = Path('./models/sentiment/')

# Fine-tuning

In [29]:
# Load the GitHub dataset (the file is semicolon-delimited) and preview it.
github_gold_csv = DATA_PATH / 'github_gold.csv'
gh = pd.read_csv(github_gold_csv, sep=';')
gh.head()

In [30]:
# Keep only the two columns needed for fine-tuning and give them the shared
# `text` / `label` names in a single chained expression.
gh = gh[['Text', 'Polarity']].rename(
    columns={"Text": "text", 'Polarity': 'label'}
)

gh.head()

In [31]:
# Load the JIRA dataset and preview it.
jira_csv = DATA_PATH / 'JIRA.csv'
jira = pd.read_csv(jira_csv)
jira.head()

In [32]:
# Numeric oracle codes -> label strings shared by all fine-tuning datasets.
jira_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
}

# Select, rename and relabel in one chain. The previous version assigned
# `jira['label'] = ...` on a frame obtained by column selection, which is the
# pattern that makes pandas emit SettingWithCopyWarning; building a new frame
# with `rename` + `assign` yields the same result without mutating a
# derived frame.
jira = (
    jira[['sentence', 'oracle']]
    .rename(columns={"sentence": "text", 'oracle': 'label'})
    .assign(label=lambda frame: frame['label'].replace(jira_labels))
)

jira.head()

In [33]:
# Load the Stack Overflow dataset and preview it.
stackoverflow_csv = DATA_PATH / 'StackOverflow.csv'
so = pd.read_csv(stackoverflow_csv)
so.head()

In [34]:
# Numeric oracle codes -> label strings shared by all fine-tuning datasets.
so_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
}

# Select, rename and relabel in one chain. Assigning `so['label'] = ...` on a
# frame obtained by column selection is the pattern that triggers pandas'
# SettingWithCopyWarning; `rename` + `assign` builds the same frame without
# mutating a derived object.
so = (
    so[['text', 'oracle']]
    .rename(columns={'oracle': 'label'})
    .assign(label=lambda frame: frame['label'].replace(so_labels))
)

so.head()

In [35]:
# Load the second Stack Overflow dataset (NewData.csv) and preview it.
new_data_csv = DATA_PATH / 'NewData.csv'
so2 = pd.read_csv(new_data_csv)
so2.head()

In [36]:
# Select, rename and lower-case the labels in one chain. Assigning
# `so2['label'] = ...` on a frame obtained by column selection is the pattern
# that triggers pandas' SettingWithCopyWarning. The vectorised `.str.lower()`
# replaces `apply(str.lower)`; it leaves missing labels as NaN instead of
# raising TypeError on them.
so2 = (
    so2[['text', 'oracle']]
    .rename(columns={'oracle': 'label'})
    .assign(label=lambda frame: frame['label'].str.lower())
)

so2.head()

In [37]:
# Load the API-review benchmark spreadsheet (legacy .xls) and preview it.
api_benchmark_xls = DATA_PATH / 'BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'
api = pd.read_excel(api_benchmark_xls)
api.head()

In [38]:
# Single-letter manual labels -> label strings shared by all datasets.
api_labels = {
    'n': 'negative',
    'o': 'neutral',
    'p': 'positive',
}

# Select, rename and relabel in one chain. Assigning `api['label'] = ...` on a
# frame obtained by column selection is the pattern that triggers pandas'
# SettingWithCopyWarning; `rename` + `assign` builds the same frame without
# mutating a derived object.
api = (
    api[['sent', 'ManualLabel']]
    .rename(columns={"sent": "text", 'ManualLabel': 'label'})
    .assign(label=lambda frame: frame['label'].replace(api_labels))
)

api.head()

In [41]:
# Stack the five harmonised datasets (each with `text` / `label` columns)
# into one fine-tuning frame with a fresh 0..n-1 index.
finetuning = pd.concat([gh, jira, so, so2, api], ignore_index=True)

# `preprocess_text` is already imported from `utils` in the notebook's first
# cell, so the import is not repeated in the middle of this cell.
finetuning['text'] = finetuning['text'].apply(preprocess_text)
# Vectorised lower-casing so that all sources use the same label spelling;
# unlike `apply(str.lower)` it leaves missing labels as NaN rather than
# raising TypeError.
finetuning['label'] = finetuning['label'].str.lower()

# Class balance of the combined dataset.
finetuning['label'].value_counts()

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the combined data as the test split; the fixed seed keeps
# the split reproducible across runs.
finetuning_train, finetuning_test = train_test_split(
    finetuning,
    test_size=0.1,
    random_state=42,
)

# Persist both splits without the pandas index column.
train_csv = DATA_PATH / "processed/finetuning_train.csv"
test_csv = DATA_PATH / "processed/finetuning_test.csv"
finetuning_train.to_csv(train_csv, index=False)
finetuning_test.to_csv(test_csv, index=False)

print(f"Train: \t{len(finetuning_train)}")
print(f"Test: \t{len(finetuning_test)}")

# Evaluation

In [8]:
# Re-load the held-out split from disk so this section does not depend on the
# fine-tuning cells having been run in the same kernel session.
finetuning_test = pd.read_csv(DATA_PATH / "processed/finetuning_test.csv")

evaluation_csv = DATA_PATH / "processed/evaluation.csv"
evaluation_text_csv = DATA_PATH / "processed/evaluation_only_text.csv"

# Full copy (text + label) of the evaluation set.
finetuning_test.to_csv(evaluation_csv, index=False)

# Text-only copy, written without header or index.
finetuning_test_only_text = finetuning_test.loc[:, ['text']]
finetuning_test_only_text.to_csv(evaluation_text_csv, header=False, index=False)

finetuning_test_only_text.head()

In [10]:
# Manual step: feed processed/evaluation_only_text.csv to SentiStrength-SE and save its tab-separated output as processed/evaluation_ssse_preds.csv (both under DATA_PATH).

In [13]:
# Read the header-less, tab-separated SentiStrength-SE output and name its
# two columns.
ssse_preds_csv = DATA_PATH / "processed/evaluation_ssse_preds.csv"
ssse_preds = pd.read_csv(ssse_preds_csv, sep='\t', header=None)
ssse_preds.columns = ['text', 'label']

ssse_preds.head()

In [17]:
# The raw label cell holds several whitespace-separated values; keep only the
# last one and turn it into an integer.
last_score = ssse_preds['label'].str.split().str[-1]
ssse_preds['label'] = last_score.astype('int64')

# Integer polarity -> label string, matching the fine-tuning label names.
ssse_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
}
ssse_preds['label'] = ssse_preds['label'].replace(ssse_labels)

ssse_preds.head()

In [18]:
# Distribution of the predicted labels.
predicted_labels = ssse_preds['label']
predicted_labels.value_counts()

In [20]:
# NOTE(review): this overwrites the same file the raw SentiStrength-SE output
# is read from (tab-separated, no header) with a comma-separated file that
# has a header, so the loading cell cannot be re-run after this one.
ssse_preds.to_csv(
    DATA_PATH / "processed/evaluation_ssse_preds.csv",
    sep=',',
    header=True,
    index=False,
)

# 20-MAD

In [2]:
# NOTE(review): this rebinds DATA_PATH, which the fine-tuning and evaluation
# cells earlier in the notebook use with the value './data/'. Re-running those
# cells after this one makes them look for their files under './data/20-MAD/'.
DATA_PATH = Path('./data/20-MAD/')
# Destination folder for the processed 20-MAD CSV exports.
OUTPUT_PATH = Path('./data/processed/20-MAD/')

### Commits

In [5]:
# Read the whole commits table into a pandas frame and preview it.
commits_parquet_path = DATA_PATH / 'commits.parquet'
commits = pq.ParquetFile(commits_parquet_path).read().to_pandas()
commits.head()

Unnamed: 0,source,repo,hash,parents,author,author_time,author_tz,committer,commit_time,commit_tz,message,added,removed,from_svn,accurate_tz,issue_id
0,apache,accumulo-bsp,02eebbd572b5a3308cb17910c1ad5a368fe3bb5c,3581adc7c5de0dc223eea3604fd6c4327801dd39,58d05bc1bf093c031c0caead30d37b44c48bdaf1,2013-01-15 23:58:38+00:00,0,58d05bc1bf093c031c0caead30d37b44c48bdaf1,2013-01-15 23:58:38+00:00,0,ACCUMULO-532 Update contrib to reflect changes...,329.0,215.0,True,False,12550786.0
1,apache,accumulo-bsp,28339e68f0c666a99f2194b0b725951f7c5a75b4,02eebbd572b5a3308cb17910c1ad5a368fe3bb5c,58d05bc1bf093c031c0caead30d37b44c48bdaf1,2013-01-22 18:08:30+00:00,0,58d05bc1bf093c031c0caead30d37b44c48bdaf1,2013-01-22 18:08:30+00:00,0,ACCUMULO-769 Fix bsp contrib build due to mapr...,8.0,4.0,True,False,12608497.0
2,apache,accumulo-bsp,3581adc7c5de0dc223eea3604fd6c4327801dd39,a4df0d433396a227e8a119c141283e6f0b32c575,83273207ef443072fc57aa858fff07fd5697a4a8,2012-05-17 17:24:30+00:00,0,83273207ef443072fc57aa858fff07fd5697a4a8,2012-05-17 17:24:30+00:00,0,ACCUMULO-532 added IOException to constructor ...,1.0,1.0,True,False,12550786.0
3,apache,accumulo-bsp,817f8732b84fe4bda588492c5bb54f9ea958c74b,e69d32cde486aa5d7a0766b032880e9e13d991b5,83273207ef443072fc57aa858fff07fd5697a4a8,2012-05-16 14:51:58+00:00,0,83273207ef443072fc57aa858fff07fd5697a4a8,2012-05-16 14:51:58+00:00,0,ACCUMULO-593 restructured contrib\n\ngit-svn-i...,3.0,6.0,True,False,12556041.0
4,apache,accumulo-bsp,a4df0d433396a227e8a119c141283e6f0b32c575,fe3ff84ecb2a6fc69f0e05ced668013636c8c591,83273207ef443072fc57aa858fff07fd5697a4a8,2012-05-16 17:42:07+00:00,0,83273207ef443072fc57aa858fff07fd5697a4a8,2012-05-16 17:42:07+00:00,0,ACCUMULO-593 added license header to poms\n\ng...,16.0,0.0,True,False,12556041.0


In [9]:
# Number of commit rows before cleaning.
commits.shape[0]

3439001

In [11]:
# Rows lacking a message, committer or repo are dropped.
text_column = 'message'
non_nullable_columns = [text_column, 'committer', 'repo']
commits = commits.dropna(subset=non_nullable_columns)

# Row count after dropping incomplete rows.
commits.shape[0]

3438700

In [14]:
# Run the shared text preprocessing over every commit message.
cleaned_messages = commits[text_column].apply(preprocess_text)
commits[text_column] = cleaned_messages

In [13]:
from datetime import timedelta  # no longer used here; kept in case other cells rely on the name


def _utc_offset(tz_values):
    """Convert signed ``HHMM`` UTC offsets (e.g. ``-700``, ``530``) to Timedeltas.

    The sample rows of this dataset show ``-700`` shifting a timestamp by
    exactly seven hours, i.e. the offsets look like git-style ``±HHMM``
    values. The last two digits are therefore minutes, not hundredths of an
    hour: dividing by 100 turns ``530`` into 5.3 h (5 h 18 min) instead of
    5 h 30 min.

    Args:
        tz_values: Series of offsets, numeric or numeric strings.

    Returns:
        Series of Timedelta offsets; missing inputs become ``NaT``.
    """
    tz_numeric = pd.to_numeric(tz_values)
    magnitude = tz_numeric.abs()
    minutes = (magnitude // 100) * 60 + magnitude % 100
    # Re-apply the sign after splitting, so that negative offsets are not
    # distorted by floor division / modulo on negative numbers.
    signed_minutes = minutes.where(tz_numeric >= 0, -minutes)
    return pd.to_timedelta(signed_minutes, unit='m')


def _part_of_day(hours):
    """Bucket hours 0-23 into a part-of-day label.

    Buckets are half-open: [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon,
    [18, 23) Evening, [23, 24) Night. "Night" is used twice, hence
    ``ordered=False``.
    """
    return pd.cut(
        hours,
        bins=[0, 6, 12, 18, 23, 24],
        labels=["Night", "Morning", "Afternoon", "Evening", "Night"],
        right=False,
        ordered=False,
    )


commit_times = pd.to_datetime(commits["commit_time"], utc=True)
author_times = pd.to_datetime(commits["author_time"], utc=True)

# Shift the UTC timestamps by the recorded offset to get wall-clock time.
# The resulting columns are still tagged as UTC; only their clock values are
# local.
commits["local_commit_time"] = commit_times + _utc_offset(commits["commit_tz"])
commits["local_author_time"] = author_times + _utc_offset(commits["author_tz"])

commits["part_of_day_commit"] = _part_of_day(commits["local_commit_time"].dt.hour)
commits["part_of_day_author"] = _part_of_day(commits["local_author_time"].dt.hour)

In [15]:
# Spot-check five random rows, including the derived time columns.
commits.sample(n=5)

Unnamed: 0,source,repo,hash,parents,author,author_time,author_tz,committer,commit_time,commit_tz,message,added,removed,from_svn,accurate_tz,issue_id,local_commit_time,local_author_time,part_of_day_commit,part_of_day_author
2989132,mozilla,gecko-dev,2f9787183c49c171f001609bc9dff99b9e197b03,347e9ff5d2a022670fb621f6a2143c816637d44b,94a1f0c67386c2304aa348ff8a29e520ff99707f,2015-07-13 21:14:04+00:00,-700,94a1f0c67386c2304aa348ff8a29e520ff99707f,2015-07-15 21:30:05+00:00,-700,Bug 1128454 - Fix IPDL test (r=jimm),1.0,1.0,False,True,1128454.0,2015-07-15 14:30:05+00:00,2015-07-13 14:14:04+00:00,Afternoon,Afternoon
2318395,apache,synapse,afe6424e76068ce41e7b80821f8eb561b5a8abe4,f5c8c56d44b26c3c8a8243bcc30033f4228ff98e,2e82046247a7b04081c95dab7240e4a838d8de90,2010-10-28 09:40:42+00:00,0,2e82046247a7b04081c95dab7240e4a838d8de90,2010-10-28 09:40:42+00:00,0,fixing an issue with intialization git-svn-id:...,2.0,0.0,True,False,,2010-10-28 09:40:42+00:00,2010-10-28 09:40:42+00:00,Morning,Morning
1696394,apache,lucene-solr,cef31d62411b6377fdb517ad2d8030c629a1da18,90809f205d302b02d1487c2164d3ee6ab81ff17c,4c82fa3a9289b945723cfe9a9a51ae3e326f97ea,2017-07-26 22:42:17+00:00,-400,4c82fa3a9289b945723cfe9a9a51ae3e326f97ea,2017-07-26 22:42:41+00:00,-400,SOLR-10760: fix another Solrj test,3.0,0.0,False,True,13075295.0,2017-07-26 18:42:41+00:00,2017-07-26 18:42:17+00:00,Evening,Evening
3359294,mozilla,gecko-dev,055a04e54c9bc692c8f256fe1bb2ea59303ba1d8,b0c4e7acce325fc89094e7fd2064e34dd99dd581,89872faee8dde87c44865ce87f7cde84e7f724bd,2001-10-10 14:19:22+00:00,0,89872faee8dde87c44865ce87f7cde84e7f724bd,2001-10-10 14:19:22+00:00,0,unregister event handler when menu goes away. ...,14.0,7.0,False,False,,2001-10-10 14:19:22+00:00,2001-10-10 14:19:22+00:00,Afternoon,Afternoon
661399,apache,cordova-registry-web,261263c02ed7fcec4eb1611f99d15ab7f5381f15,4a27d4f9a51a97b2fcec559b344cfdf04da3e55d,e63a5ba5a1be8408f96ae5b35c09dbf376cad929,2014-10-09 18:54:01+00:00,-500,e63a5ba5a1be8408f96ae5b35c09dbf376cad929,2014-10-09 18:54:01+00:00,-500,updates url as you type,7.0,0.0,False,True,,2014-10-09 13:54:01+00:00,2014-10-09 13:54:01+00:00,Afternoon,Afternoon


In [17]:
# Export the cleaned commits (with header, without the pandas index),
# overwriting any previous export.
commits_csv = OUTPUT_PATH / 'commits.csv'
commits.to_csv(commits_csv, mode='w', header=True, index=False)

### Issues

In [22]:
# Read the whole issues table into a pandas frame and show a random sample.
issues_parquet_path = DATA_PATH / 'issues.parquet'
issues = pq.ParquetFile(issues_parquet_path).read().to_pandas()
issues.sample(n=10)

Unnamed: 0,source,product,issue_id,issue_key,created,updated,last_resolved,summary,description,version,...,resolution,component,votes,product_name,reporter_key,reporter_tz,creator_key,creator_tz,assignee_key,assignee_tz
1183527,mozilla,Core,318770,,2005-12-02 19:30:34+00:00,2019-03-13 13:42:05+00:00,2005-12-10 01:19:23+00:00,DOM constructor's toString method should retur...,,Trunk,...,FIXED,DOM: Core & HTML,0,,,,11887050fa5bbe630af637e6a7b6c406933476fd,,11887050fa5bbe630af637e6a7b6c406933476fd,
459992,apache,JCLOUDS,12819673,JCLOUDS-884,2015-04-09 13:46:26+00:00,2015-04-09 14:25:24+00:00,NaT,Do not try to load agentproxies other than net...,"In JCLOUDS-516, we added support for agentprox...",,...,Fixed,jclouds-drivers,0,jclouds,a3887a58d44a7c4a754170807849ee1550f27a64,America/New_York,a3887a58d44a7c4a754170807849ee1550f27a64,America/New_York,a3887a58d44a7c4a754170807849ee1550f27a64,America/New_York
106194,apache,BLUR,12781679,BLUR-413,2015-03-13 01:11:08+00:00,2015-03-16 13:37:45+00:00,NaT,Add a entry count metric to the thrift cache,,,...,Fixed,Blur,0,Apache Blur,b06c036de1b92ee462fda3db5a4e079dcb5c7291,America/New_York,b06c036de1b92ee462fda3db5a4e079dcb5c7291,America/New_York,,
1856840,mozilla,MailNews Core,1542666,,2019-04-08 03:39:53+00:00,2019-09-01 01:09:43+00:00,2019-04-08 23:58:10+00:00,Turn on ESLint in editor,,unspecified,...,FIXED,Composition,0,,,,71d282cc30352b081bde5bb62a1554cfea88a464,,71d282cc30352b081bde5bb62a1554cfea88a464,
743355,apache,SPARK,13211767,SPARK-26727,2019-01-25 09:19:44+00:00,2019-03-05 21:56:47+00:00,NaT,CREATE OR REPLACE VIEW query fails with TableA...,"We experienced that sometimes the Hive query ""...",,...,Not A Bug,SQL,1,Spark,2779b111b04de270a01161d796b3781083ec1070,Asia/Kolkata,2779b111b04de270a01161d796b3781083ec1070,Asia/Kolkata,,
613001,apache,OFBIZ,13236012,OFBIZ-11079,2019-05-28 14:05:10+00:00,2019-05-28 14:05:10+00:00,NaT,BIRT demo data has a dependency on the entity ...,Currently the demo data in birt component is d...,,...,,birt,0,OFBiz,b2fb68926163fe9333b71f2e6ce9334a110b1a1a,Europe/Amsterdam,b2fb68926163fe9333b71f2e6ce9334a110b1a1a,Europe/Amsterdam,,
1009608,mozilla,Core,5999,,1999-05-06 02:01:04+00:00,2000-03-08 22:35:57+00:00,1999-07-14 17:41:20+00:00,{float} Image within table is truncated,,Trunk,...,FIXED,Layout,0,,,,69f9a545cec959fe6a5c459b7a92f52e8aa9e37f,,3f5ea33599688c0ff64e2ceaf90b9ec26771ad1a,
1138253,mozilla,Core,1061349,,2014-09-01 23:24:37+00:00,2014-09-13 17:48:17+00:00,2014-09-13 17:48:17+00:00,regression: poor mathml performance,,31 Branch,...,DUPLICATE,MathML,0,,,,dcbe86e54c57ab781e75e27734ec808125640f19,,1cb4edfe0419379c4621f0bafe59cab9f69832a2,
1818057,mozilla,Infrastructure & Operations,1103443,,2014-11-23 01:26:44+00:00,2016-12-10 02:35:19+00:00,2014-11-23 01:56:39+00:00,Swap on ldap2.db.phx1.mozilla.com is WARNING: ...,,other,...,FIXED,MOC: Problems,0,,,,7e6c2f8df9d101395ce5c16b3521002cc4842ed9,,1cb4edfe0419379c4621f0bafe59cab9f69832a2,
920674,mozilla,Bugzilla,400308,,2007-10-18 19:05:41+00:00,2007-10-18 20:10:23+00:00,2007-10-18 20:10:23+00:00,Same results are displayed when i use search f...,,unspecified,...,INVALID,bugzilla.org,0,,,,44cfb0d1121a2d9ed9814a46fa50b773319587c5,,,


In [24]:
# Number of issue rows before cleaning.
issues.shape[0]

2314127

In [29]:
# Issues without a summary are dropped.
text_column = 'summary'
non_nullable_columns = [text_column]
issues = issues.dropna(subset=non_nullable_columns)

# Row count after dropping incomplete rows.
issues.shape[0]

2314126

In [30]:
# Run the shared text preprocessing over every issue summary.
cleaned_summaries = issues[text_column].apply(preprocess_text)
issues[text_column] = cleaned_summaries

In [31]:
# Spot-check five random rows after preprocessing.
issues.sample(n=5)

Unnamed: 0,source,product,issue_id,issue_key,created,updated,last_resolved,summary,description,version,...,resolution,component,votes,product_name,reporter_key,reporter_tz,creator_key,creator_tz,assignee_key,assignee_tz
140758,apache,CASSANDRA,12605619,CASSANDRA-4595,2012-08-30 23:15:45+00:00,2019-04-16 09:32:28+00:00,NaT,Nodetool commands like scrub uses hard coded d...,If your data directory is not /var/lib/cassand...,,...,Invalid,,0,Cassandra,5ebd3be4c6b5320692686b5fdd7571654e99a23b,America/Los_Angeles,5ebd3be4c6b5320692686b5fdd7571654e99a23b,America/Los_Angeles,,
1249890,mozilla,Core,183321,,2002-12-03 22:47:47+00:00,2017-08-10 22:04:39+00:00,2017-08-10 22:04:39+00:00,xpconnect shoult not REQUIRE caps,,Trunk,...,INCOMPLETE,XPConnect,0,,,,608182ffba6af54ee4d69b58f8fa623fbea092c2,,fcafa1b314d37e477457a7698ad27aed097c50e3,
256639,apache,FLEX,12571849,FLEX-10983,2007-06-07 16:39:02+00:00,2011-04-29 10:33:30+00:00,NaT,Overriding methods of a [Managed] class in a s...,When overriding methods of a [Managed] class i...,,...,Fixed,SWC Generation (compc),0,Apache Flex,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC
2063730,mozilla,SeaMonkey,88812,,2001-07-02 15:52:59+00:00,2010-08-01 08:02:57+00:00,2001-07-02 16:07:56+00:00,View-source insist on downloading multipart,,Trunk,...,DUPLICATE,General,0,,,,f77cb1e6c6e911a06bb4f917d6610d8f2b3ca2a9,,46b1fdbdab14c4da2b945f737166724b76a179fa,
1976968,mozilla,Release Engineering,1272786,,2016-05-13 21:10:23+00:00,2016-05-17 20:56:31+00:00,2016-05-17 20:56:31+00:00,Android emulator scripts hard-code /home/cltbl...,,unspecified,...,FIXED,Mozharness,0,,,,9a57f129ffb5ecc6cd6dcf9a1cbe065937e82323,,9a57f129ffb5ecc6cd6dcf9a1cbe065937e82323,


In [32]:
# Export the cleaned issues (with header, without the pandas index),
# overwriting any previous export.
issues_csv = OUTPUT_PATH / 'issues.csv'
issues.to_csv(issues_csv, mode='w', header=True, index=False)

### Issue Comments

In [3]:
# Read the comment metadata table (authors, timestamps; it has no text
# column) and show a random sample.
comments_parquet_path = DATA_PATH / 'comments.parquet'
issue_comments_with_author = pq.ParquetFile(comments_parquet_path).read().to_pandas()
issue_comments_with_author.sample(n=10)

Unnamed: 0,source,product,issue_id,comment_id,count,author_key,author_tz,update_author_key,update_author_tz,created,updated
8185704,mozilla,Core,946469.0,11378696.0,4.0,2e734e8728e4dab84573cb01ea9c9b18e2c55741,,,,2016-05-03 19:05:07+00:00,2016-05-03 19:05:07+00:00
14166026,mozilla,NSS,320497.0,2725714.0,2.0,cec9df59a3d6e0d71742a490d8a03c5b9e52b4e6,,,,2005-12-16 19:21:36+00:00,2005-12-16 19:21:36+00:00
348590,apache,BOOKKEEPER,12540538.0,13226074.0,,5cd91f034abecfb1afdc97871406642f862bcb4a,Europe/Berlin,5cd91f034abecfb1afdc97871406642f862bcb4a,Europe/Berlin,2012-03-09 13:56:10+00:00,2012-03-09 13:56:10+00:00
14692322,mozilla,SeaMonkey,180153.0,1642256.0,6.0,c24e8f0951f8ce64d5002ef13a4eb25e3217ca5b,,,,2002-11-19 17:44:02+00:00,2002-11-19 17:44:02+00:00
3925619,apache,WW,12632063.0,13577387.0,,6a71c343b31339e0c0662f2b62933be182f201cc,Europe/Warsaw,6a71c343b31339e0c0662f2b62933be182f201cc,Europe/Warsaw,2013-02-13 07:18:45+00:00,2013-02-13 07:18:45+00:00
12201298,mozilla,Firefox,1353980.0,12218519.0,22.0,049924122bc41745cbdfb67fc21b7e914c17361d,,,,2017-04-08 18:46:53+00:00,2017-04-08 18:46:53+00:00
971769,apache,FLEX,12565244.0,13309299.0,,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC,2012-01-25 19:06:02+00:00,2012-01-25 19:06:02+00:00
16048180,mozilla,Thunderbird,533916.0,4443311.0,1.0,13526856c3c2ad61f966ac102d86710a8fbc4bac,,,,2009-12-10 11:39:21+00:00,2009-12-10 11:39:21+00:00
13194455,mozilla,Infrastructure & Operations,857771.0,7273905.0,0.0,ea7058daeb53051f831156e5e7b1bfb0b3da58cf,,,,2013-04-03 21:14:02+00:00,2013-04-03 21:14:02+00:00
16457770,mozilla,Toolkit,797257.0,6692092.0,1.0,5cd7a5f92984f450acbd2a8ee162adbed7adc582,,,,2012-10-03 04:28:01+00:00,2012-10-03 04:28:01+00:00


In [4]:
# Number of comment metadata rows.
issue_comments_with_author.shape[0]

17303269

In [14]:
from glob import glob

# One glob pattern per tracker; each matches the per-product
# `*_nlcomments.parquet` files one directory below the tracker folder.
nlcomment_patterns = [
    f'{DATA_PATH}/nlp/bugzilla/mozilla/*/*_nlcomments.parquet',
    f'{DATA_PATH}/nlp/jira/apache/*/*_nlcomments.parquet',
]
nlcomment_files = sorted(
    path for pattern in nlcomment_patterns for path in glob(pattern)
)

# Number of comment-text files found.
len(nlcomment_files)

954

In [15]:
text_column = 'text'
non_nullable_columns = [text_column, 'issue_id', 'comment_id']

# Collect the per-file frames and concatenate them once after the loop.
# Calling pd.concat inside the loop re-copies every previously loaded row for
# each new file, which is quadratic in the total number of rows.
comment_frames = []
comment_count = 0

for file in nlcomment_files:
    contents = pq.ParquetFile(file).read().to_pandas()
    # Skip files that lack any of the required columns.
    if any(col not in contents.columns for col in non_nullable_columns):
        continue
    contents = contents.dropna(subset=non_nullable_columns)
    # Merge all text rows of one (issue_id, comment_id) pair into a single
    # space-joined string.
    # NOTE(review): the trailing `.reset_index()` adds an `index` column that
    # ends up in the exported CSV; it is kept so the output schema does not
    # change.
    contents = (
        contents
        .groupby(['issue_id', 'comment_id'], as_index=False)[text_column]
        .apply(' '.join)
        .reset_index()
    )
    contents[text_column] = contents[text_column].apply(preprocess_text)

    comment_frames.append(contents)
    comment_count += len(contents)

    print(f"current count: {comment_count}, last processed: {file}")

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file qualified.
if comment_frames:
    issue_comments_with_text = pd.concat(comment_frames, ignore_index=True)
else:
    issue_comments_with_text = pd.DataFrame()

issue_comments_with_text.to_csv(OUTPUT_PATH / 'issue_comments_with_text.csv', mode='w', index=False, header=True)

current count: 1657, last processed: data/20-MAD/nlp/bugzilla/mozilla/AUS Graveyard/0_nlcomments.parquet
current count: 2532, last processed: data/20-MAD/nlp/bugzilla/mozilla/Add-on SDK Graveyard/0_nlcomments.parquet
current count: 44168, last processed: data/20-MAD/nlp/bugzilla/mozilla/Add-on SDK/0_nlcomments.parquet
current count: 44296, last processed: data/20-MAD/nlp/bugzilla/mozilla/Air Mozilla Graveyard/0_nlcomments.parquet
current count: 46802, last processed: data/20-MAD/nlp/bugzilla/mozilla/Air Mozilla/0_nlcomments.parquet
current count: 46804, last processed: data/20-MAD/nlp/bugzilla/mozilla/Android Background Services Graveyard/0_nlcomments.parquet
current count: 48510, last processed: data/20-MAD/nlp/bugzilla/mozilla/Android Background Services/0_nlcomments.parquet
current count: 49290, last processed: data/20-MAD/nlp/bugzilla/mozilla/B2GDroid Graveyard/0_nlcomments.parquet
current count: 49415, last processed: data/20-MAD/nlp/bugzilla/mozilla/Boot2Gecko Graveyard/0_nlcomme

In [17]:
# Spot-check ten random merged comment texts.
issue_comments_with_text.sample(n=10)

Unnamed: 0,index,issue_id,comment_id,text
11459100,36088,1096105,9583804,Additional link on the events page. What do yo...
12711512,72331,12650393,14549686,I'm thinking the javac warnings (deprecation) ...
4486419,43515,252263,2209792,"Adding testcase to URL field. On Linux, this m..."
9715957,112512,1214783,10841473,Review of attachment 8674187: Assuming you tes...
7427175,33975,636780,5316583,"If the xulrunner extension is the cause, it ca..."
823070,38379,496013,4196187,"If the DLL name is in the local code page, you..."
13896189,73423,13027025,15768029,"Thanks [~ekeller], this looks great! Can we ma..."
8876667,32528,142202,1323840,Mozilla crashes when you try to set the wallpa...
4511892,68988,298387,2532816,Should be able to set offline mode independent...
5536292,83123,1020940,8888661,"* Description: Factory reset,open contact app,..."


In [18]:
# Attach the merged comment text to the comment metadata. The inner join
# keeps only comments present in both tables.
issue_comments = issue_comments_with_author.merge(
    issue_comments_with_text,
    how='inner',
    on=['issue_id', 'comment_id'],
)

issue_comments_csv = OUTPUT_PATH / 'issue_comments.csv'
issue_comments.to_csv(issue_comments_csv, mode='w', header=True, index=False)

issue_comments.sample(n=10)

Unnamed: 0,source,product,issue_id,comment_id,count,author_key,author_tz,update_author_key,update_author_tz,created,updated,index,text
4524812,mozilla,Cloud Services,649495.0,5824302.0,12.0,96d182c77eddae513e371e6f0cce8aff0a314a67,,,,2011-11-03 18:59:58+00:00,2011-11-03 18:59:58+00:00,7703,"Services, what is the status on this bug?"
2859005,apache,OPENEJB,12432392.0,12740632.0,,702ae3b38aab726d9ac965e4ead68a83b7e4e327,America/Los_Angeles,702ae3b38aab726d9ac965e4ead68a83b7e4e327,America/Los_Angeles,2009-08-07 17:00:08+00:00,2009-08-07 17:00:08+00:00,1376,Altered the patch slightly. Put the BufferedIn...
9605619,mozilla,Firefox for iOS,1201875.0,10697094.0,2.0,ca928ddc189e9ad6b80a9a80e56d12041c398e76,,,,2015-09-04 14:54:56+00:00,2015-09-04 14:54:56+00:00,7483,"Desktop: one window has 16 tabs, the other has..."
7760149,mozilla,Core,311615.0,2650679.0,3.0,0c93d6f385011236b08abbfce95a6c1f9bbbf5f0,,,,2005-10-08 14:30:55+00:00,2005-10-08 14:30:55+00:00,4782,When it changes to an inline we'll reconstruct...
5928070,mozilla,Core,772823.0,8377081.0,462.0,15cdb695f141339a4fbcd2eb152c7b89ac6ba9a8,,,,2014-02-05 17:04:37+00:00,2014-02-05 17:04:37+00:00,59509,philor Ubuntu VM 12.04 mozilla-inbound opt tes...
11649161,mozilla,Hello (Loop),1047181.0,9330477.0,16.0,a9988d8db2d2c03ee6862859fd0a51b48ccb2562,,,,2014-09-18 19:36:28+00:00,2014-09-18 19:36:28+00:00,6898,"No need to keep the ""authenticated"" state valu..."
8699902,mozilla,Core,212302.0,2064697.0,8.0,0c93d6f385011236b08abbfce95a6c1f9bbbf5f0,,,,2004-02-04 07:49:55+00:00,2004-02-04 07:49:55+00:00,19481,Sounds like the next line is painting over the...
9435975,mozilla,Firefox for Android,1263110.0,11361026.0,29.0,b4afd141e115be02fe2a51472d06a8d989cfef0a,,,,2016-04-27 13:54:33+00:00,2016-04-27 13:54:33+00:00,92908,Had to re-add the NoMozillaDirectoryException ...
11964660,mozilla,MailNews Core,127631.0,1188131.0,0.0,0e5f27ecf7eca687dde3266866956d196af4e1eb,,,,2002-02-25 08:51:47+00:00,2002-02-25 08:51:47+00:00,27537,version: build 20020204 for Linux on FreeBSD/x...
12521532,mozilla,mozilla.org Graveyard,1011494.0,8802173.0,0.0,3e049bacd26585b544be7ab7edb76e2a6752b4ae,,,,2014-05-16 13:36:07+00:00,2014-05-16 13:36:07+00:00,10427,Automated alert report from nagios1.private.ph...


In [19]:
# Number of comments that have both metadata and text.
issue_comments.shape[0]

15439971

Unnamed: 0,source,product,issue_id,comment_id,count,author_key,author_tz,update_author_key,update_author_tz,created,updated,index,text
