In [1]:
from utils import preprocess_text, labelnum

import os
from glob import glob
from pathlib import Path
from datetime import timedelta, datetime, timezone

import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

import pandas as pd
import pyarrow.parquet as pq

In [2]:
# Directory the input tables (commits.parquet, issues.parquet, issue_comments.csv) are read from.
DATA_PATH = Path('./data/processed/20-MAD/')
# Directory the sentiment-annotated CSV outputs are written to.
OUTPUT_PATH = Path('./data/processed/20-MAD/nlp')

# Directory passed to from_pretrained() for both the classifier and its tokenizer.
MODEL_PATH = Path('./models/sentiment/')

In [3]:
# Load the tokenizer and the sequence-classification model stored under MODEL_PATH.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Callable used by every processing loop in this notebook: takes a list of
# strings and returns one {'label': ..., 'score': ...} dict per input.
get_sentiment = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=128,
    truncation=True,   # inputs longer than max_length are cut, not rejected
    max_length=512,
)

# Translate the model's raw class ids into readable sentiment names.
label_map = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
}

# Commits

In [10]:
# Input parquet with the commits, and the CSV the sentiment loop below writes/appends to.
commits_path = DATA_PATH / "commits.parquet"
commits_out_path = OUTPUT_PATH / "commits.csv"

In [17]:
# Open the commits parquet as a ParquetFile handle so rows can be streamed in batches (iter_batches) further down.
data = pq.ParquetFile(commits_path)

In [18]:
# Show file-level parquet metadata (number of rows, columns and row groups).
data.metadata

<pyarrow._parquet.FileMetaData object at 0x7e5c2c772250>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 16
  num_rows: 3439001
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 3729

In [19]:
# List the column names available in the commits parquet.
data.schema.names

['source',
 'repo',
 'hash',
 'parents',
 'author',
 'author_time',
 'author_tz',
 'committer',
 'commit_time',
 'commit_tz',
 'message',
 'added',
 'removed',
 'from_svn',
 'accurate_tz',
 'issue_id']

In [20]:
# Column whose text is fed to the sentiment pipeline.
text_column = 'message'
# NOTE(review): not referenced by the processing loop in this notebook, which only
# drops rows with a null text_column -- confirm whether null committer/repo rows
# were meant to be dropped as well.
non_nullable_columns = [text_column, 'committer', 'repo']
# Number of rows requested per parquet batch.
chunk_size = 4096

In [21]:
# Optional reset: uncomment to delete the existing output before a fresh run.
# Left disabled, the loop below appends to whatever commits.csv already contains.
# if commits_out_path.exists():
#     commits_out_path.unlink()

In [22]:
def _tz_offset_to_timedelta(tz_values: pd.Series) -> pd.Series:
    """Convert numeric UTC offsets into timedeltas.

    NOTE(review): assumes the offsets are signed HHMM integers (530 -> +05:30,
    -800 -> -08:00) -- confirm against the dataset. The previous conversion
    (``float(x) / 100`` hours) is identical for whole-hour offsets but treated
    the minutes as a decimal fraction, turning 530 into 5.3 h (05:18).
    Missing offsets yield NaT instead of raising.
    """
    offsets = pd.to_numeric(tz_values)
    magnitude = offsets.abs()
    minutes = (magnitude // 100) * 60 + magnitude % 100
    minutes = minutes.where(offsets >= 0, -minutes)  # restore the sign
    return pd.to_timedelta(minutes, unit='m')


def _part_of_day(local_times: pd.Series) -> pd.Series:
    """Bucket local timestamps by hour of day.

    Night = [0, 6) and [23, 24), Morning = [6, 12), Afternoon = [12, 18),
    Evening = [18, 23). The repeated "Night" label requires ordered=False.
    """
    return pd.cut(local_times.dt.hour, bins=[0, 6, 12, 18, 23, 24],
                  labels=["Night", "Morning", "Afternoon", "Evening", "Night"],
                  right=False, ordered=False)


# Stream the commits parquet in batches, label each message with a sentiment,
# derive local timestamps / part of day, and append the batch to the output CSV.
# Make sure the output directory exists before the first (expensive) batch is written.
commits_out_path.parent.mkdir(parents=True, exist_ok=True)

count = 0
for batch in data.iter_batches(batch_size=chunk_size):
    # Explicit copy so the column assignments below never hit a view of the batch.
    chunk = batch.to_pandas().dropna(subset=[text_column]).copy()
    if chunk.empty:
        # Every row in this batch had a null message; nothing to classify or write.
        continue
    chunk[text_column] = chunk[text_column].apply(preprocess_text)

    sentiments = get_sentiment(chunk[text_column].tolist())
    chunk[f'{text_column}_sentiment'] = [label_map[sentiment['label']] for sentiment in sentiments]

    commit_times = pd.to_datetime(chunk["commit_time"], utc=True)
    author_times = pd.to_datetime(chunk["author_time"], utc=True)

    # Shift the UTC timestamps by each row's recorded offset to get wall-clock time.
    chunk["local_commit_time"] = commit_times + _tz_offset_to_timedelta(chunk["commit_tz"])
    chunk["local_author_time"] = author_times + _tz_offset_to_timedelta(chunk["author_tz"])

    chunk["part_of_day_commit"] = _part_of_day(chunk["local_commit_time"])
    chunk["part_of_day_author"] = _part_of_day(chunk["local_author_time"])

    # Only the very first write emits the header; later batches are appended.
    write_header = not commits_out_path.exists()
    chunk.to_csv(commits_out_path, mode='a', index=False, header=write_header)

    count += chunk.shape[0]
    print(f"processed {count} commits")

processed 4096 commits
processed 8192 commits


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


processed 12288 commits
processed 16384 commits
processed 20480 commits
processed 24576 commits
processed 28672 commits
processed 32764 commits
processed 36860 commits
processed 40956 commits
processed 45052 commits
processed 49148 commits
processed 53244 commits
processed 57340 commits
processed 61436 commits
processed 65532 commits
processed 69628 commits
processed 73724 commits
processed 77820 commits
processed 81916 commits
processed 86012 commits
processed 90108 commits
processed 94204 commits
processed 98300 commits
processed 102396 commits
processed 106492 commits
processed 110588 commits
processed 114684 commits
processed 118780 commits
processed 122876 commits
processed 126972 commits
processed 131068 commits
processed 135164 commits
processed 139260 commits
processed 143356 commits
processed 147452 commits
processed 151548 commits
processed 155644 commits
processed 159740 commits
processed 163836 commits
processed 167932 commits
processed 172028 commits
processed 176124 commi

In [11]:
# Load the annotated commits CSV produced by the loop above for inspection.
commits_out = pd.read_csv(commits_out_path)

In [12]:
# Distribution of predicted sentiment labels across all commit messages.
commits_out['message_sentiment'].value_counts()

message_sentiment
neutral     3147283
negative     211839
positive      79578
Name: count, dtype: int64

In [13]:
# Spot-check ten random commit messages that were labelled negative.
(
    commits_out
    .loc[commits_out['message_sentiment'] == 'negative', ['message', 'message_sentiment']]
    .sample(10)
)

Unnamed: 0,message,message_sentiment
659871,CB-10228:(iOS) AppendUserAgent not working wit...,negative
2757416,Backed out changeset 7c0438df6767,negative
96392,AMBARI-22523. Service config changes on hdp + ...,negative
1631236,LOG4J2-1103 - FailoverAppender was failing wit...,negative
2009384,PHOENIX-4872: BulkLoad has bug when loading on...,negative
1192312,[EPMCDLAB-1072]: fixed issue with configuring ...,negative
814532,Fix UT problems after structure change,negative
1178761,fix wrong name in bvar_c++.md,negative
2773460,Backed out changeset 759b21bbb64b (bug 610223)...,negative
813799,Fix configurator bug,negative


In [14]:
# Spot-check ten random commit messages that were labelled positive.
(
    commits_out
    .loc[commits_out['message_sentiment'] == 'positive', ['message', 'message_sentiment']]
    .sample(10)
)

Unnamed: 0,message,message_sentiment
329417,Merge pull request #24 from cloudsoft/openshif...,positive
2309590,"i promise - the last name change, but steve co...",positive
1898494,NIFI-4429 Added GetMongoAggregation to support...,positive
2201598,SLING-3501 - allow for combining Health Checks...,positive
1789926,Added a test for /roles quota.consumed field. ...,positive
585457,Moving back :) git-svn-id: https://svn.apache....,positive
2534367,New set of passing tests after patch.,positive
2955134,Patch by Ian Wells <I.Wells@tarragon-et.co.uk>...,positive
1469000,JAMES-2393 Introduce dedicated modules for eve...,positive
3063786,"fixed VerifyReflow, an important debugging too...",positive


In [15]:
# Spot-check ten random commit messages that were labelled neutral.
(
    commits_out
    .loc[commits_out['message_sentiment'] == 'neutral', ['message', 'message_sentiment']]
    .sample(10)
)

Unnamed: 0,message,message_sentiment
971152,HDFS-3873. Hftp assumes security is disabled i...,neutral
2204442,SLING-555 : Update all poms to use the latest ...,neutral
3040810,Bug 1284947 - Update tab's permanentKey in _sw...,neutral
1858585,No jira ticket: allow for synthesized 32.768 c...,neutral
198393,Creating a 0.2 release branch of aries trunk g...,neutral
390196,CAMEL-13445 - Camel-Pulsar: Skip tests if no d...,neutral
1762675,Submitted by: Lucio Benfante Add italian trans...,neutral
3179775,Bug 1473671 - Don't store persistent block per...,neutral
2633970,fix copying imap/news msgs while offline witho...,neutral
525591,bug 12893: cloudstack 3.0 new UI - Add Primary...,neutral


# Issues

In [5]:
# Input parquet with the issues, and the CSV the sentiment loop below writes/appends to.
issues_path = DATA_PATH / "issues.parquet"
issues_out_path = OUTPUT_PATH / "issues.csv"

In [24]:
# Rebind `data` to the issues parquet (the name previously held the commits file)
# and display its file-level metadata.
data = pq.ParquetFile(issues_path)
data.metadata

<pyarrow._parquet.FileMetaData object at 0x6ffbe2a58630>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 25
  num_rows: 2314127
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 4304

In [25]:
# List the column names available in the issues parquet.
data.schema.names

['source',
 'product',
 'issue_id',
 'issue_key',
 'created',
 'updated',
 'last_resolved',
 'summary',
 'description',
 'version',
 'milestone',
 'status',
 'severity',
 'priority',
 'issuetype',
 'resolution',
 'component',
 'votes',
 'product_name',
 'reporter_key',
 'reporter_tz',
 'creator_key',
 'creator_tz',
 'assignee_key',
 'assignee_tz']

In [26]:
# Column whose text is fed to the sentiment pipeline for issues.
text_column = 'summary'
# NOTE(review): not referenced by the issues loop below, which drops nulls on text_column directly.
non_nullable_columns = [text_column]
# Number of rows requested per parquet batch.
chunk_size = 4096

In [27]:
# Start from a clean slate: drop any previous issues output so the loop below
# writes a fresh file (with header) instead of appending to an old one.
try:
    issues_out_path.unlink()
except FileNotFoundError:
    # Nothing to remove on a first run.
    pass
else:
    print(f"removed {issues_out_path}")

In [28]:
# Stream the issues parquet in batches, label each summary with a sentiment and
# append the batch to the output CSV.
# Make sure the output directory exists before the first (expensive) batch is written.
issues_out_path.parent.mkdir(parents=True, exist_ok=True)

count = 0
for batch in data.iter_batches(batch_size=chunk_size):
    # Explicit copy so the column assignments below never hit a view of the batch.
    chunk = batch.to_pandas().dropna(subset=[text_column]).copy()
    if chunk.empty:
        # Every row in this batch had a null summary; nothing to classify or write.
        continue
    chunk[text_column] = chunk[text_column].apply(preprocess_text)

    sentiments = get_sentiment(chunk[text_column].tolist())
    chunk[f'{text_column}_sentiment'] = [label_map[sentiment['label']] for sentiment in sentiments]

    # Only the very first write emits the header; later batches are appended.
    write_header = not issues_out_path.exists()
    chunk.to_csv(issues_out_path, mode='a', index=False, header=write_header)

    count += chunk.shape[0]
    print(f"{datetime.now().time()}: processed {count} issues")

06:17:26.540803: processed 4096 issues
06:17:39.387911: processed 8192 issues
06:17:52.715213: processed 12288 issues
06:18:05.961012: processed 16384 issues
06:18:18.978455: processed 20480 issues
06:18:32.638142: processed 24576 issues
06:18:47.873200: processed 28672 issues
06:19:05.838196: processed 32768 issues
06:19:22.807455: processed 36864 issues
06:19:37.669808: processed 40960 issues
06:19:54.123899: processed 45056 issues
06:20:11.184934: processed 49152 issues
06:20:23.496473: processed 53248 issues
06:20:36.623456: processed 57344 issues
06:20:49.586444: processed 61440 issues
06:21:04.193150: processed 65536 issues
06:21:18.092787: processed 69632 issues
06:21:32.532663: processed 73728 issues
06:21:45.238583: processed 77824 issues
06:22:00.654392: processed 81920 issues
06:22:18.480561: processed 86016 issues
06:22:34.823727: processed 90112 issues
06:22:48.333738: processed 94208 issues
06:23:02.759447: processed 98304 issues
06:23:17.617201: processed 102400 issues
0

In [7]:
# Load the annotated issues CSV produced by the loop above for inspection.
# NOTE(review): the saved output echoes this source line, which is how Python
# warnings are rendered -- presumably a pandas DtypeWarning about mixed-type
# columns; if so, consider passing dtype=... or low_memory=False. Confirm on re-run.
issues_out = pd.read_csv(issues_out_path)

  issues_out = pd.read_csv(issues_out_path)


In [8]:
# Distribution of predicted sentiment labels across all issue summaries.
issues_out['summary_sentiment'].value_counts()

summary_sentiment
neutral     1487112
negative     812020
positive      14994
Name: count, dtype: int64

In [9]:
# Spot-check ten random issue summaries that were labelled negative.
(
    issues_out
    .loc[issues_out['summary_sentiment'] == 'negative', ['summary', 'summary_sentiment']]
    .sample(10)
)

Unnamed: 0,summary,summary_sentiment
814707,Servlet exception with myfaces-1.2.0 and trini...,negative
1582989,[Flame] [v2.2] keyboard input into the browser...,negative
301236,CI Failure: ConnectCommandWithSSLTest.connectW...,negative
815266,required inputFile (re-submitted) causes valid...,negative
305043,DistributedAckOverflowRegionCCEOffHeapDUnitTes...,negative
21562,Ambari-agent start fails if there is no dns re...,negative
1365266,Transparent GIFs sometimes turn black (bad ATI...,negative
1067744,[Skia] Drawing an RGB565 bitmap to another RGB...,negative
1655787,[e10s] “Integrated Inbox for Gmail & Google Ap...,negative
365117,RegionServer Stop by ArrayIndexOutOfBoundsExce...,negative


# Issue Comments

In [4]:
# Input CSV with the issue comments, and the CSV the sentiment loop below writes/appends to.
issue_comments_path = DATA_PATH / "issue_comments.csv"
issue_comments_out_path = OUTPUT_PATH / "issue_comments.csv"

# Column whose text is fed to the sentiment pipeline for issue comments.
text_column = 'text'

In [5]:
# Optional reset: uncomment to delete the existing output before a fresh run.
# Keep it disabled when resuming with start_chunk > 0, since the loop below
# appends to the existing file.
# if issue_comments_out_path.exists():
#     issue_comments_out_path.unlink()
#     print(f"removed {issue_comments_out_path}")

In [6]:
# Resumable pass over the issue comments CSV: label each comment with a
# sentiment and append the chunk to the output CSV.
chunk_size = 8192
# Index of the first chunk to process; set to the "next chunk is N" value from
# the last log line of an interrupted run (0 for a fresh run).
start_chunk = 303

# Number of data rows already handled by earlier runs.
rows_done = start_chunk * chunk_size

current_chunk = start_chunk
count = rows_done

# Make sure the output directory exists before the first (expensive) chunk is written.
issue_comments_out_path.parent.mkdir(parents=True, exist_ok=True)

# Line 0 of the file is the header, so data rows occupy lines 1..rows_done.
# The range end must be rows_done + 1 to skip all of them; stopping at
# rows_done (as before) re-read the last row of the previous chunk and shifted
# every following chunk by one row.
with pd.read_csv(issue_comments_path, chunksize=chunk_size,
                 skiprows=range(1, rows_done + 1)) as reader:
    for raw_chunk in reader:
        # Explicit copy so the column assignments below never hit a view of the reader's frame.
        chunk = raw_chunk.dropna(subset=[text_column]).copy()
        if not chunk.empty:
            chunk[text_column] = chunk[text_column].apply(preprocess_text)
            sentiments = get_sentiment(chunk[text_column].tolist())
            chunk[f'{text_column}_sentiment'] = [label_map[sentiment['label']] for sentiment in sentiments]

            # Only the very first write emits the header; later chunks are appended.
            write_header = not issue_comments_out_path.exists()
            chunk.to_csv(issue_comments_out_path, mode='a', index=False, header=write_header)

        # Advance the resume counter even for an all-null chunk so start_chunk stays aligned with the input.
        count += chunk.shape[0]
        current_chunk += 1
        print(f"{datetime.now().isoformat(sep=' ', timespec='seconds')}: processed {count} issue comments, next chunk is {current_chunk}")

2024-05-31 02:33:42: processed 8192 issue comments, next chunk is 1
2024-05-31 02:39:28: processed 16384 issue comments, next chunk is 2
2024-05-31 02:46:39: processed 24573 issue comments, next chunk is 3
2024-05-31 02:52:00: processed 32765 issue comments, next chunk is 4
2024-05-31 02:58:36: processed 40954 issue comments, next chunk is 5
2024-05-31 03:07:09: processed 49144 issue comments, next chunk is 6
2024-05-31 03:13:41: processed 57334 issue comments, next chunk is 7
2024-05-31 03:17:59: processed 65526 issue comments, next chunk is 8
2024-05-31 03:22:04: processed 73718 issue comments, next chunk is 9


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


2024-05-31 03:26:28: processed 81910 issue comments, next chunk is 10
2024-05-31 03:31:08: processed 90102 issue comments, next chunk is 11
2024-05-31 03:36:44: processed 98294 issue comments, next chunk is 12
2024-05-31 03:41:29: processed 106486 issue comments, next chunk is 13
2024-05-31 03:46:00: processed 114678 issue comments, next chunk is 14
2024-05-31 03:51:50: processed 122868 issue comments, next chunk is 15
2024-05-31 03:58:04: processed 131060 issue comments, next chunk is 16
2024-05-31 04:03:36: processed 139252 issue comments, next chunk is 17
2024-05-31 04:09:26: processed 147442 issue comments, next chunk is 18
2024-05-31 04:15:00: processed 155634 issue comments, next chunk is 19
2024-05-31 04:21:28: processed 163825 issue comments, next chunk is 20
2024-05-31 04:27:54: processed 172017 issue comments, next chunk is 21
2024-05-31 04:35:01: processed 180209 issue comments, next chunk is 22
2024-05-31 04:40:11: processed 188397 issue comments, next chunk is 23
2024-05-3

KeyboardInterrupt: 

In [10]:
# Row count of `data`.
# NOTE(review): read top to bottom, `data` is still the issues ParquetFile at this
# point; the DataFrame is only loaded in the following cell, and the execution
# counts suggest this cell was run after that load. Confirm and reorder so the
# notebook survives Restart & Run All.
len(data)

15439971

In [6]:
# Load the annotated issue comments CSV for inspection (rebinds `data`, which
# earlier held a ParquetFile).
data = pd.read_csv(issue_comments_out_path)

In [9]:
# Spot-check ten random annotated comments (all columns).
data.sample(10)

Unnamed: 0,source,product,issue_id,comment_id,count,author_key,author_tz,update_author_key,update_author_tz,created,updated,index,text,text_sentiment
1912480,apache,HIVE,13020602.0,15685151.0,,0d646c1142b8ee7bc5e3cbcf37da41981f26b79b,America/Chicago,0d646c1142b8ee7bc5e3cbcf37da41981f26b79b,America/Chicago,2016-11-21 23:53:31+00:00,2016-11-21 23:53:31+00:00,38646,Attach a new patch that do not call {{destFs.e...,positive
2008637,apache,IGNITE,12921734.0,15058135.0,,6811159e4c1fd2fde666f659c1c5ee4d7dc7190d,Europe/Moscow,6811159e4c1fd2fde666f659c1c5ee4d7dc7190d,Europe/Moscow,2015-12-15 14:34:55+00:00,2015-12-15 14:34:55+00:00,5919,"Vladimir, We are using Visual Studio 2010 tool...",neutral
1283894,apache,HADOOP,13095494.0,16455737.0,,bfa41b178b2b81442a25fa1a6f0381c3a60ef31f,Etc/UTC,bfa41b178b2b81442a25fa1a6f0381c3a60ef31f,Etc/UTC,2018-04-27 04:16:07+00:00,2018-04-27 04:16:07+00:00,38033,| {color:red}-1{color} | {color:red} test4test...,neutral
915205,apache,FLEX,12563572.0,13304277.0,,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC,2bbf27a03aa2a0f6214887912cbade32ca38944b,Etc/UTC,2012-01-25 16:48:46+00:00,2012-01-25 16:48:46+00:00,3640,Original Reporter: jszeto Original Resolution:...,neutral
961733,apache,FLINK,12910701.0,14996842.0,,ce6e0bab7e887c4715d54193d747b59e2fab9978,Europe/Amsterdam,ce6e0bab7e887c4715d54193d747b59e2fab9978,Europe/Amsterdam,2015-11-09 16:41:02+00:00,2015-11-09 16:41:02+00:00,20808,I just started a simple test Flink application...,neutral
2005768,apache,IGNITE,12930145.0,15130811.0,,36036bb62b5701e5f15b02f8df6a48c78d9565f9,Europe/Moscow,36036bb62b5701e5f15b02f8df6a48c78d9565f9,Europe/Moscow,2016-02-03 18:22:53+00:00,2016-02-03 18:22:53+00:00,6573,"1) Since we have a way to serialize config, it...",neutral
2478781,apache,MAPREDUCE,12612987.0,13483330.0,,35e7deeb6060d289454099d6942c26d4d08c89e8,America/Chicago,35e7deeb6060d289454099d6942c26d4d08c89e8,America/Chicago,2012-10-24 15:51:33+00:00,2012-10-24 15:51:33+00:00,42971,"Thanks, Vinod. I committed this to trunk, bran...",positive
704820,apache,DERBY,12361033.0,12477837.0,,6d8b3c1274b29f5a931c4abed1ff72798bb8441c,Europe/Berlin,6d8b3c1274b29f5a931c4abed1ff72798bb8441c,Europe/Berlin,2007-03-04 18:52:36+00:00,2007-03-04 18:52:36+00:00,18719,The new error message for an unauthorized shut...,negative
1300023,apache,HADOOP,13238244.0,16861573.0,,05751893618b1c0d657a10873cc25973089aa8df,America/Los_Angeles,879acbd13677c9a4d52756fb70595213202ce769,America/Los_Angeles,2019-06-11 22:43:09+00:00,2019-10-23 16:14:57+00:00,51244,Thank you [~prabhujoseph] for the patch. I jus...,positive
1444079,apache,HBASE,12668756.0,13781363.0,,19227cdf85cdf92b72428d9b751456671a68021b,Etc/UTC,19227cdf85cdf92b72428d9b751456671a68021b,Etc/UTC,2013-09-29 13:23:36+00:00,2013-09-29 13:23:36+00:00,79252,Thanks for [~jmspaggi]'s careful and patient r...,positive
