In [1]:
from utils import preprocess_text, labelnum

import os
from glob import glob
from pathlib import Path
from datetime import timedelta, datetime, timezone

import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

import pandas as pd
import pyarrow.parquet as pq

In [2]:
MODEL_PATH = Path('./models/sentiment/')

model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
device = "cuda" if torch.cuda.is_available() else "cpu"

get_sentiment = pipeline('sentiment-analysis',
                         model=model,
                         tokenizer=tokenizer,
                         max_length=512,
                         batch_size=128,
                         truncation=True,
                         device=device
                         )

label_map = {'LABEL_0': 'negative',
             'LABEL_1': 'neutral',
             'LABEL_2': 'positive'}

# Issue comments (Only Jira)


In [3]:
issues = pq.ParquetFile('./data/20-MAD/issues.parquet').read().to_pandas()

issues.columns

Index(['source', 'product', 'issue_id', 'issue_key', 'created', 'updated',
       'last_resolved', 'summary', 'description', 'version', 'milestone',
       'status', 'severity', 'priority', 'issuetype', 'resolution',
       'component', 'votes', 'product_name', 'reporter_key', 'reporter_tz',
       'creator_key', 'creator_tz', 'assignee_key', 'assignee_tz'],
      dtype='object')

In [4]:
columns_to_drop = ['summary', 'description'] 
issues.drop(columns=columns_to_drop, inplace=True)

In [5]:
issues = issues[issues['source'] == 'apache']

len(issues)

883065

In [6]:
non_nullable_issue_columns = ["product", "issue_id", "priority", "issuetype"] 
issues = issues.dropna(subset=non_nullable_issue_columns)

len(issues)

870471

In [7]:
issues['priority'].value_counts()


priority
Major       582743
Minor       177287
Critical     39798
Blocker      28303
Trivial      26907
Normal        8359
Low           6473
Urgent         590
High            11
Name: count, dtype: int64

In [8]:
expected_issue_priorities = ['Blocker', 'Critical', 'Major', 'Minor', 'Trivial']
issues = issues[issues['priority'].isin(expected_issue_priorities)]

len(issues)

855038

In [11]:
# save modified issues
issues.to_parquet('./data/processed/20-MAD/rq/issues_for_rq_3_4.parquet', index=False)

# merge issue comments

In [13]:
issue_comment_metadata = pq.ParquetFile('./data/processed/20-MAD/rq/issues_for_rq_3_4.parquet').read().to_pandas()

In [14]:
from glob import glob

nlcomment_files = glob('./data/20-MAD//nlp/jira/apache/*/*_nlcomments.parquet')
nlcomment_files.sort()

len(nlcomment_files)

653

In [15]:
sample = pq.ParquetFile(nlcomment_files[0]).read().to_pandas()
sample.head()

Unnamed: 0,source,product,issue_id,comment_id,paragraph_id,text,nchar
0,apache,AAR,12963270,15261151,1,"Watch The Ultimate Fighter Season 23, Episode ...",61
1,apache,AAR,12963270,15261151,2,"Watch The Ultimate Fighter Season 23, Episode ...",61
2,apache,AAR,12963270,15261151,3,"Watch The Ultimate Fighter Season 23, Episode ...",61
3,apache,AAR,12963270,15261151,4,"Watch The Ultimate Fighter Season 23, Episode ...",61
4,apache,AAR,12963270,15261151,5,"Watch The Ultimate Fighter Season 23, Episode ...",61


In [16]:
schema = pq.ParquetFile(nlcomment_files[0]).schema_arrow
with pq.ParquetWriter("./data/processed/20-MAD/rq/issue_comments_without_metadata.parquet", schema=schema) as writer:
    for file in nlcomment_files:
        writer.write_table(pq.read_table(file, schema=schema))

In [17]:
issue_comments_without_metadata = pq.ParquetFile('./data/processed/20-MAD/rq/issue_comments_without_metadata.parquet').read().to_pandas()

In [19]:
issue_comments_without_metadata.columns

Index(['source', 'product', 'issue_id', 'comment_id', 'paragraph_id', 'text',
       'nchar'],
      dtype='object')

In [20]:
text_column = 'text'
non_nullable_columns = [text_column, 'product', 'issue_id', 'comment_id', 'paragraph_id']
issue_comments_without_metadata.dropna(subset=non_nullable_columns, inplace=True)

len(issue_comments_without_metadata)

16861861

In [22]:
issue_comments_without_metadata["source"].value_counts()

source
apache    16861861
Name: count, dtype: int64

In [23]:
issue_comments_without_metadata_text_merged = issue_comments_without_metadata.groupby(['product', 'issue_id', 'comment_id'], as_index=False)['text'].apply(' '.join).reset_index()


AttributeError: 'DataFrame' object has no attribute 'write_parquet'

In [24]:
issue_comments_without_metadata_text_merged.to_parquet('./data/processed/20-MAD/rq/issue_comments_without_metadata_text_merged.parquet', index=False)

In [25]:
len(issue_comments_without_metadata_text_merged)

3957479

In [27]:
# merge with issue metadata
issue_comments_merged = issue_comments_without_metadata_text_merged.merge(issue_comment_metadata, on=['product', 'issue_id'], how='inner')
issue_comments_merged.to_parquet('./data/processed/20-MAD/rq/issue_comments_merged.parquet', index=False)

In [28]:
len(issue_comments_merged)

3814162

In [34]:
issue_comments_merged["priority"].value_counts()

priority
Major       2608952
Minor        688099
Critical     239380
Blocker      185884
Trivial       91847
Name: count, dtype: int64

In [31]:
issue_comments_merged["issuetype"].value_counts()

issuetype
Bug                                      1876548
Improvement                               938443
Sub-task                                  386933
New Feature                               319914
Task                                      183403
Test                                       37420
Wish                                       25608
Question                                    7333
Documentation                               5858
Dependency upgrade                          5745
Story                                       5082
Umbrella                                    4698
Technical task                              2755
Project                                     2462
Planned Work                                2455
Epic                                        2326
Brainstorming                               1645
Github Integration                           982
SVN->GIT Migration                           817
New Git Repo                                 775
New JIRA P

In [33]:
issue_comments_merged.columns

Index(['index', 'product', 'issue_id', 'comment_id', 'text', 'source',
       'issue_key', 'created', 'updated', 'last_resolved', 'version',
       'milestone', 'status', 'severity', 'priority', 'issuetype',
       'resolution', 'component', 'votes', 'product_name', 'reporter_key',
       'reporter_tz', 'creator_key', 'creator_tz', 'assignee_key',
       'assignee_tz'],
      dtype='object')

In [None]:
# we need [product,issue_id,comment_id,priority,issue_type,text_sentiment]

In [38]:
text_column = 'text'
columns_of_interest = [text_column, 'product', 'issue_id', 'comment_id', 'source', 'priority', 'issuetype']

issue_comments_merged = issue_comments_merged[columns_of_interest]
issue_comments_merged[text_column] = issue_comments_merged[text_column].apply(preprocess_text)
issue_comments_merged.to_parquet('./data/processed/20-MAD/rq/issue_comments.parquet', index=False)

# Sentiment Classification

In [3]:
text_column = 'text'
common_columns = ['product', 'issue_id', 'comment_id', 'priority', 'issuetype']
inout_columns_of_interest = [text_column] + common_columns
output_columns_of_interest = [f'{text_column}_sentiment'] + common_columns

In [4]:
DATA_PATH = Path('./data/processed/20-MAD/rq/issue_comments.parquet')
OUTPUT_PATH = Path('./data/processed/20-MAD/rq/issue_comments_with_sentiment.csv')
PREVIOUSLY_PROCESSED_PATHS = [OUTPUT_PATH]

In [5]:
previously_processed = pd.DataFrame()
for path_ in PREVIOUSLY_PROCESSED_PATHS:
    if path_.exists():
        # read the previously processed data and append it to the previously_processed dataframe
        previous_output = pd.read_csv(path_)
        previously_processed = pd.concat([previously_processed, previous_output])
previously_processed.drop_duplicates(inplace=True)

len(previously_processed)

1376256

In [6]:
data = pq.ParquetFile(DATA_PATH).read().to_pandas()
data = data[~(data.filter(common_columns).isin(previously_processed.filter(common_columns)).all(axis=1))]

len(data)

2437906

In [7]:
# if OUTPUT_PATH.exists():
#     -------------OUTPUT_PATH.unlink()
#     print(f"removed {OUTPUT_PATH}")

In [8]:
count = 0
chunk_size = 8192

chunks = [data[i:i + chunk_size] for i in range(0, data.shape[0], chunk_size)]

In [9]:
from pandas.errors import SettingWithCopyWarning
import warnings

warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))

In [None]:
for i, chunk in enumerate(chunks):
    sentiments = get_sentiment(chunk[text_column].tolist())
    sentiment_labels = [label_map[sentiment['label']] for sentiment in sentiments]
    chunk[f'{text_column}_sentiment'] = sentiment_labels
    # chunk.drop(columns=[text_column], inplace=True)

    if OUTPUT_PATH.exists():
        chunk.to_csv(OUTPUT_PATH, mode='a', columns=output_columns_of_interest, index=False, header=False)
    else:
        chunk.to_csv(OUTPUT_PATH, mode='w', columns=output_columns_of_interest, index=False, header=True)

    count += chunk.shape[0]
    print(f"{datetime.now().isoformat(sep=' ', timespec='seconds')}: processed {count} issue comments")