In [1]:
from utils import preprocess_text, labelnum

import os
from pathlib import Path
from datetime import timedelta, datetime, timezone

import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

import pandas as pd
import pyarrow.parquet as pq

In [2]:
# Directory the sentiment model and its tokenizer are loaded from.
MODEL_PATH = Path('./models/sentiment/')

# Input directory of the 20-MAD files, and the directory the processed
# copies are written to.
DATA_PATH = Path('./data/20-MAD/')
OUTPUT_PATH = Path('./data/processed/20-MAD/')

In [3]:
# Run inference on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Classifier weights and the matching tokenizer are both read from MODEL_PATH.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)

# Texts are truncated to at most 512 tokens and scored 32 at a time.
pipeline_options = dict(max_length=512, truncation=True, batch_size=32)
get_sentiment = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
    **pipeline_options,
)

# Translates the label ids returned by the pipeline into readable names.
label_map = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
}

# Commits

In [4]:
# Input Parquet file with the commits, and the CSV the annotated rows go to.
commits_path = DATA_PATH.joinpath("commits.parquet")
commits_out_path = OUTPUT_PATH.joinpath("commits.csv")

In [17]:
# Open the commits file through pyarrow's ParquetFile so the cells below can
# inspect its metadata/schema and read it batch by batch (`iter_batches`)
# rather than loading it into one DataFrame.
data = pq.ParquetFile(commits_path)

In [18]:
# Show the file-level Parquet metadata (row, column and row-group counts).
data.metadata

<pyarrow._parquet.FileMetaData object at 0x7e5c2c772250>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 16
  num_rows: 3439001
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 3729

In [19]:
# List the column names available in the commits file.
data.schema.names

['source',
 'repo',
 'hash',
 'parents',
 'author',
 'author_time',
 'author_tz',
 'committer',
 'commit_time',
 'commit_tz',
 'message',
 'added',
 'removed',
 'from_svn',
 'accurate_tz',
 'issue_id']

In [20]:
# Number of rows requested per batch when iterating over the Parquet file.
chunk_size = 4096

# Column whose text is preprocessed and scored for sentiment.
text_column = 'message'

# NOTE(review): not referenced by any cell visible in this section -- the
# processing loop only drops rows whose text column is null. Confirm whether
# rows with a null committer/repo were meant to be dropped as well.
non_nullable_columns = [
    text_column,
    'committer',
    'repo',
]

In [21]:
# Start from a clean slate: delete the CSV left behind by a previous run.
try:
    commits_out_path.unlink()
except FileNotFoundError:
    # Nothing to remove -- no earlier output exists.
    pass

In [None]:
# Hour-of-day boundaries (left-closed) and the label for each interval:
# [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 23) Evening,
# [23, 24) Night.
PART_OF_DAY_BINS = [0, 6, 12, 18, 23, 24]
PART_OF_DAY_LABELS = ["Night", "Morning", "Afternoon", "Evening", "Night"]


def _to_local_time(timestamps, tz_offsets):
    """Shift UTC timestamps by their recorded time-zone offset.

    Parameters
    ----------
    timestamps : pd.Series
        Values accepted by ``pd.to_datetime``; they are interpreted as UTC.
    tz_offsets : pd.Series
        Offsets convertible to float; ``offset / 100`` is taken as hours.

    Returns
    -------
    pd.Series
        Shifted timestamps. They still carry the UTC tz marker; only the
        wall-clock value is moved. Rows with a null offset become ``NaT``
        instead of raising.
    """
    # NOTE(review): if the time columns hold integer epoch values,
    # pd.to_datetime needs an explicit `unit=` -- confirm the column dtype.
    utc_times = pd.to_datetime(timestamps, utc=True)
    # NOTE(review): `/ 100` is only exact for whole-hour offsets. If the
    # offsets are encoded as +/-HHMM, a value such as 530 becomes 5.3 h
    # rather than 5 h 30 min -- confirm the encoding used by the dataset.
    offset_hours = tz_offsets.astype("float64") / 100
    return utc_times + pd.to_timedelta(offset_hours, unit="h")


def _part_of_day(local_times):
    """Bucket the hour of each timestamp into a part-of-day label."""
    return pd.cut(
        local_times.dt.hour,
        bins=PART_OF_DAY_BINS,
        labels=PART_OF_DAY_LABELS,
        right=False,
        ordered=False,  # required because "Night" appears twice
    )


# DataFrame.to_csv does not create missing parent directories.
commits_out_path.parent.mkdir(parents=True, exist_ok=True)

count = 0
# The first write truncates the file and emits the header; later writes append.
# Tracking this here (instead of testing whether the file exists) keeps a
# re-run from appending duplicate rows to a stale CSV.
write_header = True
for record_batch in data.iter_batches(batch_size=chunk_size):
    chunk = record_batch.to_pandas().dropna(subset=[text_column])
    if chunk.empty:
        # Every message in this batch was null: nothing to score or write.
        continue

    chunk[text_column] = chunk[text_column].apply(preprocess_text)

    sentiments = get_sentiment(chunk[text_column].tolist())
    chunk[f'{text_column}_sentiment'] = [
        label_map[sentiment['label']] for sentiment in sentiments
    ]

    # Column order matters: rows are appended to the CSV without a header.
    chunk["local_commit_time"] = _to_local_time(chunk["commit_time"], chunk["commit_tz"])
    chunk["local_author_time"] = _to_local_time(chunk["author_time"], chunk["author_tz"])
    chunk["part_of_day_commit"] = _part_of_day(chunk["local_commit_time"])
    chunk["part_of_day_author"] = _part_of_day(chunk["local_author_time"])

    chunk.to_csv(
        commits_out_path,
        mode='w' if write_header else 'a',
        index=False,
        header=write_header,
    )
    write_header = False

    count += chunk.shape[0]
    print(f"processed {count} commits")