In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd
import torch

In [20]:
# Path that both the classifier weights and the tokenizer are loaded from.
MODEL_PATH = './models/sentiment/'

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)

# Prefer the GPU when torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Inputs are truncated to 512 tokens and scored 256 texts per forward pass.
livetest = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=256,
    truncation=True,
    max_length=512,
)

# Translates the pipeline's LABEL_n ids into sentiment names.
# NOTE(review): the id -> name order is assumed to match the label ids used
# when the checkpoint was trained -- confirm against the training code.
label_map = dict(
    LABEL_0='negative',
    LABEL_1='neutral',
    LABEL_2='positive',
)

In [21]:
# Display the loaded model's max_position_embeddings, for comparison with the
# max_length=512 passed to the pipeline.
livetest.model.config.max_position_embeddings

512

In [22]:
# --- Run configuration -------------------------------------------------------
# Parquet file that is opened with pyarrow in the next cell.
INPUT_PARQUET = "./data/commits.parquet"

# Presumably the column whose text is fed to the sentiment pipeline; it is not
# referenced by any active cell yet -- TODO confirm once the scoring loop lands.
SENTIMENT_FIELD = "processed_message"

# Intended destination for the predictions (not written by any active cell yet).
OUTPUT_CSV = "./data/processed/commits.csv"

# Number of rows to pull from the parquet file per batch.
CHUNK_SIZE = 4_096

In [23]:
# Open the commits file through pyarrow's ParquetFile reader; the following
# cells inspect its metadata and schema.
data = pq.ParquetFile(INPUT_PARQUET)

In [24]:
# Show file-level Parquet metadata (writer version, column count, row count,
# number of row groups).
data.metadata

<pyarrow._parquet.FileMetaData object at 0x7625cc9f6bb0>
  created_by: parquet-cpp-arrow version 11.0.0
  num_columns: 22
  num_rows: 3439001
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 14080

In [25]:
# Show the Parquet schema: each column's name with its physical and logical type.
data.schema

<pyarrow._parquet.ParquetSchema object at 0x7625bfd28740>
required group field_id=-1 schema {
  optional binary field_id=-1 source (String);
  optional binary field_id=-1 repo (String);
  optional binary field_id=-1 hash (String);
  optional binary field_id=-1 parents (String);
  optional binary field_id=-1 author (String);
  optional int64 field_id=-1 author_time (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional binary field_id=-1 author_tz (String);
  optional int64 field_id=-1 local_author_time (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional binary field_id=-1 part_of_day_author (String);
  optional binary field_id=-1 committer (String);
  optional int64 field_id=-1 commit_time (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional binary field_id

16

In [None]:
# NOTE(review): DISABLED DRAFT -- this whole cell is commented out and does
# nothing when run. It sketches the chunked scoring loop but is not runnable
# as written:
#   * `commit_messages`, `i` and `chunk_size` are not defined in the visible
#     cells; the first loop never passes `chunk` (or SENTIMENT_FIELD) to the
#     pipeline.
#   * The first loop only computes `sentiments`; the second loop saves
#     `sentiments` without recomputing it per iteration.
#   * `save_path = Path()` is the current directory rather than OUTPUT_CSV,
#     so `save_path.exists()` cannot pick the header/append mode as intended.
#   * OUTPUT_CSV here differs from the value in the active config cell.
# Either finish it into a working cell or delete it before sharing.
#
# INPUT_PARQUET = "./data/commits.parquet"
# SENTIMENT_FIELD = "processed_message"
# OUTPUT_CSV = './data/commit_sentiments.csv'
# CHUNK_SIZE = 4096
# 
# data = pq.ParquetFile(INPUT_PARQUET)
# data.metadata
# data.schema
# 
# import time
# for chunk in data.iter_batches(batch_size=CHUNK_SIZE):
#     chunk: pd.DataFrame = chunk.to_pandas()
#     sentiments = livetest(commit_messages[i:i+chunk_size])
#     
# start_index = 0
# save_path = Path()
# for i in range(start_index, len(commit_messages), chunk_size):
#     
#     pred_labels = [label_map[pred['label']] for pred in sentiments]
#     pred_scores = [pred['score'] for pred in sentiments]
#     pred_df = pd.DataFrame({'predicted_sentiment': pred_labels, 'predicted_score': pred_scores})
#     pred_df.index += i
#     if save_path.exists():
#         pred_df.to_csv(save_path, header=False, mode='a')
#     else:
#         pred_df.to_csv(save_path, header=True, mode='w')
#     print(f"processed commits: {i+chunk_size}")
#     
#     
