In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd
import torch

In [3]:
# Base folder for the CSV/XLS inputs read below and for the `processed/` outputs.
DATA_PATH = Path('./data/')
# NOTE(review): not referenced in the visible cells — the "Commits" section
# defines its own MODEL_PATH string for the same folder.
SENTIMENT_MODEL_PATH = Path('./models/sentiment/')  # sentiment model folder

# Fine-tuning

In [29]:
# Read the semicolon-delimited `github_gold.csv` and preview its first rows.
gh = pd.read_csv(
    DATA_PATH / 'github_gold.csv',
    sep=';',
)
gh.head()

In [30]:
# Keep only the text and polarity columns, renamed to the `text` / `label`
# names that every dataset in this section is brought to.
gh = gh[['Text', 'Polarity']].rename(
    columns={"Text": "text", 'Polarity': 'label'},
)

gh.head()

In [31]:
# Read `JIRA.csv` with pandas' default (comma) separator and preview it.
jira = pd.read_csv(
    DATA_PATH / 'JIRA.csv',
)
jira.head()

In [32]:
# Mapping applied to the `oracle` values; values that are not keys of this
# dict are left unchanged by `Series.replace`.
jira_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
}

# Build the two-column frame in one chain. The previous version selected the
# columns and then assigned into the result (`jira['label'] = ...`), which
# makes pandas emit SettingWithCopyWarning, and used `rename(inplace=True)`
# on that column subset.
jira = (
    jira[['sentence', 'oracle']]
    .rename(columns={"sentence": "text", 'oracle': 'label'})
    .assign(label=lambda frame: frame['label'].replace(jira_labels))
)

jira.head()

In [33]:
# Read `StackOverflow.csv` with pandas' default (comma) separator and preview it.
so = pd.read_csv(
    DATA_PATH / 'StackOverflow.csv',
)
so.head()

In [34]:
# Mapping applied to the `oracle` values; values that are not keys of this
# dict are left unchanged by `Series.replace`.
so_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
}

# Select, rename and relabel in one chain so that no value is assigned into a
# column subset of the raw frame (the source of SettingWithCopyWarning) and
# nothing is mutated with `inplace=True`.
so = (
    so[['text', 'oracle']]
    .rename(columns={'oracle': 'label'})
    .assign(label=lambda frame: frame['label'].replace(so_labels))
)

so.head()

In [35]:
# Read `NewData.csv` with pandas' default (comma) separator and preview it.
so2 = pd.read_csv(
    DATA_PATH / 'NewData.csv',
)
so2.head()

In [36]:
# Select, rename and lower-case in one chain so that no value is assigned into
# a column subset of the raw frame (the source of SettingWithCopyWarning) and
# nothing is mutated with `inplace=True`.
# `str.lower` is applied element-wise, so a non-string label (e.g. a missing
# value) still raises TypeError here instead of passing through silently.
so2 = (
    so2[['text', 'oracle']]
    .rename(columns={'oracle': 'label'})
    .assign(label=lambda frame: frame['label'].apply(str.lower))
)

so2.head()

In [37]:
# Read the benchmark spreadsheet (first sheet, pandas' default) and preview it.
api = pd.read_excel(
    DATA_PATH / 'BenchmarkUddinSO-ConsoliatedAspectSentiment.xls',
)
api.head()

In [38]:
# Mapping applied to the `ManualLabel` values; values that are not keys of
# this dict are left unchanged by `Series.replace`.
api_labels = {
    'n': 'negative',
    'o': 'neutral',
    'p': 'positive',
}

# Select, rename and relabel in one chain so that no value is assigned into a
# column subset of the raw frame (the source of SettingWithCopyWarning) and
# nothing is mutated with `inplace=True`.
api = (
    api[['sent', 'ManualLabel']]
    .rename(columns={"sent": "text", 'ManualLabel': 'label'})
    .assign(label=lambda frame: frame['label'].replace(api_labels))
)

api.head()

In [41]:
from utils import preprocess_text

# Stack the five normalised datasets on a fresh 0..n-1 index.
finetuning = pd.concat(objs=[gh, jira, so, so2, api], ignore_index=True)

# Run every text through `preprocess_text` (imported from `utils`, not shown
# here) and lower-case every label so the datasets share one label spelling.
finetuning = finetuning.assign(
    text=finetuning['text'].apply(preprocess_text),
    label=finetuning['label'].apply(str.lower),
)

# Class distribution of the combined dataset.
finetuning['label'].value_counts()

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle repeatable.
# NOTE(review): the split is not stratified by label — consider
# `stratify=finetuning['label']` if the class balance of the test set matters.
finetuning_train, finetuning_test = train_test_split(finetuning, test_size=0.1, random_state=42)

# `DataFrame.to_csv` does not create missing parent folders, so make sure
# `processed/` exists before writing (a fresh checkout would otherwise fail).
processed_dir = DATA_PATH / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

finetuning_train.to_csv(processed_dir / "finetuning_train.csv", index=False)
finetuning_test.to_csv(processed_dir / "finetuning_test.csv", index=False)

print(f"Train: \t{len(finetuning_train)}")
print(f"Test: \t{len(finetuning_test)}")

# Evaluation

In [8]:
# Reload the held-out split from disk and write it back out unchanged as the
# evaluation set.
finetuning_test = pd.read_csv(DATA_PATH / "processed/finetuning_test.csv")
finetuning_test.to_csv(
    DATA_PATH / "processed/evaluation.csv",
    index=False,
)

# Text-only copy, written without index or header row: this is the file the
# manual SentiStrength-SE step takes as input.
finetuning_test_only_text = finetuning_test[['text']]
finetuning_test_only_text.to_csv(
    DATA_PATH / "processed/evaluation_only_text.csv",
    header=False,
    index=False,
)

finetuning_test_only_text.head()

In [10]:
# Manual step: run SentiStrength-SE on processed/evaluation_only_text.csv and save
# its tab-separated output as processed/evaluation_ssse_preds.csv before continuing.

In [13]:
# Read the saved SentiStrength-SE output as tab-separated text without a
# header row.
ssse_preds = pd.read_csv(
    DATA_PATH / "processed/evaluation_ssse_preds.csv",
    sep='\t',
    header=None,
)
# Name the columns; this assignment raises if the file does not contain
# exactly two columns.
ssse_preds.columns = ['text', 'label']

ssse_preds.head()

In [17]:
# Integer verdict -> label string; integers that are not keys of this dict
# are left unchanged by `Series.replace`.
ssse_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
}

# Each raw `label` cell is a whitespace-separated string: keep only its last
# token, parse it as an int, then translate it with the mapping above.
ssse_preds['label'] = (
    ssse_preds['label']
    .apply(lambda raw: int(raw.split()[-1]))
    .replace(ssse_labels)
)

ssse_preds.head()

In [18]:
# Show how many rows fall under each value of the mapped `label` column.
ssse_preds['label'].value_counts()

In [20]:
# Persist the mapped predictions as a comma-separated file with a header row.
# NOTE(review): this is the same path the earlier cell reads with sep='\t' and
# header=None, so the raw SentiStrength-SE output is overwritten; on a re-run
# that cell will no longer find the two tab-separated columns it expects.
# Consider writing to a separate file name.
ssse_preds.to_csv(DATA_PATH / "processed/evaluation_ssse_preds.csv", index=False, header=True, sep=',')

# Commits

In [1]:
# Folder holding the saved model and tokenizer files.
MODEL_PATH = './models/sentiment/'
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Classification pipeline: inputs are truncated to 512 tokens and processed
# 256 texts per batch.
livetest = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=256,
    truncation=True,
    max_length=512,
)

# Translate pipeline label names into sentiment names.
# NOTE(review): assumes the model config keeps the default LABEL_<id> names and
# that ids 0/1/2 mean negative/neutral/positive — confirm against the label
# encoding used for fine-tuning.
label_map = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
}

In [21]:
# Display the model's configured `max_position_embeddings`, for comparison
# with the max_length=512 passed to the pipeline.
livetest.model.config.max_position_embeddings

In [22]:
# Settings for the commit-message run.
# NOTE(review): only INPUT_PARQUET is used by the visible active cells; the
# other three appear only in the commented-out draft further down.
INPUT_PARQUET = "./data/commits.parquet"  # parquet file opened with pq.ParquetFile
SENTIMENT_FIELD = "processed_message"  # presumably the column to classify — TODO confirm
OUTPUT_CSV = './data/processed/commits.csv'  # presumably where predictions will be written
CHUNK_SIZE = 4096  # presumably rows per processed batch — TODO confirm

In [23]:
# Open the commits file through pyarrow's ParquetFile reader; the following
# cells display its metadata and schema.
data = pq.ParquetFile(INPUT_PARQUET)

In [24]:
# Display the parquet file's metadata object.
data.metadata

In [25]:
# Display the parquet file's schema.
data.schema

In [None]:
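# NOTE(review): unfinished draft kept for reference — it cannot run as written:
# `commit_messages`, `chunk_size` and `i` are used without being defined,
# `sentiments` is computed in a different loop from the one that consumes it,
# `save_path = Path()` is the current directory rather than a CSV file, and
# OUTPUT_CSV differs from the value set in the configuration cell above.
# INPUT_PARQUET = "./data/commits.parquet"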
# SENTIMENT_FIELD = "processed_message"
# OUTPUT_CSV = './data/commit_sentiments.csv'
# CHUNK_SIZE = 4096
# 
# data = pq.ParquetFile(INPUT_PARQUET)
# data.metadata
# data.schema
# 
# import time
# for chunk in data.iter_batches(batch_size=CHUNK_SIZE):
#     chunk: pd.DataFrame = chunk.to_pandas()
#     sentiments = livetest(commit_messages[i:i+chunk_size])
#     
# start_index = 0
# save_path = Path()
# for i in range(start_index, len(commit_messages), chunk_size):
#     
#     pred_labels = [label_map[pred['label']] for pred in sentiments]
#     pred_scores = [pred['score'] for pred in sentiments]
#     pred_df = pd.DataFrame({'predicted_sentiment': pred_labels, 'predicted_score': pred_scores})
#     pred_df.index += i
#     if save_path.exists():
#         pred_df.to_csv(save_path, header=False, mode='a')
#     else:
#         pred_df.to_csv(save_path, header=True, mode='w')
#     print(f"processed commits: {i+chunk_size}")
#     
#     
