In [None]:
import json
import re
from tqdm.notebook import tqdm
from copy import deepcopy
import pandas as pd

In [None]:
def jsonload(fname, encoding="utf-8"):
    """Parse the JSON document stored in a file.

    Args:
        fname: Path of the file to read.
        encoding: Text encoding handed to ``open``; UTF-8 unless overridden.

    Returns:
        The object ``json.load`` builds from the file's content.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)


# Serialize a JSON-compatible object to the file at the given path
def jsondump(j, fname, encoding="utf-8"):
    """Write ``j`` to ``fname`` as JSON, keeping non-ASCII characters unescaped.

    Args:
        j: Object to serialize with ``json.dump``.
        fname: Path of the file to create or overwrite.
        encoding: Text encoding of the output file. Defaults to UTF-8, which is
            what this function always wrote before the parameter existed; it
            mirrors the ``encoding`` argument of the loader functions.
    """
    with open(fname, "w", encoding=encoding) as f:
        # ensure_ascii=False stores characters such as Hangul as-is instead of
        # \uXXXX escapes, so the chosen encoding must be able to represent them.
        json.dump(j, f, ensure_ascii=False)

        
# Read a JSONL file into a list (one parsed object per line)
def jsonlload(fname, encoding="utf-8"):
    """Load a JSON Lines file.

    Args:
        fname: Path of the JSONL file to read.
        encoding: Text encoding handed to ``open``; UTF-8 unless overridden.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate the handle directly so the file is streamed line by line
        # instead of being materialised twice via readlines().
        for line in f:
            # A blank (whitespace-only) line carries no record; json.loads
            # would raise on it, so skip it rather than abort the whole load.
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list

In [None]:
# Train split: file location, then every JSONL line parsed into one list entry.
train_file_path = "./Original_data/nikluge-sa-2022-train.jsonl"
train_data = jsonlload(train_file_path)

# Dev split, loaded the same way.
dev_file_path = "./Original_data/nikluge-sa-2022-dev.jsonl"
dev_data = jsonlload(dev_file_path)

In [None]:
# Append the dev examples to the train list in place, so the two splits are
# handled as one pool from here on.
# NOTE: not idempotent — running this cell twice appends dev_data twice.
train_data += dev_data

In [None]:
# Preview the first few merged examples; echoing the whole list would put
# every record of the dataset into the cell output.
train_data[:3]

In [None]:
# Number of examples in train_data (the train list extended with dev_data).
print(len(train_data))

In [None]:
# Preview the first few merged examples; echoing the whole list would put
# every record of the dataset into the cell output.
train_data[:3]

In [None]:
# Polarity label names (the variable name suggests the list index serves as the label id).
polarity_id_to_name = ['positive', 'negative', 'neutral']
# One zero-initialised counter per label, keyed in the same order as the list.
polarity_count = dict.fromkeys(polarity_id_to_name, 0)

In [None]:
# Tally how often each polarity label occurs across all annotations of the
# merged data. The counter is reset here so that re-running this cell starts
# from zero instead of adding to the previous tally.
polarity_count = {polarity: 0 for polarity in polarity_id_to_name}
for pair in train_data:
    for detail in pair["annotation"]:
        # The third annotation field is the value compared against the
        # polarity names; anything that is not one of them is ignored.
        if detail[2] in polarity_id_to_name:
            polarity_count[detail[2]] += 1

In [None]:
# Show the per-label annotation counts.
polarity_count

In [None]:
# Positions (within train_data) of the examples that carry more than one annotation.
double_index = [
    position
    for position, example in enumerate(train_data)
    if len(example["annotation"]) > 1
]

In [None]:
# Positions (within train_data) of the examples that carry exactly one annotation.
single_index = [
    position
    for position, example in enumerate(train_data)
    if len(example["annotation"]) == 1
]

In [None]:
# How many examples have more than one annotation.
print(len(double_index))

In [None]:
# How many examples have exactly one annotation.
print(len(single_index))

In [None]:
# Total number of examples, for comparison with the two index counts above
# (they add up to this only if no example has an empty annotation list).
print(len(train_data))

In [None]:
# Independent deep copies of the examples that carry exactly one annotation,
# in dataset order, so later edits to them never touch train_data.
# Selecting by single_index (rather than copying everything and deleting the
# multi-annotation positions) also leaves out any example whose annotation
# list is empty; such an example would break the pair["annotation"][0]
# accesses performed on this list further down.
# sorted() keeps dataset order even if single_index was re-ordered elsewhere.
single_index_train_data = [deepcopy(train_data[i]) for i in sorted(single_index)]

In [None]:
# Independent deep copies of the examples that carry more than one annotation,
# in dataset order. Selecting by double_index directly avoids copying the whole
# dataset and deleting entries one by one, and does not re-order single_index
# as a side effect.
# sorted() keeps dataset order even if double_index was re-ordered elsewhere.
double_index_train_data = [deepcopy(train_data[i]) for i in sorted(double_index)]

In [None]:
# Size of the single-annotation subset.
print(len(single_index_train_data))

In [None]:
# Preview the first few single-annotation examples; echoing the whole list
# would put every record of the subset into the cell output.
single_index_train_data[:3]

In [None]:
# Size of the single-annotation subset (same check as the earlier cell).
print(len(single_index_train_data))

In [None]:
# Peek at the third field of the first example's only annotation — the value
# that the counting cells compare against the polarity names.
single_index_train_data[0]["annotation"][0][2]

In [None]:
# Re-label the negative/neutral single-annotation examples with the
# document-level sentiment returned by the Naver Cloud sentiment-analysis
# endpoint.
# NOTE: this mutates single_index_train_data in place and issues one HTTP
# request per matching example, so re-running the cell repeats the calls for
# every example that is still labelled negative or neutral.
import os
import sys
import requests
import json

# Credentials are read from the environment instead of being committed with
# the notebook: set NCP_APIGW_API_KEY_ID and NCP_APIGW_API_KEY before running
# (a missing variable raises KeyError here, before any request is sent).
# The key pair that used to be hard-coded in this cell has been exposed and
# should be revoked and re-issued.
client_id = os.environ["NCP_APIGW_API_KEY_ID"]
client_secret = os.environ["NCP_APIGW_API_KEY"]
url="https://naveropenapi.apigw.ntruss.com/sentiment-analysis/v1/analyze"
headers = {
    "X-NCP-APIGW-API-KEY-ID": client_id,
    "X-NCP-APIGW-API-KEY": client_secret,
    "Content-Type": "application/json"
}

for pair in tqdm(single_index_train_data):
    # Positive labels are kept as annotated; only the other two classes are
    # sent to the service for a second opinion.
    if pair["annotation"][0][2] in ("negative", "neutral"):
        data = {
          "content": pair["sentence_form"]
        }
        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.post(url, data=json.dumps(data), headers=headers, timeout=30)
        except requests.RequestException as exc:
            # Same best-effort policy as for non-200 answers: report and keep
            # the original label for this example.
            print("Error : " + str(exc))
            continue
        if response.status_code == 200:
            # Parse the body as JSON. eval() would execute whatever text the
            # server returned and fails on JSON literals (true/false/null).
            text = response.json()
            pair["annotation"][0][2] = text["document"]["sentiment"]
        else:
            print("Error : " + response.text)

In [None]:
# Size of train_data; the relabelling loop works on a deep copy, so this list
# itself is not changed by it.
print(len(train_data))

In [None]:
# Polarity label names used as counter keys.
polarity_id_to_name = ['positive', 'negative', 'neutral']

# Two independent zeroed tallies: one for the single-annotation subset and one
# for the multi-annotation subset.
polarity_single_count = dict.fromkeys(polarity_id_to_name, 0)
polarity_double_count = dict.fromkeys(polarity_id_to_name, 0)

In [None]:
# Polarity distribution of the single-annotation subset, using the labels as
# they currently stand in single_index_train_data.
# The counter is reset here so that re-running this cell starts from zero
# instead of adding to the previous tally.
polarity_single_count = {polarity: 0 for polarity in polarity_id_to_name}
for pair in single_index_train_data:
    for detail in pair["annotation"]:
        # Labels that are not one of the known polarity names are ignored.
        if detail[2] in polarity_id_to_name:
            polarity_single_count[detail[2]] += 1
polarity_single_count

In [None]:
# Polarity distribution over every annotation of the multi-annotation subset.
# The counter is reset here so that re-running this cell starts from zero
# instead of adding to the previous tally.
polarity_double_count = {polarity: 0 for polarity in polarity_id_to_name}
for pair in double_index_train_data:
    for detail in pair["annotation"]:
        # Labels that are not one of the known polarity names are ignored.
        if detail[2] in polarity_id_to_name:
            polarity_double_count[detail[2]] += 1
polarity_double_count

In [None]:
# Recount the polarity labels over train_data from a fresh, zeroed counter.
polarity_count = dict.fromkeys(polarity_id_to_name, 0)
for pair in train_data:
    for detail in pair["annotation"]:
        label = detail[2]
        # Only labels that match one of the known polarity names are counted.
        if label in polarity_id_to_name:
            polarity_count[label] += 1
polarity_count

In [None]:
# Empty table with one column per field kept for each single-annotation example.
df = pd.DataFrame(
    columns=[
        "id",
        "sentence_form",
        "entity",
        "polarity",
    ]
)

In [None]:
# Show the frame (still empty at this point: only the column headers exist).
df

In [None]:
# Flatten every single-annotation example into one row: its id, its sentence,
# and the first and third fields of its only annotation (entity and polarity).
# All rows are collected first and the frame is created once; enlarging a
# DataFrame cell by cell with .at re-allocates it for every new row.
# dtype=object keeps the values as the plain Python objects they were before.
df = pd.DataFrame(
    [
        {
            "id": pair["id"],
            "sentence_form": pair["sentence_form"],
            "entity": pair["annotation"][0][0],
            "polarity": pair["annotation"][0][2],
        }
        for pair in single_index_train_data
    ],
    columns=["id", "sentence_form", "entity", "polarity"],
    dtype=object,
)

In [None]:
# Show the populated frame (one row per single-annotation example).
df

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the polarity column so both parts keep the same label proportions.
target = df["polarity"]

# Shuffled 90/10 split with a fixed seed, so the same rows land in each part on every run.
df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    random_state=34,
    shuffle=True,
    stratify=target,
)

In [None]:
# Discard the row labels inherited from df so that each split is numbered 0..n-1.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [None]:
def _frame_to_records(frame):
    """Turn each row of ``frame`` into a record dict ready to be written as JSONL.

    Every record holds the row's ``id`` and ``sentence_form`` plus a single
    annotation ``[entity, [], polarity]``; the middle field is always written
    as an empty list.

    The column values are iterated directly, so the result does not depend on
    the frame's index labels running from 0 to n-1.
    """
    return [
        {"id": row_id, "sentence_form": sentence, "annotation": [[entity, [], polarity]]}
        for row_id, sentence, entity, polarity in zip(
            frame["id"], frame["sentence_form"], frame["entity"], frame["polarity"]
        )
    ]


# Same conversion for both splits, in row order.
train = _frame_to_records(df_train)
dev = _frame_to_records(df_val)

In [None]:
def jsonl_store(raw_data, fname, encoding="utf-8"):
    """Write the items of ``raw_data`` to a file as JSON Lines.

    Args:
        raw_data: Iterable of JSON-serializable items; each one becomes a line.
        fname: Destination path; it is converted to text before being opened,
            and an existing file is overwritten.
        encoding: Text encoding of the output file; UTF-8 unless overridden.
    """
    with open(str(fname), mode="w", encoding=encoding) as sink:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        sink.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in raw_data
        )

In [None]:
# Output locations for the two splits.
# NOTE(review): the file names suggest a 9:1 split, emojis left in place and
# CLOVA-based labels — that is naming only, nothing in this cell enforces it.
train_name = "./Preprocessed_data/asc/train_asc_9_1_emoji_not_removed_clova.jsonl"
dev_name = "./Preprocessed_data/asc/dev_asc_9_1_emoji_not_removed_clova.jsonl"

In [None]:
# Write both splits as JSONL, one record per line.
# The target directory must already exist: jsonl_store opens the path with
# mode "w", which creates the file but not missing parent directories.
jsonl_store(train, train_name)
jsonl_store(dev, dev_name)