## Everything Beforehand

In [1]:
# Mount Google Drive at /content/drive; later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# NOTE(review): a `!` command runs in its own subshell, so this `source` presumably does
# not activate the environment for the running kernel — TODO confirm and drop if unneeded.
!source /content/drive/MyDrive/colab_env/bin/activate

import sys
# Make the packages stored in the Drive-hosted environment importable from this kernel.
sys.path.append("/content/drive/MyDrive/colab_env/lib/python3.10/site-packages")

In [None]:
!pip install transformers==4.30.2

In [5]:
import os
import re
import json
import emoji
import spacy
import locale
import warnings
import pandas as pd
import transformers
import spacy_transformers
import matplotlib.pyplot as plt
from io import StringIO
from spacy.tokens import DocBin
from transformers import pipeline
from spacy.training import Example
from sklearn.model_selection import train_test_split
from spacy.pipeline.textcat_multilabel import Config, multi_label_cnn_config

In [6]:
# Render inline matplotlib figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [7]:
# Force every caller of locale.getpreferredencoding to see UTF-8.
# The real function accepts an optional `do_setlocale` argument, so the replacement
# accepts (and ignores) any arguments instead of raising TypeError for such callers.
locale.getpreferredencoding = lambda *args, **kwargs: "UTF-8"

In [8]:
# Silence every warning for the rest of the session (this also hides pandas/transformers
# deprecation and copy warnings raised by later cells).
warnings.filterwarnings('ignore')

## Preprocessing

In [None]:
# Keep a handle on the emoji package's EMOJI_DATA table.
# NOTE(review): `emoji_list` is not referenced by any other visible cell.
emoji_list = emoji.EMOJI_DATA

In [19]:
def get_json(file_path):
    """Read the UTF-8 file at ``file_path`` and return its parsed JSON content.

    Parsing is non-strict, so raw control characters inside JSON strings are accepted.
    """
    with open(file_path, mode = 'r', encoding = 'utf-8') as handle:
        raw = handle.read()
    return json.loads(raw, strict = False)

In [None]:
def rmemoji(text):
    """Return ``text`` with every emoji removed and surrounding whitespace stripped."""
    without_emoji = emoji.replace_emoji(text, '')
    return without_emoji.strip()

def emoji2description(text):
    """Return ``text`` with each emoji replaced by its space-padded ``'zh'`` description."""
    def describe(chars, data_dict):
        # Underscores become spaces; leading/trailing ':' characters are stripped.
        words = data_dict['zh'].replace('_', ' ')
        return ' ' + words.strip(':') + ' '
    return emoji.replace_emoji(text, replace = describe)

def remove_punctuation_regex(input_string):
    """Delete every character that is neither a word character nor whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', input_string)

In [9]:
# All relative paths in the following cells resolve against this project directory.
os.chdir('/content/drive/MyDrive/Capstone/Spacy')

In [None]:
# Load the chat messages; later cells read its `text` column.
df = pd.read_csv('./data/df_emoji.csv')

In [None]:
# Drop repeated messages (keeping the last occurrence) and rows without usable text,
# printing the row count before and after.
print(len(df))
df = df.drop_duplicates(subset = ['text'], keep = 'last')
# Series.replace matches whole cell values: only a text that is exactly ' ' becomes ''.
df['text'] = df['text'].replace(' ', '')
has_text = df['text'].notnull() & (df['text'] != '')
df = df[has_text]
print(len(df))

## Labelling Data

In [None]:
# Coerce every text cell to str so the classifiers below always receive strings.
df['text'] = df['text'].astype(str)

In [None]:
# Move the current row labels into a regular `index` column (later cells use df['index']
# as a row identifier) and renumber the rows, which the range-based loops below rely on.
df.reset_index(inplace = True)

In [None]:
# Model 1: multilingual DistilBERT sentiment classifier, run on GPU 0.
clf = pipeline('text-classification', model = "lxyuan/distilbert-base-multilingual-cased-sentiments-student", device = 0)

# Rows keep '' in both columns unless the top prediction scores above 0.5.
df['label_1'] = ''
df['score_1'] = ''

for i in range(len(df)):
    if not isinstance(df['text'][i], str):
        print(f"Ignoring invalid data: {df['text'][i]}")
        continue

    top = clf(df['text'][i])[0]
    if top['score'] > 0.5:
        df.loc[i, 'label_1'] = top['label']
        df.loc[i, 'score_1'] = top['score']

config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Model 2: Erlangshen RoBERTa sentiment classifier, run on GPU 0.
clf = pipeline('text-classification', model = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment", device = 0)

# Rows keep '' in both columns unless the top prediction scores above 0.5.
df['label_2'] = ''
df['score_2'] = ''

for i in range(len(df)):
    if not isinstance(df['text'][i], str):
        print(f"Ignoring invalid data: {df['text'][i]}")
        continue

    top = clf(df['text'][i])[0]
    if top['score'] > 0.5:
        df.loc[i, 'label_2'] = top['label']
        df.loc[i, 'score_2'] = top['score']

config.json:   0%|          | 0.00/786 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]



In [None]:
# Model 3: zero-shot classifier scored against the three sentiment labels, run on GPU 0.
clf = pipeline("zero-shot-classification", model = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli", device = 0)

candidate_labels = ['positive', 'negative', 'neutral']
# Rows keep '' in both columns unless the best-ranked label scores above 0.5.
df['label_3'] = ''
df['score_3'] = ''

for i in range(len(df)):
    if not isinstance(df['text'][i], str):
        print(f"Ignoring invalid data: {df['text'][i]}")
        continue

    ranked = clf(df['text'][i], candidate_labels, multi_label = False)
    best_label, best_score = ranked['labels'][0], ranked['scores'][0]
    if best_score > 0.5:
        df.loc[i, 'label_3'] = best_label
        df.loc[i, 'score_3'] = best_score

    # Progress is reported only for valid rows whose position is a multiple of 10000.
    if i % 10000 == 0:
        print(f'Progress: {i/len(df) * 100} %')

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Progress: 0.0 %
Progress: 10.619431435640935 %
Progress: 21.23886287128187 %
Progress: 31.858294306922808 %
Progress: 42.47772574256374 %
Progress: 53.09715717820468 %
Progress: 63.716588613845616 %
Progress: 74.33602004948655 %
Progress: 84.95545148512748 %
Progress: 95.57488292076842 %


In [None]:
# Persist the labels and scores from the three models; the next section reloads this file.
df.to_csv('./Chat_Layer/label_df.csv', index = False)

### Prepare the training data

In [10]:
# Reload the labelled messages written by the labelling section.
df = pd.read_csv('./Chat_Layer/label_df.csv')

In [11]:
# Display the reloaded frame (the rendering below is truncated by pandas).
df

Unnamed: 0,index,file,user,time,text,label_1,score_1,label_2,score_2,label_3,score_3
0,0,chat_1Xw3Kmbd_a8.json,章魚燒,1:25,小温加油！美津濃加油🔥🔥,positive,0.720334,Positive,0.999955,neutral,0.535499
1,4,chat_1Xw3Kmbd_a8.json,Ppppppj,2:31,阿甘真的加了,positive,0.643465,Positive,0.998268,positive,0.919085
2,5,chat_1Xw3Kmbd_a8.json,makubex7788,3:58,⚡⚡☁☁💪💪💪,,,Negative,0.942496,,
3,6,chat_1Xw3Kmbd_a8.json,birdsban,5:19,阿甘還沒上場,,,Negative,0.878662,,
4,7,chat_1Xw3Kmbd_a8.json,Xiang yun Han,5:58,發哥害羞還真可愛,positive,0.762739,Positive,0.999926,positive,0.827649
...,...,...,...,...,...,...,...,...,...,...,...
94162,121306,chat_SAgpRa1oSUA.json,村田千夜,2:42:45,主播賽評轉播單位辛苦了,,,Positive,0.999976,negative,0.643242
94163,121307,chat_SAgpRa1oSUA.json,啟恆,2:42:48,中原斷電 只能靠太陽能,,,Negative,0.883617,,
94164,121308,chat_SAgpRa1oSUA.json,村田千夜,2:42:57,臺產超棒,positive,0.977848,Positive,0.999972,positive,0.975363
94165,121309,chat_SAgpRa1oSUA.json,Chen Darren,2:43:13,你很會說欸,positive,0.612695,Positive,0.997750,,


In [12]:
# Model 2 emits capitalised labels; lower-case them to match models 1 and 3.
# `replace` leaves values that are not in the mapping untouched, so running this cell
# again keeps the already-normalised labels (`map` would turn them into NaN).
df['label_2'] = df['label_2'].replace({'Positive': 'positive', 'Negative': 'negative'})

# Keep only rows on which the models agree. The neutral subset uses models 1 and 3 only;
# label_2 is not consulted for it.
pos = df[(df[['label_1', 'label_2', 'label_3']] == 'positive').all(axis = 1)]
neg = df[(df[['label_1', 'label_2', 'label_3']] == 'negative').all(axis = 1)]
neu = df[(df[['label_1', 'label_3']] == 'neutral').all(axis = 1)]

In [13]:
def _consensus_frame(rows, label, score_columns):
    """Build the (index, TEXT, LABEL, SCORE) frame for one sentiment class.

    SCORE is the arithmetic mean of ``score_columns`` for each row of ``rows``.
    """
    total = rows[score_columns[0]]
    for column in score_columns[1:]:
        total = total + rows[column]
    return pd.DataFrame({
        'index': rows['index'],
        'TEXT': rows['text'],
        'LABEL': label,
        'SCORE': total / len(score_columns)
    })


df_pos = _consensus_frame(pos, 'positive', ['score_1', 'score_2', 'score_3'])
df_neg = _consensus_frame(neg, 'negative', ['score_1', 'score_2', 'score_3'])
# The neutral subset was selected from models 1 and 3 only, so only their scores are averaged.
df_neu = _consensus_frame(neu, 'neutral', ['score_1', 'score_3'])

### Selecting Training data

In [27]:
# Rank each class by its averaged score, highest first.
df_pos = df_pos.sort_values(by = 'SCORE', ascending = False)  ## PR50 = 0.884149  ## Mean = 0.870227  ## Num = 21946
df_neg = df_neg.sort_values(by = 'SCORE', ascending = False)  ## PR50 = 0.833475  ## Mean = 0.828920  ## Num = 11059
df_neu = df_neu.sort_values(by = 'SCORE', ascending = False)  ## PR50 = 0.639869  ## Mean = 0.650717  ## Num = 1018

In [28]:
# Ensure SCORE is a float column in each of the three class frames.
for frame in (df_pos, df_neg, df_neu):
    frame['SCORE'] = frame['SCORE'].astype(float)

In [29]:
# Count how many rows of each class exceed a candidate score threshold.
for frame, threshold in ((df_pos, 0.9), (df_neg, 0.9), (df_neu, 0.75)):
    print (len(frame[frame['SCORE'] > threshold]))

9944
2404
149


In [34]:
# Take the top-scoring rows of each (already sorted) class as training data.
# NOTE(review): the trailing numbers are presumably the SCORE at each cut-off — TODO confirm.
train_pos = df_pos.head(2000)  ## 0.981506
train_neg = df_neg.head(2000)  ## 0.910042
train_neu = df_neu.head(150)   ## 0.749801

In [37]:
# The two CSV files written by the next cells land in this directory.
os.chdir('/content/drive/MyDrive/Capstone/Spacy/Chat_Layer/data')

In [38]:
# All consensus-labelled rows (every class, every score), stacked positive/negative/neutral.
label_df = pd.concat([df_pos, df_neg, df_neu])
label_df.to_csv('label_df.csv', index = False)

In [39]:
# Only the top-scoring rows selected above; this frame feeds the training section.
train_df = pd.concat([train_pos, train_neg, train_neu])
train_df.to_csv('train_df.csv', index = False)

## Training Process

In [None]:
# !pip install transformers==4.30.2
# print(transformers.__version__)

4.30.2


In [41]:
def get_json(file_path):
    """Read the UTF-8 file at ``file_path`` and return its parsed JSON content.

    Parsing is non-strict, so raw control characters inside JSON strings are accepted.
    An identical definition also appears in the Preprocessing section of this notebook.
    """
    with open(file_path, mode = 'r', encoding = 'utf-8') as handle:
        raw = handle.read()
    return json.loads(raw, strict = False)

In [42]:
# These three helpers repeat the definitions from the Preprocessing section.
def rmemoji(text):
    """Return ``text`` with every emoji removed and surrounding whitespace stripped."""
    without_emoji = emoji.replace_emoji(text, '')
    return without_emoji.strip()

def emoji2description(text):
    """Return ``text`` with each emoji replaced by its space-padded ``'zh'`` description."""
    def describe(chars, data_dict):
        # Underscores become spaces; leading/trailing ':' characters are stripped.
        words = data_dict['zh'].replace('_', ' ')
        return ' ' + words.strip(':') + ' '
    return emoji.replace_emoji(text, replace = describe)

def remove_punctuation_regex(input_string):
    """Delete every character that is neither a word character nor whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', input_string)

#### Remove Emoji

In [43]:
# Variant of the training set with every emoji stripped from TEXT.
df_rmemoji = train_df.copy()
print(len(df_rmemoji))
# A text that is exactly ' ' is blanked so the filter below drops it with the empty ones.
df_rmemoji['TEXT'] = df_rmemoji['TEXT'].astype(str).apply(rmemoji).replace(' ', '')
has_text = df_rmemoji['TEXT'].notnull() & (df_rmemoji['TEXT'] != '')
df_rmemoji = df_rmemoji[has_text]
print(len(df_rmemoji))

4150
4150


#### Emoji to Description

In [44]:
# Variant of the training set in which every emoji is replaced by its description.
df_emoji2desc = train_df.copy()
print(len(df_emoji2desc))
df_emoji2desc['TEXT'] = df_emoji2desc['TEXT'].astype(str).apply(emoji2description)
# Blank out texts that are exactly ' ' so the filter below drops them. This must act on
# df_emoji2desc itself (not on df_rmemoji), otherwise this variant is never cleaned.
df_emoji2desc['TEXT'] = df_emoji2desc['TEXT'].replace(' ', '')
df_emoji2desc = df_emoji2desc[(df_emoji2desc['TEXT'] != '') & (df_emoji2desc['TEXT'].notnull())]
print(len(df_emoji2desc))

4150
4150


#### Remove Punctuation

In [45]:
# Variant of the training set with punctuation (every non-word, non-space character) removed.
df_rmpunc = train_df.copy()
print(len(df_rmpunc))
df_rmpunc['TEXT'] = df_rmpunc['TEXT'].astype(str).apply(remove_punctuation_regex)
# Blank out texts that are exactly ' ' (e.g. "! " after stripping) so the filter below
# drops them. This must act on df_rmpunc itself, not on df_rmemoji.
df_rmpunc['TEXT'] = df_rmpunc['TEXT'].replace(' ', '')
df_rmpunc = df_rmpunc[(df_rmpunc['TEXT'] != '') & (df_rmpunc['TEXT'].notnull())]
print(len(df_rmpunc))

4150
4150


### Load Model

In [None]:
# !python -m spacy download zh_core_web_trf  ## Blank model ## Only Download when opening a new task

In [None]:
# Load the Chinese transformer pipeline and attach a multi-label text categoriser to it.
nlp = spacy.load('zh_core_web_trf')

# Component settings come from the config string imported from spacy.pipeline.textcat_multilabel.
config = Config().from_str(multi_label_cnn_config)
text_cat = nlp.add_pipe("textcat_multilabel", config = config)

# Register the three sentiment classes on the new component.
labels = ['positive', 'negative', 'neutral']
for label in labels:
    text_cat.add_label(label)

### Build the spaCy training dataset

In [None]:
# One-hot encode LABEL, then pair every TEXT with its {'cats': {label: flag}} annotation.
one_hot = pd.get_dummies(train_df.LABEL)
y = one_hot.to_dict('index')

dataset = [(text, {'cats': cats}) for text, cats in zip(train_df['TEXT'], y.values())]

In [None]:
# Split the frame and the (text, cats) pairs in one call so both use the same 70/30
# row assignment. `train_df` is rebound to its 70% share here, so re-running this cell
# splits that share again.
train_df, test_df, train_data, test_data = train_test_split(
    train_df, dataset, train_size = 0.7, random_state = 42
)

In [None]:
# Row identifiers of the held-out split; used below to restrict the preprocessed variants.
ind = list(test_df['index'])

In [None]:
# Keep only the held-out rows in each preprocessed variant.
df_rmemoji = df_rmemoji.loc[df_rmemoji['index'].isin(ind)]
df_emoji2desc = df_emoji2desc.loc[df_emoji2desc['index'].isin(ind)]
df_rmpunc = df_rmpunc.loc[df_rmpunc['index'].isin(ind)]

In [None]:
# (text, {'cats': ...}) pairs for the emoji-removed variant.
one_hot_rmemoji = pd.get_dummies(df_rmemoji.LABEL)
y_rmemoji = one_hot_rmemoji.to_dict('index')

dataset_rmemoji = [(text, {'cats': cats}) for text, cats in zip(df_rmemoji['TEXT'], y_rmemoji.values())]

In [None]:
# Peek at the first (text, annotation) pair as a sanity check.
dataset_rmemoji[0]

In [None]:
# (text, {'cats': ...}) pairs for the emoji-to-description variant.
one_hot_emoji2desc = pd.get_dummies(df_emoji2desc.LABEL)
y_emoji2desc = one_hot_emoji2desc.to_dict('index')

dataset_emoji2desc = [(text, {'cats': cats}) for text, cats in zip(df_emoji2desc['TEXT'], y_emoji2desc.values())]

In [None]:
# Peek at the first (text, annotation) pair as a sanity check.
dataset_emoji2desc[0]

In [None]:
# (text, {'cats': ...}) pairs for the punctuation-removed variant.
y_rmpunc = pd.get_dummies(df_rmpunc.LABEL)
y_rmpunc = y_rmpunc.to_dict('index')

# The text column of this frame is named 'TEXT' (upper case), as in the other variants;
# a lower-case 'text' key does not exist here and would raise KeyError.
dataset_rmpunc = list(zip(df_rmpunc['TEXT'],[{'cats': cats} for cats in y_rmpunc.values()]))

In [None]:
# Peek at the first (text, annotation) pair as a sanity check.
dataset_rmpunc[0]

In [None]:
def make_docs(data):
    """Turn (text, annotations) pairs into tokenised Docs carrying category labels.

    Each pair whose text is a str and whose annotations is a dict is tokenised with the
    notebook-level ``nlp`` and given ``doc.cats`` from ``annotations["cats"]`` (an empty
    dict when that key is missing). Any other pair is reported on stdout and skipped.
    Returns the list of Docs in input order.
    """
    docs = []
    for text, annotations in data:
        if not (isinstance(text, str) and isinstance(annotations, dict)):
            print(f"Ignoring invalid data: {text}, {annotations}")
            continue
        doc = nlp.make_doc(text)
        doc.cats = annotations.get("cats", {})
        docs.append(doc)
    return docs

In [None]:
# NOTE(review): earlier cells work under /content/drive/MyDrive/Capstone/Spacy/...;
# this path has no `Spacy` segment — confirm it is the intended working directory.
os.chdir('/content/drive/MyDrive/Capstone/Chat_Layer')

In [None]:
# Convert the training split to Docs and serialise them to ./data/train.spacy.
train_docs = make_docs(train_data[:])
doc_bin_train = DocBin(docs = train_docs)
doc_bin_train.to_disk("./data/train.spacy")

In [None]:
# Convert the held-out split (raw text) to Docs and serialise them to ./data/valid.spacy.
valid_docs = make_docs(test_data[:])
doc_bin_valid = DocBin(docs = valid_docs)
doc_bin_valid.to_disk("./data/valid.spacy")

In [None]:
# Emoji-removed evaluation set; `valid_docs` and `doc_bin_valid` are overwritten here.
valid_docs = make_docs(dataset_rmemoji[:])
doc_bin_valid = DocBin(docs = valid_docs)
doc_bin_valid.to_disk("./data/valid_rmemoji.spacy")

In [None]:
# Emoji-to-description evaluation set; `valid_docs` and `doc_bin_valid` are overwritten here.
valid_docs = make_docs(dataset_emoji2desc[:])
doc_bin_valid = DocBin(docs = valid_docs)
doc_bin_valid.to_disk("./data/valid_emoji2desc.spacy")

In [None]:
# Punctuation-removed evaluation set; `valid_docs` and `doc_bin_valid` are overwritten here.
valid_docs = make_docs(dataset_rmpunc[:])
doc_bin_valid = DocBin(docs = valid_docs)
doc_bin_valid.to_disk("./data/valid_rmpunc.spacy")

### Training

In [None]:
# !python -m spacy init fill-config ./base_config.cfg ./config.cfg

In [None]:
# Train via the spaCy CLI from config.cfg on GPU 0; results are written under ./output.
!python -m spacy train config.cfg --output ./output --gpu-id 0 --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy

### Evaluation

In [None]:
# Evaluate output/model-best on the raw held-out set; metrics go to ./output/metrics.json.
!python -m spacy evaluate output/model-best/ --output ./output/metrics.json ./data/valid.spacy --gpu-id 0

In [None]:
# Same model, emoji-removed held-out set; metrics go to ./output/metrics_rmemoji.json.
!python -m spacy evaluate output/model-best/ --output ./output/metrics_rmemoji.json ./data/valid_rmemoji.spacy --gpu-id 0

In [None]:
# Same model, emoji-to-description held-out set; metrics go to ./output/metrics_emoji2desc.json.
!python -m spacy evaluate output/model-best/ --output ./output/metrics_emoji2desc.json ./data/valid_emoji2desc.spacy --gpu-id 0

In [None]:
# Same model, punctuation-removed held-out set; metrics go to ./output/metrics_rmpunc.json.
!python -m spacy evaluate output/model-best/ --output ./output/metrics_rmpunc.json ./data/valid_rmpunc.spacy --gpu-id 0

## Performance Analysis

In [17]:
# Relative path: it resolves against whatever the working directory is when this cell runs.
os.chdir('./Chat_Layer/output')

In [20]:
# Load the four metrics files written by the `spacy evaluate` commands, one per test-set variant.
score = get_json('./metrics.json')
score_rmemoji = get_json('./metrics_rmemoji.json')
score_emoji2desc = get_json('./metrics_emoji2desc.json')
score_rmpunc = get_json('./metrics_rmpunc.json')

In [21]:
# Metric keys read from each metrics dict, in the order of the output columns.
p = ['cats_score', 'cats_macro_p', 'cats_macro_r', 'cats_macro_f']


def _metric_row(metrics, method):
    """Build a one-row frame, indexed by ``method``, from the metric keys listed in ``p``."""
    # NOTE(review): 'MARCO' looks like a misspelling of 'MACRO'; the labels are kept as-is
    # because the rendered table below uses them.
    columns = ['SCORE', 'MARCO_P', 'MARCO_R', 'MARCO_F1']
    return pd.DataFrame(
        {column: metrics[key] for column, key in zip(columns, p)},
        index = [method]
    )


performance_raw = _metric_row(score, 'RAW')
performance_rmemoji = _metric_row(score_rmemoji, 'Remove Emoji')
performance_emoji2desc = _metric_row(score_emoji2desc, 'Emoji to Description')
performance_rmpunc = _metric_row(score_rmpunc, 'Remove Punctuation')

# One row per preprocessing method.
performance = pd.concat([performance_raw, performance_rmemoji, performance_emoji2desc, performance_rmpunc])
performance.index.name = 'METHOD'

In [22]:
# Highlight each column's maximum in red and its minimum in light blue.
performance.style.highlight_max(axis = 0, color = 'red').highlight_min(axis = 0, color = 'lightblue')

Unnamed: 0_level_0,SCORE,MARCO_P,MARCO_R,MARCO_F1
METHOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RAW,0.999969,0.98316,0.990559,0.986817
Remove Emoji,0.999981,0.991312,0.998311,0.994768
Emoji to Description,0.999898,0.989309,0.981696,0.985458
Remove Punctuation,0.999239,0.991312,0.998311,0.994768


In [None]:
# performance.to_csv('prf_cf_emoji.csv')

In [23]:
# Per-category precision/recall/F1 of the raw evaluation, one row per category.
cats_f_per_type = score['cats_f_per_type']
df_cats_f_per_type = (
    pd.DataFrame(cats_f_per_type)
    .T
    .rename(columns = {'p': 'PRECISION', 'r': 'RECALL', 'f': 'F1'})
    .rename_axis('CATEGORY')
)

In [24]:
# Highlight each column's maximum in red and its minimum in light blue.
df_cats_f_per_type.style.highlight_max(axis = 0, color = 'red').highlight_min(axis = 0, color = 'lightblue')

Unnamed: 0_level_0,PRECISION,RECALL,F1
CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,0.996534,0.996534,0.996534
neutral,0.954545,0.976744,0.965517
positive,0.9984,0.9984,0.9984


In [None]:
# df_cats_f_per_type.to_csv('prf_cf_cats.csv')

In [25]:
def _per_type_frame(per_type, method):
    """Transpose a per-category metrics dict and rename its 'f' column to ``method``."""
    return pd.DataFrame(per_type).T.rename(columns = {'f': method})


cats_f_per_type_score = score['cats_f_per_type']
cats_f_per_type_rmemoji = score_rmemoji['cats_f_per_type']
cats_f_per_type_emoji2desc = score_emoji2desc['cats_f_per_type']
cats_f_per_type_rmpunc = score_rmpunc['cats_f_per_type']

df_cats_f_per_type_score = _per_type_frame(cats_f_per_type_score, 'RAW')
df_cats_f_per_type_rmemoji = _per_type_frame(cats_f_per_type_rmemoji, 'Remove Emoji')
df_cats_f_per_type_emoji2desc = _per_type_frame(cats_f_per_type_emoji2desc, 'Emoji to Description')
df_cats_f_per_type_rmpunc = _per_type_frame(cats_f_per_type_rmpunc, 'Remove punctuation')

# Side-by-side F1 per category: one column per preprocessing method.
f1_columns = [
    (df_cats_f_per_type_score, 'RAW'),
    (df_cats_f_per_type_rmemoji, 'Remove Emoji'),
    (df_cats_f_per_type_emoji2desc, 'Emoji to Description'),
    (df_cats_f_per_type_rmpunc, 'Remove punctuation'),
]
df_cats = pd.concat([frame[[method]] for frame, method in f1_columns], axis = 1)
df_cats.index.name = 'F1 score'

In [26]:
# Highlight, within each row (category), the best method in red and the worst in light blue.
df_cats.style.highlight_max(axis = 1, color = 'red').highlight_min(axis = 1, color = 'lightblue')

Unnamed: 0_level_0,RAW,Remove Emoji,Emoji to Description,Remove punctuation
F1 score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,0.996534,0.997398,0.995663,0.997398
neutral,0.965517,0.988506,0.964706,0.988506
positive,0.9984,0.9984,0.996003,0.9984


In [None]:
# df_cats.to_csv('f1_cf_cats_emoji.csv')