In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!source /content/drive/MyDrive/colab_env/bin/activate

import sys
sys.path.append("/content/drive/MyDrive/colab_env/lib/python3.10/site-packages")

In [3]:
import os
import glob
import jieba
import spacy
import opencc
import string
import gensim
import requests
import transformers
import spacy_transformers

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from spacy.tokens import DocBin
from stopwordsiso import stopwords
from gensim.test.utils import datapath
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.pipeline.textcat_multilabel import Config, multi_label_cnn_config

In [None]:
os.chdir('/content/drive/MyDrive/Capstone/team_player/')

In [None]:
file = glob.glob('./player/*.txt')

for t in file:
    jieba.load_userdict(t)

In [None]:
team_file = './team/team.txt'
jieba.load_userdict(team_file)

In [None]:
os.chdir('/content/drive/MyDrive/Capstone/Spacy/')

In [None]:
df = pd.read_csv('./data/chat_df_w_file_usr_time_text.csv')
df_emoji = pd.read_csv('./data/df_emoji.csv', encoding = 'utf-8')
df_remove_emoji = pd.read_csv('./data/df_remove_emoji.csv', encoding = 'utf-8')
df_emoji_to_desc = pd.read_csv('./data/df_emoji_to_desc.csv', encoding = 'utf-8')

In [None]:
df_no_en = df_remove_emoji[df_remove_emoji['text'].astype(str).apply(lambda x: not x.encode('utf-8').isascii())]

In [None]:
print ("Length of df: ", len(df))
print ("Length of df_no_en: ", len(df_no_en))

Length of df:  122076
Length of df_no_en:  100624


In [None]:
df['text'] = df['text'].astype(str)

In [None]:
stop_words_combined = list(stopwords(["zh"]))

cc = opencc.OpenCC('s2t.json')
stopword = []

for i in stop_words_combined:
    stopword.append(cc.convert(i))

In [None]:
jieba.load_userdict('./dict.txt.big')

In [None]:
def remove_punctuation(text):
    translator = str.maketrans("", "", string.punctuation + "！？&#8203;``【oaicite:0】``&#8203;（）［］《》、，。；：‘“’”…￥·")
    text_without_punct = text.translate(translator)
    return text_without_punct

def jieba_cut(text):
    if pd.isna(text):
        return []

    text_without_punct = remove_punctuation(text)
    seg_list = [seg for seg in jieba.cut(text_without_punct) if seg.strip()]

    return seg_list

def tokenizer(doc):

    seg = jieba_cut(doc)
    filtered_seg = [word for word in seg if(word not in stopword) and (len(word) > 1)]

    return filtered_seg

## Only Chat message, and seperated by senetence

In [None]:
chat = []
for index, row in df_no_en.iterrows():
    segments = tokenizer(row['text'])
    if len(segments) > 0:
        chat.append(segments)

In [None]:
model = Word2Vec(chat, vector_size = 300, min_count = 3, epochs = 10, sg = 1)
model.save('w2v_1213.model')

In [15]:
model = KeyedVectors.load('./word2vec/w2v_1213.model')

In [16]:
keyword = '卡拉'

res = model.wv.similar_by_word(keyword, topn = 20)
for item in res:
    print(item)

('魚丸', 0.7916378974914551)
('Crl', 0.7568725347518921)
('書荷', 0.7297079563140869)
('接發球', 0.7218419909477234)
('素琴', 0.713817834854126)
('rl', 0.6957876682281494)
('幼群', 0.6942238807678223)
('冠宇', 0.6937622427940369)
('Bryn', 0.6886841654777527)
('隊友', 0.6885063052177429)
('bryn', 0.6858274936676025)
('Sns', 0.6849724054336548)
('vr', 0.682137668132782)
('能者', 0.6809704303741455)
('小黑', 0.675384521484375)
('過勞', 0.6736312508583069)
('Al', 0.6730499863624573)
('Sr', 0.6725677251815796)
('血汗', 0.6724375486373901)
('gb', 0.6719704270362854)


## Chat message + News, chat message separated by sentence

In [None]:
url = 'https://tvl.ctvba.org.tw/news'
response = requests.get(url)
news = {'URL': [], 'News': []}

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    news_links = soup.find_all('a', href = lambda href: href and "/news-detail/" in href)

    for link in news_links:
        news_url = 'https://tvl.ctvba.org.tw' + link['href']
        news_response = requests.get(news_url)

        if news_response.status_code == 200:
            news_soup = BeautifulSoup(news_response.text, 'html.parser')

            description_meta = news_soup.find('meta', {'name': 'description'})

            if description_meta:
                description_content = description_meta['content']

                news['URL'].append(news_url)
                news['News'].append(description_content)
else:
    print('Failed to retrieve the page. Status Code:', response.status_code)

news = pd.DataFrame(news)

In [None]:
news = news.drop_duplicates(subset = ['News']).reset_index(drop = True)
news['News'] = news['News'].str.replace('\r\n', '')
news['News'] = news['News'].str.replace('\xa0', '')

In [None]:
news = news[:313]

In [None]:
# news.to_csv('./data/TVLNews.csv', index = False)

In [None]:
news = pd.read_csv('./data/TVLNews.csv')

In [None]:
chat_news = []

for index, row in df_no_en.iterrows():
    segments = tokenizer(row['text'])
    if len(segments) > 0:
        chat_news.append(segments)

for index, row in news.iterrows():
    segments = tokenizer(row['News'])
    if len(segments) > 0:
        chat_news.append(segments)

In [None]:
model = Word2Vec(chat_news, vector_size = 300, min_count = 3, epochs = 10, sg = 1)
model.save('./word2vec/w2v_chat_news.model')

In [5]:
os.chdir('/content/drive/MyDrive/Capstone/Spacy/')

In [6]:
model = KeyedVectors.load('./word2vec/w2v_chat_news.model')

In [9]:
keyword = '卡拉'

res = model.wv.similar_by_word(keyword, topn = 20)
for item in res:
    print(item)

('魚丸', 0.8254485130310059)
('素琴', 0.7413310408592224)
('rl', 0.6955834031105042)
('gb', 0.6888911128044128)
('書荷', 0.6778160929679871)
('可怕', 0.670377254486084)
('bryn', 0.6687530875205994)
('常常', 0.6660739779472351)
('BIA', 0.6657554507255554)
('小黑', 0.6647905111312866)
('好不好', 0.6636867523193359)
('修正', 0.6578335165977478)
('血汗', 0.6526831388473511)
('金髮', 0.6489719748497009)
('超跑', 0.6489390134811401)
('過勞', 0.6486817002296448)
('軒岑', 0.6467099189758301)
('能者', 0.6433789134025574)
('洋將', 0.6418787837028503)
('阿亮', 0.6414356827735901)


## Chat message + News, chat message seperated by game

In [None]:
df_chat_by_game = df_no_en.groupby('file')['text'].agg(lambda x: ' '.join(x)).reset_index()
df_chat_by_game.columns = ['file', 'text']

df_chat_by_game

Unnamed: 0,file,text
0,chat_0sj9YWqKaKU.json,問阿勇啊 敲碗碗敲 台電冠軍正良加油 大星的腳好像恢復差不多了 哇哇哇哇 連莊 但還是健康...
1,chat_1QREcdAdI9I.json,終於換主播了 来啦 愛山林加油 中纎冠军 中纖加油加油 愛山林加油 聲音好好聽喔 中纖加油啊...
2,chat_1Xw3Kmbd_a8.json,小温加油美津濃加油 美津濃加油 加油 美津濃加油 阿甘真的加了 阿甘還沒上場 發哥害羞還真可...
3,chat_1okGy1BoNkQ.json,要開始了 主播連播三場 庭葳加油 庭葳 蔓亞加油 愛山林水哦 哇兩大炮外援鯨華這樣不好打喔 ...
4,chat_2DL4e4-Xqv4.json,來了 愛山林加油 Yeah 我愛的主播 太會找鏡頭了 後面的 我高雄我驕傲 地主隊 有點尷尬...
...,...,...
112,chat_xNiwDOYVRTI.json,小編午安 午安 愛山林加油 午安啦 蔡幼群我來了 沒有愛山林耶 16敗有點難過 幫忙加油...
113,chat_xdMF3S5YLqk.json,台中加油 加油 加油 連莊加油 亞理 康提幫我撐2小時 連莊加油 連莊加油 連莊加油啊啊啊 ...
114,chat_yGwx7ISumjE.json,到麥當勞要買肯德基的意思嗎 笑死 就跟在這邊喊小戴加油是一樣道理 我家中華的網路出問題了請問...
115,chat_zpHKKmwHvNg.json,鯨華加油 鯨華加油 鯨華加油 鯨華加油 詩涵今年猛猛的 幼幼 今年大爆發呢 鯨華加油 今天又...


In [None]:
chat_by_game = []

for index, row in df_chat_by_game.iterrows():
    segments = tokenizer(row['text'])
    if len(segments) > 0:
        chat_by_game.append(segments)

for index, row in news.iterrows():
    segments = tokenizer(row['News'])
    if len(segments) > 0:
        chat_by_game.append(segments)

In [None]:
model = Word2Vec(chat_by_game, vector_size = 300, min_count = 3, epochs = 10, sg = 1)
model.save('./word2vec/w2v_chat_by_game_news.model')

In [10]:
model = KeyedVectors.load('./word2vec/w2v_chat_by_game_news.model')

In [12]:
keyword = '卡拉'

res = model.wv.similar_by_word(keyword, topn = 20)
for item in res:
    print(item)

('rl', 0.7151620388031006)
('能者', 0.7010477185249329)
('過勞', 0.6870222091674805)
('血汗', 0.6797981858253479)
('Crl', 0.6676456928253174)
('魚丸', 0.6526923775672913)
('黃素', 0.6444175839424133)
('Al', 0.5996392965316772)
('虛脫', 0.592359185218811)
('引擎', 0.5909104943275452)
('軒岑', 0.590481162071228)
('特斯拉', 0.5901666283607483)
('超跑', 0.5860097408294678)
('喀拉', 0.5852321982383728)
('起跑', 0.5836528539657593)
('小幼', 0.5768735408782959)
('阿非', 0.5757794380187988)
('丁柔安', 0.574275016784668)
('阿菲', 0.5671363472938538)
('氣氣', 0.5645719170570374)
