In [None]:
from google.colab import files
import json
import re

import pandas as pd
import numpy as np
from pandas.core.frame import DataFrame
from sklearn.utils import shuffle

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

In [None]:
!pip install transformers 
from transformers import BertTokenizer, BertModel,BertConfig



In [None]:
def upload_file_to_colab():
  """Open the Colab upload dialog and report every file that was received.

  Uses the module-level `files` helper imported from `google.colab`.
  Prints one line per uploaded file (its name and the length of its
  content) and returns nothing.
  """
  received = files.upload()
  for name, payload in received.items():
    message = 'User uploaded file "{name}" with length {length} bytes.'
    print(message.format(name=name, length=len(payload)))

In [None]:
# Samsung paid-writer scandal dataset.
# Each call below opens an interactive upload dialog, so this cell needs a
# user at the keyboard.
# Upload train.json
print('Upload train.json please.\n')
upload_file_to_colab()

# Upload test.json
print('Upload test.json please.\n')
upload_file_to_colab()

Upload train.json please.



Saving train.json to train.json
User uploaded file "train.json" with length 8695710 bytes.
Upload test.json please.



Saving test.json to test.json
User uploaded file "test.json" with length 4098518 bytes.


In [None]:
# One or more consecutive characters in the range U+4E00..U+9FA5.
_CHINESE_RUN = re.compile(u"[\u4e00-\u9fa5]+")


def sentence_regex(sentence):
  """Return only the Chinese characters of `sentence`, joined in order.

  Everything outside U+4E00..U+9FA5 (Latin letters, digits, punctuation,
  whitespace, newlines) is dropped, so no separate stripping pass is needed.

  Args:
    sentence: the raw comment text. Non-string values (e.g. None or NaN
      for a record with no content) are treated as empty text.

  Returns:
    A string containing only the matched characters; '' when there are none.
  """
  if not isinstance(sentence, str):
    # Missing content must not crash the whole preprocessing run; the empty
    # string lets the later empty-row filter discard the record.
    return ''
  return ''.join(_CHINESE_RUN.findall(sentence))

In [None]:
def remove_empty_row(df, col='text'):
  """Return the rows of `df` whose `col` value is usable text.

  A row is kept only when its `col` value is not null, not the empty
  string and not the literal string '0'. The original index labels of the
  surviving rows are preserved.
  """
  values = df[col]
  keep = values.notnull() & (values != '') & (values != '0')
  return df[keep]

In [None]:
def label_to_spam(is_spam):
  """Map the `is_spam` flag to an integer class: 1 when it equals True, else 0."""
  # `== True` (not `is True`) is kept on purpose so values that merely
  # compare equal to True are still mapped to 1.
  return 1 if is_spam == True else 0

In [None]:
def preprocessing_data(filePath):
    """Load a JSON dataset and return its cleaned texts and integer labels.

    Args:
        filePath: path of a JSON file readable by `pd.read_json` that has a
            'content' column (raw text) and an 'is_spam' column (flag).

    Returns:
        A `(contents, spamLabels)` tuple of two equally long lists:
        `contents` holds each text after `sentence_regex`, `spamLabels`
        holds each flag after `label_to_spam`, in file order.
    """
    data = pd.read_json(filePath)

    # Walk the column values themselves rather than looking rows up by a
    # 0..n-1 counter: label lookup raises KeyError as soon as the frame's
    # index is not a plain range.
    contents = [sentence_regex(sentence) for sentence in data['content']]
    spamLabels = [label_to_spam(is_spam) for is_spam in data['is_spam']]

    return contents, spamLabels

In [None]:
# Clean both uploaded JSON files into parallel lists of texts and 0/1 labels,
# then print the first example of each as a quick sanity check.
train_contents, train_spamLabels = preprocessing_data('train.json')
print('* Train File:\n\tFirst sentence:', train_contents[0], '\n\tFirst label:', train_spamLabels[0])

test_contents, test_spamLabels = preprocessing_data('test.json')
print('* Test File:\n\tFirst sentence:', test_contents[0], '\n\tFirst label:', test_spamLabels[0])

* Train File:
	First sentence: 今天在亞太問到空機價是一萬九三月中上市要等嗎 
	First label: 1
* Test File:
	First sentence: 前陣子才購入心愛的珊瑚粉用到現在覺得還滿上手的借給朋友玩他們都驚呼速度很快順暢無比不會這不經讓我小小得意一下呵呵笑為了讓好好發揮最大效能衝一下因此安裝了許多遊戲工具導航照相的不過問題就由此而生啦出門在外使用這些電力一下就不足了有時候又找不到可以充電的地方沒電真的是很無奈囧開始想要添購行動電源的念頭搜尋一下發現有好多種牌子有些價錢便宜到誇張不敢下手希望是看起來有質感且大容量的行動電源不知道大家有什麼好的推薦嗎 
	First label: 1


In [None]:
# Column-wise view of the cleaned training split.
trainDict = dict(text=train_contents, label=train_spamLabels)

# Build the frame and discard rows whose text ended up null, '' or '0'.
trainDf = remove_empty_row(DataFrame(trainDict), 'text')
trainDf.head()

Unnamed: 0,text,label
0,今天在亞太問到空機價是一萬九三月中上市要等嗎,1
1,之前在亞太的上看到即將上市的訊息後之後就都是謠言了門市人員也不確定上市時間讓人一直懸在那期待...,1
2,星期六的午後新竹南寮漁港旁雲水一方庭園餐廳銀河帶著他的筆記來拜訪薄薄的一本大大的面子亮亮的眼...,1
3,昨天去中華門市玩了一下手機想說趁最近資訊展的時候入手一支應該可以便宜一點不過玩了半天好像都沒...,1
4,體驗會時間年月日日開始入場首先感謝和舉辦這次的體驗會我很榮幸的能夠成為台中場的體驗者接下來我...,1


In [None]:
# Column-wise view of the cleaned test split.
testDict = dict(text=test_contents, label=test_spamLabels)

# Build the frame and discard rows whose text ended up null, '' or '0'.
testDf = remove_empty_row(DataFrame(testDict), 'text')
testDf.head()

Unnamed: 0,text,label
0,前陣子才購入心愛的珊瑚粉用到現在覺得還滿上手的借給朋友玩他們都驚呼速度很快順暢無比不會這不經...,1
1,慕名已久的機皇風評一直都是很不錯正猶豫是否下手的時候就聽說要推出新的顏色所以又再延遲了一段時...,1
2,繼上次參加過新竹體驗會後這次是第二次參加三星的體驗會雖然地點在台北其實高鐵捷運還算方便還是不...,1
3,看到大家在砲轟早鳥禮才發現原來這麼多人都那麼看重贈品我覺得廠商給你再多再好的贈品最後還是會反...,1
4,有人跟我一樣在等的嗎來簽名報到一下吧順便看看這產品的人氣期待度微冷笑其實剛好明天休假,1


In [None]:
# Boolean row masks per split: label 0 is counted as a "true" comment and
# label 1 as a "fake" one.
train_true_filter = trainDf['label'].eq(0)
train_fake_filter = trainDf['label'].eq(1)

test_true_filter = testDf['label'].eq(0)
test_fake_filter = testDf['label'].eq(1)

# Summing a boolean mask counts the rows it selects.
true_train_count = int(train_true_filter.sum())
true_test_count = int(test_true_filter.sum())
print('True Comment Count:', true_train_count + true_test_count)

fake_train_count = int(train_fake_filter.sum())
fake_test_count = int(test_fake_filter.sum())
print('Fake Comment Count:', fake_train_count + fake_test_count)

True Comment Count: 15959
Fake Comment Count: 749


In [None]:
def concat_two_dataframe(df_1, df_2, random_state=None):
  """Stack two frames row-wise, shuffle the rows and renumber the index.

  Args:
    df_1: first DataFrame.
    df_2: second DataFrame, appended below `df_1`.
    random_state: optional seed forwarded to `DataFrame.sample`. Leave as
      None for a different order on every call; pass an int to make the
      shuffle (and everything sampled from its result) reproducible.

  Returns:
    A new DataFrame containing every row of both inputs in random order,
    with a fresh 0..n-1 index.
  """
  stacked = pd.concat([df_1, df_2], axis=0)
  # frac=1 draws every row exactly once, i.e. a full random permutation.
  shuffled = stacked.sample(frac=1, random_state=random_state)
  # Discard the old (now duplicated and scrambled) index labels.
  return shuffled.reset_index(drop=True)

In [None]:
# Pool the train and test frames into one shuffled frame.
concated_df = concat_two_dataframe(trainDf, testDf)

# Split the pool by class (1 = fake, 0 = true).
fake_label_filter = concated_df['label'].eq(1)
true_label_filter = concated_df['label'].eq(0)

fake_data = concated_df[fake_label_filter]
true_data = concated_df[true_label_filter]

print('True data total number:', len(true_data))
print('Fake data total number:', len(fake_data))

# Fake comments are far fewer than true ones, so randomly down-sample the
# true comments until the true:fake ratio is 3:1.
true_per_fake = 3
sampling_true_data = true_data.sample(
    n=true_per_fake * len(fake_data),
    random_state=5487,
)

dataset = concat_two_dataframe(sampling_true_data, fake_data)
dataset.info()

True data total number: 15959
Fake data total number: 749
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2996 entries, 0 to 2995
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2996 non-null   object
 1   label   2996 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 46.9+ KB


In [None]:
# Pull the two columns out of the balanced frame as NumPy arrays.
texts = dataset['text'].to_numpy()
labels = dataset['label'].to_numpy()

In [None]:
# Token Embeddings
# Name of the pretrained checkpoint; the same constant is reused when the
# BERT model itself is loaded, so tokenizer and model stay in sync.
PRETRAINED_MODEL_NAME = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [None]:
def encode_sentence_by_bert(sentence, max_length=100):
  """Convert one sentence into a list of BERT token ids.

  Uses the module-level `tokenizer`.

  Args:
    sentence: the text to encode.
    max_length: upper bound on the number of returned ids, special tokens
      included; longer inputs are truncated. Defaults to 100.

  Returns:
    The list of token ids produced by `tokenizer.encode`.
  """
  return tokenizer.encode(
      sentence,
      add_special_tokens=True,  # add [CLS] and [SEP]
      truncation=True,
      max_length=max_length,
  )

In [None]:
# Encode every text into its list of token ids, keeping dataset order.
input_ids = [encode_sentence_by_bert(text) for text in texts]

print('Number of sentence:', len(input_ids))
print('Length of encode sentence:', len(input_ids[0]))

Number of sentence: 2996
Length of encode sentence: 100


In [None]:
from keras.preprocessing.sequence import pad_sequences

# Every id sequence is brought to exactly this length.
MAX_LEN = 100

pad_kwargs = dict(
    maxlen=MAX_LEN,
    dtype="long",
    value=0,            # pad with id 0
    padding="post",     # append padding at the end
    truncating="post",  # cut overlong sequences at the end
)
input_ids = pad_sequences(input_ids, **pad_kwargs)

In [None]:
# One mask per sequence: 1 where the token id is positive, 0 where it is the
# padding id 0.
attention_masks = [
    [int(token_id > 0) for token_id in id_row]
    for id_row in input_ids
]

In [None]:
# Convert the padded ids, the labels and the attention masks to tensors so
# they can be wrapped in a TensorDataset.
# NOTE(review): `labels` is rebound here from the NumPy array taken from
# `dataset` to a tensor; a later cell rebinds it again to a NumPy array, so
# this cell should not be re-run out of order.
inputs = torch.tensor(input_ids)
labels = torch.tensor(labels)
masks = torch.tensor(attention_masks)

In [None]:
batch_size = 8
# Create the DataLoader for our training set.
# Each dataset item is an (input ids, attention mask, label) triple, so the
# label always travels with its own sequence even though RandomSampler
# visits the items in random order.
train_dataset = TensorDataset(inputs, masks, labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Load the pretrained BERT encoder; the config asks it to also return the
# hidden states of every layer.
config = BertConfig.from_pretrained(PRETRAINED_MODEL_NAME, output_hidden_states=True)
model = BertModel.from_pretrained(PRETRAINED_MODEL_NAME, config=config)
# Move to the GPU only when one is present: an unconditional `.cuda()` raises
# on a CPU-only runtime. `.to()` returns the module, so the cell still
# displays the model summary.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Pick the device every batch will be moved to: the GPU when available,
# otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Per-batch label arrays and per-sentence feature matrices, collected in
# the order the DataLoader yields them so features and labels stay aligned.
train_label = list()
train_set = list()

# Feature extraction only: switch off dropout and gradient tracking so no
# autograd graph is built or kept alive for any batch.
model.eval()
with torch.no_grad():
  for batch in train_dataloader:
    b_input_ids, b_input_mask, b_labels = (tensr.to(device) for tensr in batch)

    outputs = model(b_input_ids, attention_mask=b_input_mask)

    last_layer = outputs[0]  # last layer, one (100, 768) matrix per sentence
    train_label.append(b_labels.cpu().numpy())  # real labels of this batch

    # One device-to-host copy per batch instead of one per token; extending
    # with the batch array appends one (100, 768) matrix per sentence.
    train_set.extend(last_layer.cpu().numpy())

In [None]:
# Flatten the per-batch label arrays into a single 1-D array, in batch order.
labels = np.array([
    label
    for batch_labels in train_label
    for label in batch_labels
])
print('Length of Labels:', len(labels))

Length of Labels: 2996


In [None]:
# Stack the per-sentence feature matrices into one array.
layer = np.array(train_set)
# Drop the reference to the Python list so its memory can be reclaimed.
# NOTE(review): this makes the cell non-idempotent; re-running it without
# re-running the extraction cell would wrap None instead of the features.
train_set = None
print(layer.shape)

(2996, 100, 768)


In [None]:
# Only the vector at sequence position 0 (the [CLS] token added by the
# tokenizer) is used as the sentence feature. Slice it out before splitting
# so train_test_split copies (N, 768) values rather than the whole
# (N, 100, 768) array; the split depends only on `labels` and the seed, so
# the resulting rows are unchanged.
cls_features = layer[:, 0, :]

# split dataset: 70% train / 30% test, class proportions preserved
X_train, X_test, y_train, y_test = train_test_split(
    cls_features,
    labels,
    test_size=0.3,
    random_state=0,
    stratify=labels,
)
print(X_train.shape)

model_svm = SVC(kernel='rbf')
model_svm.fit(X_train, y_train)
# SVC.predict already returns class labels, so no rounding is needed.
prediction = model_svm.predict(X_test)
accuracy = metrics.accuracy_score(y_test, prediction)

(2097, 768)


In [None]:
# Accuracy of the SVM on the held-out 30% split.
accuracy

0.8743047830923248