In [1]:
import argparse
from LoadData_nlp import *
from DataSet_nlp import *
from torch.utils.data import DataLoader
from TwitterModel_nlp import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_file = "../data/train_data.json"
train_label_file = "../data/project-data/train.label.txt"
dev_file = "../data/clean_dev_data.json"
dev_label_file =  "../data/project-data/dev.label.txt"
load_twitter_train_data = LoadDataAndProcessing(train_file,train_label_file)
load_twitter_dev_data = LoadDataAndProcessing(dev_file,dev_label_file)

In [3]:
train_input = load_twitter_train_data.loadData()
dev_input = load_twitter_dev_data.loadData()

In [4]:
##dev_input

In [5]:
trainset = TaskOneDataset(train_input)
devset = TaskOneDataset(dev_input)

In [6]:
trainloader = DataLoader(trainset, batch_size=2, collate_fn=trainset.create_mini_batch)
devloader = DataLoader(devset, batch_size=2, collate_fn=devset.create_mini_batch)

In [7]:
#for i in trainloader:
 #   print(i)

In [8]:
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
#model.parameters

In [10]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
    
model_params = get_learnable_params(model)
optimizer = torch.optim.Adam(model_params, lr=1.0e-5)

In [11]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    
    model.eval()  # 推論模式
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            outputs = model(*data[:3])
            # 前 3 個 tensors 分別為 tokens, segments 以及 masks
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            #print(pred)
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                #total += 2
                #print((pred == labels.squeeze()).sum().item())
                #print(labels)
                correct += (pred == labels.squeeze()).sum().item()
                #print(labels.size(0),(pred == labels).sum().item())
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)


device: cuda:0
classification acc: 0.6333973128598849


In [14]:
def get_prediction_from_logits(logits):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    return soft_probs.squeeze()

In [15]:
import time
start = time.time()

import matplotlib.pyplot as plt
train_acc_list = []
dev_acc_list = []
EPOCHS = 10
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()
        
        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, train_acc = get_predictions(model, trainloader, compute_acc=True)
    _, dev_acc = get_predictions(model, devloader, compute_acc=True)
    
    train_acc_list.append(train_acc)
    dev_acc_list.append(dev_acc)
    print(f"batch size:1")
    print(f'[training set : epoch {epoch+1}] loss: {running_loss}, acc: {train_acc}')
    print(f'[dev set      : epoch {epoch+1}] loss: {running_loss}, acc: {dev_acc}')
end = time.time()
print(f"time:{end-start:.2f}")

batch size:1
[training set : epoch 1] loss: 148.54447945812717, acc: 0.8886756238003839
[dev set      : epoch 1] loss: 148.54447945812717, acc: 0.8475247524752475
batch size:1
[training set : epoch 2] loss: 35.758926741895266, acc: 0.9225847728726807
[dev set      : epoch 2] loss: 35.758926741895266, acc: 0.8772277227722772
batch size:1
[training set : epoch 3] loss: 5.705868512595771, acc: 0.9315419065898912
[dev set      : epoch 3] loss: 5.705868512595771, acc: 0.8752475247524752
batch size:1
[training set : epoch 4] loss: 1.4932080246362602, acc: 0.9155470249520153
[dev set      : epoch 4] loss: 1.4932080246362602, acc: 0.8693069306930693
batch size:1
[training set : epoch 5] loss: 0.444606369943358, acc: 0.9360204734484965
[dev set      : epoch 5] loss: 0.444606369943358, acc: 0.8712871287128713
batch size:1
[training set : epoch 6] loss: 0.2393336309614824, acc: 0.9366602687140115
[dev set      : epoch 6] loss: 0.2393336309614824, acc: 0.8732673267326733
batch size:1
[training set

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    all_predictions, all_labels = [], []
    model.eval()  # 推論模式
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            outputs = model(*data[:3])
            # 前 3 個 tensors 分別為 tokens, segments 以及 masks
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            #print(logits.data.shape)
            if compute_acc:
                    predictions = pred.detach().cpu().numpy().reshape(-1).tolist()
                    
                    labels = data[3].cpu().numpy().reshape(-1).tolist()
                    #print(len(labels),len(predictions))
                    all_predictions.extend(predictions)
                    all_labels.extend(labels)
            else:
                predictions = pred.detach().cpu().numpy().reshape(-1).tolist()
                all_predictions.extend(predictions)

        if compute_acc:
            auc = accuracy_score(all_labels, all_predictions)
            return all_predictions, auc
        return all_predictions

# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

In [16]:
class LoadTestAndProcessing():
    def __init__(self,tweets_file_path,tweet_data):
        self.tweets_file_path = tweets_file_path
        self.tweet_data = tweet_data

    def textProcess(self,text):
        text = re.sub("@[\w]*", "", text)  # 去@
        text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text)  # 去URL
        text = re.sub('[\n\t-]', '', text)
        text = text.strip()
        return text
    def get_tweet_list(self,tweets_file_path,id_list):
        text_list = []
        for i in id_list:
            file_name = tweets_file_path + i + ".json"
            with open(file_name) as f:
                json_data = json.load(f)
            text_list.append(json_data)
        return text_list
    
    def loadData(self):
        all_ids = pd.read_table(tweet_data,header=None)
        data = []
        for i in range(len(all_ids)):
            list_id = list(all_ids.iloc[i])
            list_sub_id = list_id[0].split(",")
            text_list_sub =  self.get_tweet_list(tweets_file_path,list_sub_id)
            data.append(text_list_sub)
        #print(data[0])
        tweeter_source = []
        tweeter_replay = []
        tweeter_id = []
        text_p1 = ""
        text_p2 = ""
        no_source = 0
        for tweet_data_list in data:
            temp_source = ""
            temp_reply = []
            #print(tweet_data_list[0])
            source_tweet = tweet_data_list[0]
            source_num = 0
            for tweet in tweet_data_list:
                if  not tweet['in_reply_to_status_id']:
                    source_num = source_num + 1
            if  source_num == 0:
                text_p1 = self.textProcess(source_tweet['text']).lower()
                temp_source = temp_source + text_p1
                tweeter_id.append(source_tweet['id_str'])
                for tweet in tweet_data_list:
                        text_p2 = self.textProcess(tweet['text']).lower()
                        res_sim = difflib.SequenceMatcher(None, text_p1, text_p2).quick_ratio()
                        temp_reply.append(text_p2)
            elif  source_num > 1:
                for tweet in tweet_data_list:
                    if  not tweet['in_reply_to_status_id']:
                        text_p1 = self.textProcess(tweet['text']).lower()
                        temp_source = temp_source + text_p1
                        tweeter_id.append(tweet['id_str'])
                    else:
                        text_p2 = self.textProcess(tweet['text']).lower()
                        res_sim = difflib.SequenceMatcher(None, text_p1, text_p2).quick_ratio()
                        temp_reply.append(text_p2)
            else:
                for tweet in tweet_data_list:
                    if  not tweet['in_reply_to_status_id']:
                        text_p1 = self.textProcess(tweet['text']).lower()
                        temp_source = temp_source + text_p1
                    else:
                        text_p2 = self.textProcess(tweet['text']).lower()
                        res_sim = difflib.SequenceMatcher(None, text_p1, text_p2).quick_ratio()
                        temp_reply.append(text_p2)
            no_source = no_source +1    
            tweeter_source.append(temp_source)                          
            tweeter_replay.append(temp_reply)     

        total_replay = []
        for i in tweeter_replay:
            temp = ''
            for line in i:
                temp += line
            total_replay.append(temp)

        input_value = []
        for i in range(len(tweeter_source)):
            temp = {}
            temp['text'] = tweeter_source[i]
            temp['textb'] = total_replay[i]
            input_value.append(temp)

        return input_value,tweeter_id

In [17]:
import pandas as pd
import json
import re
import difflib

tweets_file_path = "../data/project-data/tweet-objects/tweet-objects/"
tweet_data = "../data/project-data/test.data.txt"
load_twitter_test_data= LoadTestAndProcessing(tweets_file_path,tweet_data)

test_input,twitter_id  = load_twitter_test_data.loadData()

In [None]:
#test_input

In [18]:
testset = TaskTestDataset(test_input)
testloader = DataLoader(testset, batch_size=1, collate_fn=testset.create_mini_batch)

In [None]:
for i in testset :
    print("token%%%%%%%%%%%%%%%%%%%%")
    print(i[0])
    print("se%%%%%%%%%%%%%%%%%%%%")
    print(i[1])


In [19]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    
    model.eval()  # 推論模式
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            outputs = model(*data[:3])
            # 前 3 個 tensors 分別為 tokens, segments 以及 masks
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
pred = get_predictions(model, testloader, compute_acc=False)
#print("classification acc:", acc)

device: cuda:0


In [20]:
pred_list = pred.tolist()

In [24]:
new_list = []
for i in pred_list:
    if i == 0:
        new_list.append(1)
    else:
        new_list.append(0)

In [25]:
import pandas as pd
test=pd.DataFrame(data=new_list)
print(test)
import csv
n = len(test)
nlist = range(0,n)
test['Id'] = nlist
test.columns = ['Predicted','Id']
test[['Id','Predicted']] = test[['Predicted','Id']]
test.columns = ['Id','Predicted']
test.to_csv('../data/test_nlp_v5.csv',encoding='gbk',index = False, quoting=csv.QUOTE_NONNUMERIC)

     0
0    0
1    1
2    0
3    0
4    0
..  ..
553  0
554  1
555  0
556  0
557  0

[558 rows x 1 columns]
