In [None]:
# Reference: https://github.com/AnthonyK97/Text-Classification-on-IMDB
# https://github.com/AnthonyK97/Text-Classification-on-IMDB/blob/main/2%20CNN%2BGlove.ipynb

import os
import sys
import random
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
import re, string
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: integer applied to ``PYTHONHASHSEED``, Python's ``random``,
            NumPy's global generator and PyTorch (CPU and every CUDA device).

    Note:
        ``PYTHONHASHSEED`` is only read at interpreter start-up, so setting it
        here affects child processes, not the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN may choose non-deterministic kernels even with fixed seeds unless
    # told otherwise, and its benchmark/autotune mode can select a different
    # algorithm from run to run, so both switches are needed.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Create the checkpoint directory in pure Python; exist_ok=True lets the cell
# be re-run (Restart & Run All) without failing when the directory is already there.
os.makedirs('./model_bakup/', exist_ok=True)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class CFG:
    """Hyper-parameters and paths shared by the notebook cells."""

    # --- reproducibility ---
    seed = 2032

    # --- optimisation ---
    lr = 0.01
    batch_size = 20

    # --- evaluation bookkeeping (not read by any cell visible in this notebook) ---
    eval_step_num = 300
    best_eval_acc = 0.0

    # --- checkpoint directory ---
    model_output_dir = './model_bakup/'
    
# Wall-clock start of the whole run; the last cell subtracts it from the
# current time to report the total duration.
global_start_t = time.time()
# Cell-completion marker.
print('ok')

In [None]:
# Fix the RNGs first so the two pandas `sample` calls below are repeatable.
seed_everything(seed=42)

# Load the raw reviews and turn the textual sentiment into a 0/1 label.
label_to_id = {'positive': 1, 'negative': 0}
imdb_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
imdb_data = imdb_data.assign(sentiment=imdb_data['sentiment'].map(label_to_id))
print('before drop_duplicates, imdb_data.shape: ', imdb_data.shape)

# Remove exact duplicate rows.
imdb_data = imdb_data.drop_duplicates()
print('after drop_duplicates, imdb_data.shape: ', imdb_data.shape)

# Keep a random subset of 25,000 rows.
imdb_data = imdb_data.sample(n=25000)
print('after sample, imdb_data.shape: ', imdb_data.shape)

# Shuffle once more and renumber the rows from 0.
shuffled = imdb_data.sample(n=len(imdb_data))
imdb_data = shuffled.reset_index(drop=True)

imdb_data.head(5)

In [None]:
MAX_WORDS = 10000   # vocabulary size cap, including the two reserved ids (0 and 1)
MAX_LEN = 200       # token length every review is padded / truncated to


def clean_text(text):
    """Normalise a raw review into lower-case words separated by single spaces.

    Steps: lower-case, replace HTML line breaks (``<br />``, ``<br/>``, ``<br>``)
    with a space, delete every ASCII punctuation character, then collapse all
    whitespace runs (including newlines) and strip both ends.

    Collapsing whitespace matters: callers tokenise with ``split(' ')``, which
    would otherwise yield empty-string "words" for every run of spaces left
    behind by removed markup or punctuation.

    Args:
        text: raw review string.

    Returns:
        The cleaned string; empty when the review contains no word characters.
    """
    lowered = text.lower()
    without_breaks = re.sub(r'<br\s*/?>', ' ', lowered)
    without_punct = re.sub('[%s]' % re.escape(string.punctuation), '', without_breaks)
    return ' '.join(without_punct.split())


# Count how often every cleaned word occurs across all reviews.
word_count_dict = {}
for review in imdb_data['review'].values:
    for word in clean_text(review).split():
        word_count_dict[word] = word_count_dict.get(word, 0) + 1

df_word_dict = pd.DataFrame(pd.Series(word_count_dict, name='count'))
df_word_dict = df_word_dict.sort_values(by='count', ascending=False)

# Keep at most MAX_WORDS-2 of the most frequent words; ids 0 and 1 are reserved.
# .copy() makes the slice an independent frame so adding a column is safe, and
# the id range follows the actual row count so a small corpus does not raise.
df_word_dict = df_word_dict[:MAX_WORDS-2].copy()
df_word_dict['word_id'] = range(2, 2 + len(df_word_dict))

word_id_dict = df_word_dict['word_id'].to_dict()
word_id_dict['<unknown>'] = 0
word_id_dict['<padding>'] = 1

df_word_dict.head(15)

In [None]:
def pad(data_list, pad_length, pad_value=1):
    """Return a new list of exactly ``pad_length`` items built from ``data_list``.

    Longer inputs keep their LAST ``pad_length`` items; shorter inputs are
    left-padded with ``pad_value``. The input list is never modified.

    Args:
        data_list: sequence of token ids.
        pad_length: target length, must be >= 0.
        pad_value: filler id used for left padding (default 1).

    Returns:
        A new list of length ``pad_length``.

    Raises:
        ValueError: if ``pad_length`` is negative.
    """
    if pad_length < 0:
        raise ValueError(f'pad_length must be non-negative, got {pad_length}')

    current_length = len(data_list)
    if current_length >= pad_length:
        # Slice from an explicit start index: `data_list[-pad_length:]` would
        # return the whole list when pad_length is 0.
        return list(data_list[current_length - pad_length:])

    return [pad_value] * (pad_length - current_length) + list(data_list)

def text_to_token(text):
    """Convert one raw review into a space-separated string of ``MAX_LEN`` token ids.

    The review is cleaned with ``clean_text``, each word is looked up in
    ``word_id_dict`` (words missing from it map to id 0), and the id list is
    padded / truncated to ``MAX_LEN`` with ``pad``.

    Args:
        text: raw review string.

    Returns:
        ``MAX_LEN`` integer ids joined by single spaces.
    """
    # split() without an argument drops the empty strings that split(' ')
    # produces for consecutive spaces, so they no longer use up MAX_LEN slots.
    words = clean_text(text).split()
    word_token_list = [word_id_dict.get(word, 0) for word in words]
    pad_list = pad(word_token_list, MAX_LEN)
    return ' '.join(str(token_id) for token_id in pad_list)
            
# Tokenise every review and time how long the pass takes.
process_start_t = time.time()
print('start processing...')
review_tokens = imdb_data['review'].map(text_to_token)
imdb_data['review_tokens'] = review_tokens
elapsed = time.time() - process_start_t
print('ok, cost time: ', elapsed)
imdb_data.head(5)

In [None]:
# Positional split of the shuffled frame: rows [0, 15000) train,
# [15000, 20000) validation, everything after that test.
train_end, valid_end = 15000, 20000
imdb_data_train = imdb_data.iloc[:train_end]
imdb_data_valid = imdb_data.iloc[train_end:valid_end]
imdb_data_test = imdb_data.iloc[valid_end:]
print(
    f'imdb_data_train.shape: {imdb_data_train.shape}, imdb_data_valid.shape: {imdb_data_valid.shape}, '
    f'imdb_data_test.shape: {imdb_data_test.shape}'
)

In [None]:
glove_path = '/kaggle/input/glove6b/glove.6B.100d.txt'
embed_size = 100

# Parse the GloVe text file: each line is a token followed by its vector components.
cnt = 0
word_2_vector_map = {}
# Explicit encoding so parsing does not depend on the platform's default locale.
with open(glove_path, encoding='utf-8') as fin:
    for line in fin:
        parts = line.split()
        if not parts:
            continue  # tolerate blank lines instead of raising IndexError
        word_2_vector_map[parts[0]] = np.array([float(val) for val in parts[1:]], dtype=np.float32)
        cnt += 1
print('cnt is', cnt, 'len of word_2_vector_map: ', len(word_2_vector_map))

# Row i holds the vector of the word whose id is i; words without a GloVe
# vector (including the two reserved tokens) keep an all-zero row.
glove6b_100d_weight = torch.zeros(len(word_id_dict), embed_size)

missing_words = []
for word, idx in word_id_dict.items():
    vector = word_2_vector_map.get(word)
    if vector is None:
        missing_words.append(word)
        continue
    glove6b_100d_weight[idx, :] = torch.from_numpy(vector)

# One summary line instead of one line per missing word.
print('not found in : ', len(missing_words), 'words, first few:', missing_words[:20])
print('glove6b_100d_weight.shape: ', glove6b_100d_weight.shape)

In [None]:
# Instantiate the config and re-seed every RNG with cfg.seed. The data
# sampling cell above ran under seed 42; everything from here on (model
# initialisation, batch shuffling) runs under this seed.
cfg = CFG()
seed_everything(seed=cfg.seed)

# Cell-completion marker.
print('ok')

In [None]:
class imdbDataset(Dataset):
    """Map-style dataset over a frame with ``sentiment`` and ``review_tokens`` columns.

    ``review_tokens`` holds space-separated integer ids; ``sentiment`` holds the
    numeric label. Both columns are parsed once in ``__init__`` so that
    ``__getitem__`` is a plain tensor lookup instead of two ``iloc`` calls and a
    string parse per sample.

    Tensors are kept on the CPU: creating CUDA tensors inside ``__getitem__``
    breaks with ``num_workers > 0`` and costs one tiny transfer per sample. The
    training and evaluation loops move each batch with ``.to(device)``.
    """

    def __init__(self, data_df):
        """Parse labels and token strings of ``data_df`` into tensors.

        Args:
            data_df: DataFrame with ``sentiment`` and ``review_tokens`` columns.
        """
        self.data_df = data_df

        # (N, 1) float labels, matching the model's (batch, 1) output.
        label_values = data_df['sentiment'].to_numpy(dtype='float32')
        self.labels = torch.tensor(label_values, dtype=torch.float).unsqueeze(1)

        # One row of token ids per review, parsed from the space-separated string.
        token_rows = [
            [int(token_id) for token_id in tokens.split(' ')]
            for tokens in data_df['review_tokens']
        ]
        self.features = torch.tensor(token_rows, dtype=torch.long)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data_df)

    def __getitem__(self, index):
        """Return ``(feature, label)``: a long tensor of ids and a float tensor of shape (1,)."""
        return self.features[index], self.labels[index]
    
def generate_data_iter(cfg):
    """Wrap the train / valid / test frames in datasets and data loaders.

    Reads the module-level ``imdb_data_train``, ``imdb_data_valid`` and
    ``imdb_data_test`` frames. Only the training loader shuffles.

    Args:
        cfg: config object providing ``batch_size``.

    Returns:
        ``(dl_train, dl_valid, dl_test)``.
    """
    frames = (imdb_data_train, imdb_data_valid, imdb_data_test)
    datasets = [imdbDataset(frame) for frame in frames]
    ds_train, ds_valid, ds_test = datasets
    print('len of ds_train: ', len(ds_train), 'len of ds_valid: ', len(ds_valid),
          'len of ds_test: ', len(ds_test))

    shuffle_flags = (True, False, False)
    loaders = tuple(
        DataLoader(dataset, batch_size=cfg.batch_size, shuffle=do_shuffle, num_workers=0)
        for dataset, do_shuffle in zip(datasets, shuffle_flags)
    )
    return loaders

# Build the three loaders once; the training and final-evaluation cells reuse them.
dl_train, dl_valid, dl_test = generate_data_iter(cfg)
# Cell-completion marker.
print('ok')

In [None]:
class CNN_Net(nn.Module):
    """1-D CNN sentiment classifier on top of trainable pretrained embeddings.

    Input is a long tensor of token ids shaped (batch, seq_len); output is a
    float tensor shaped (batch, 1) holding sigmoid probabilities.

    Sub-module names (``embedding``, ``conv.conv_1`` ... ``dense.linear``) are
    kept stable so saved ``state_dict`` files stay loadable.
    """

    def __init__(self, embedding_weight=None, seq_len=None):
        """Build the network.

        Args:
            embedding_weight: 2-D float tensor (vocab_size, embed_dim) used to
                initialise the embedding. Defaults to the module-level
                ``glove6b_100d_weight``.
            seq_len: length of the token sequences the model will receive; it
                determines the input size of the final linear layer. Defaults
                to the module-level ``MAX_LEN``.
        """
        super().__init__()

        # Resolve defaults at call time so the no-argument constructor keeps working.
        if embedding_weight is None:
            embedding_weight = glove6b_100d_weight
        if seq_len is None:
            seq_len = MAX_LEN
        embed_dim = embedding_weight.shape[1]

        # from_pretrained wraps the given tensor without copying it. Cloning
        # keeps training from mutating the shared GloVe matrix in place, so a
        # second CNN_Net() starts from the same pretrained values.
        self.embedding = nn.Embedding.from_pretrained(embedding_weight.clone(), freeze=False)
        assert self.embedding.weight.requires_grad, 'should be True, because freeze=False'

        self.conv = nn.Sequential()
        self.conv.add_module('conv_1', nn.Conv1d(in_channels=embed_dim, out_channels=16, kernel_size=5))
        self.conv.add_module('pool_1', nn.MaxPool1d(kernel_size=2))
        self.conv.add_module('relu_1', nn.ReLU())
        self.conv.add_module('conv_2', nn.Conv1d(in_channels=16, out_channels=128, kernel_size=2))
        self.conv.add_module('pool_2', nn.MaxPool1d(kernel_size=2))
        self.conv.add_module('relu_2', nn.ReLU())

        # Infer the flattened feature count from a dummy pass instead of
        # hard-coding it (6144 for embed_dim=100, seq_len=200).
        with torch.no_grad():
            flat_features = self.conv(torch.zeros(1, embed_dim, seq_len)).flatten(1).shape[1]

        self.dense = nn.Sequential()
        self.dense.add_module('flatten', nn.Flatten())
        self.dense.add_module('linear', nn.Linear(flat_features, 1))
        self.dense.add_module('sigmoid', nn.Sigmoid())

    def forward(self, x):
        """Map token ids (batch, seq_len) to probabilities (batch, 1)."""
        # Conv1d expects (batch, channels, length), hence the transpose.
        x = self.embedding(x).transpose(1, 2)
        x = self.conv(x)
        y = self.dense(x)
        return y
    
# Build the network, print its layer summary, then move its parameters to `device`.
model = CNN_Net()
print(model)
model.to(device)     

# Cell-completion marker.
print('ok')

In [None]:
def accuracy(y_pred, y_true, threshold=0.5):
    """Fraction of predictions whose thresholded value equals the label.

    Both inputs are flattened before comparison, so a (N,) input paired with a
    (N, 1) input is compared element by element rather than being broadcast to
    an (N, N) grid, which would give a wrong score.

    Args:
        y_pred: array-like of predicted probabilities (list, list of arrays,
            ndarray, ...).
        y_true: array-like of 0/1 labels with the same number of elements.
        threshold: a prediction counts as positive when strictly greater than
            this value (default 0.5).

    Returns:
        The accuracy as a float in [0, 1]; ``nan`` when there are no samples.

    Raises:
        ValueError: if the two inputs hold a different number of elements.
    """
    predictions = np.asarray(y_pred, dtype=float).ravel()
    labels = np.asarray(y_true, dtype=float).ravel()
    if predictions.shape != labels.shape:
        raise ValueError(
            f'y_pred has {predictions.size} elements but y_true has {labels.size}'
        )
    if predictions.size == 0:
        return float('nan')
    return ((predictions > threshold) == labels).mean()

def evaluate(model, dl_test, device):
    """Compute the accuracy of ``model`` over every batch of ``dl_test``.

    The model is switched to eval mode for the pass and put back into whatever
    mode it was in before, even if a batch raises.

    Args:
        model: network returning one probability per sample.
        dl_test: iterable of ``(feature, label)`` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy over all samples, as computed by ``accuracy``.
    """
    was_training = model.training
    model.eval()

    y_true_lst, y_pred_lst = [], []
    try:
        with torch.no_grad():
            # Iterate the loader directly: wrapping it in enumerate() hides its
            # length from tqdm, so the progress bar had no total.
            for feature, label in tqdm(dl_test):
                feature, label = feature.to(device), label.to(device)
                y_pred = model(feature)
                y_pred_lst += list(y_pred.detach().cpu().numpy())
                y_true_lst += list(label.detach().cpu().numpy())
    finally:
        # Restore the caller's mode rather than forcing training mode.
        model.train(was_training)

    return accuracy(y_pred_lst, y_true_lst)
    
def train(model, dl_train, optimizer, loss_func, device):
    """Run one training epoch and return the accuracy observed during it.

    Increments the module-level ``global_step_num`` once per batch. The
    returned accuracy is computed from the predictions made while the weights
    were still being updated, not from a separate pass.

    Args:
        model: network to train.
        dl_train: iterable of ``(feature, label)`` batches.
        optimizer: optimizer holding the model's parameters.
        loss_func: callable ``loss_func(y_pred, label)`` returning a scalar loss.
        device: device the batches are moved to.

    Returns:
        Training accuracy over the epoch, as computed by ``accuracy``.
    """
    global global_step_num
    model.train()  # make sure the model is in training mode

    y_true_lst, y_pred_lst = [], []

    # Iterate the loader directly so tqdm can read its length and show a total.
    for feature, label in tqdm(dl_train):
        global_step_num += 1
        feature, label = feature.to(device), label.to(device)

        # Clear gradients before the backward pass so nothing left over from
        # earlier code can leak into this step.
        optimizer.zero_grad()
        y_pred = model(feature)
        train_loss = loss_func(y_pred, label)
        train_loss.backward()
        optimizer.step()

        y_pred_lst += list(y_pred.detach().cpu().numpy())
        y_true_lst += list(label.detach().cpu().numpy())

    print('in train(), len(dl_train): ', len(dl_train))

    return accuracy(y_pred_lst, y_true_lst)

# Cell-completion marker.
print('ok')

In [None]:
# Running bests, reported again by the final cell.
global_best_train_acc, global_best_valid_acc = 0.0, 0.0
global_train_acc = 0.0      # train accuracy of the epoch with the best validation accuracy
global_step_num = 0         # incremented once per batch inside train()

epochs = 25
# Alternative optimizer: torch.optim.Adagrad(model.parameters(), lr=cfg.lr)
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
loss_func = nn.BCELoss()
best_model_path = os.path.join(cfg.model_output_dir, 'best_step_model.pth')

for epoch in range(epochs):
    train_acc = train(model, dl_train, optimizer, loss_func, device)
    valid_acc = evaluate(model, dl_valid, device)
    test_acc = evaluate(model, dl_test, device)
    print(f'in epoch: {epoch}, train_acc: {train_acc:.5f}, valid_acc: {valid_acc:.5f}, test_acc: {test_acc:.5f}')

    global_best_train_acc = max(global_best_train_acc, train_acc)

    # Checkpoint only when validation accuracy strictly improves.
    if not valid_acc > global_best_valid_acc:
        continue
    global_best_valid_acc = valid_acc
    global_train_acc = train_acc
    print(f'get new best_valid_acc: {global_best_valid_acc:.5f}, save the model now!')
    torch.save(model.state_dict(), best_model_path)

In [None]:
# Rebuild the network and load the checkpoint with the best validation accuracy.
model = CNN_Net()
model.to(device)

best_model_path = os.path.join(cfg.model_output_dir, 'best_step_model.pth')
# map_location lets a checkpoint written on a GPU load in a CPU-only session
# (and vice versa) instead of failing while deserialising the tensors.
model.load_state_dict(torch.load(best_model_path, map_location=device))
test_acc = evaluate(model, dl_test, device)
print(f'final test_acc: {test_acc:.5f}, best_val_acc: {global_best_valid_acc:.5f}, '
      f'train_acc: {global_train_acc:.5f}, best_train_acc: {global_best_train_acc:.5f}')

print('total finished, cost time: ', time.time() - global_start_t)