In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
## Import Package
# standard library
import os
import csv
import sys
import argparse
import numpy as np
from multiprocessing import Pool

# optional library
# import jieba
import pandas as pd
from gensim.models import Word2Vec

# pytorch library
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")

In [2]:
type(doc)

spacy.tokens.doc.Doc

In [2]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

deEmojify('This dog 😂')

'This dog '

In [3]:
def train_test_split(train_comment, train_label):
    
    train_data = list(zip(train_comment, train_label))
#     random.shuffle(train_data)
    
    train_set = train_data[:12000]
    valid_set = train_data[12000:]
    
    return train_set, valid_set

In [4]:
## Model Construction
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_emb=True):
        super(LSTM_Net, self).__init__()
        # Create embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # Fix/Train embedding 
        self.embedding.weight.requires_grad = False if fix_emb else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid())
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x dimension(batch, seq_len, hidden_size)
        # Use LSTM last hidden state (maybe we can use more states)
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x

## Evaluation Method
def evaluation(outputs, labels):
    #outputs => probability (float)
    #labels => labels
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

## Training Procedure
def training(train, valid, model, device):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\n=== start training, parameter total:{}, trainable:{}'.format(total, trainable))
    model.train()
    batch_size, n_epoch = batch, epoch
    criterion = nn.BCELoss()
    t_batch = len(train) 
    v_batch = len(valid) 
    optimizer = optim.Adam(model.parameters(), lr = lr)
    total_loss, total_acc, best_acc = 0, 0, 0
    for one_epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        # training set
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            correct = evaluation(outputs, labels)
            total_acc += (correct / batch_size)
            total_loss += loss.item()
            print('[ Epoch{} == {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	one_epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f} '.format(total_loss/t_batch, total_acc/t_batch*100))

        # validation set
        model.eval()
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                outputs = model(inputs)
                outputs = outputs.squeeze()
#                 print(outputs)
                loss = criterion(outputs, labels)
                correct = evaluation(outputs, labels)
                total_acc += (correct / batch_size)
                total_loss += loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            if total_acc > best_acc:
                best_acc = total_acc
                torch.save(model, "{}/ckpt_{:.3f}".format(model_dir,total_acc/v_batch*100))
                print('save model with acc {:.3f}'.format(total_acc/v_batch*100))
        model.train()


In [6]:
## Preprocess corpus data
class Preprocess():
    def __init__(self, data_dir, label_dir):
        # Load jieba library
#         jieba.load_userdict(args.jieba_lib)
        self.embed_dim = word_dim
        self.seq_len = seq_len
        self.wndw_size = wndw
        self.word_cnt = cnt
        self.save_name = 'word2vec'
        self.index2word = []
        self.word2index = {}
        self.vectors = []
        self.unk = "<UNK>"
        self.pad = "<PAD>"
        # Load corpus
        if data_dir!=None:
            # Read data
            dm = pd.read_csv(data_dir)            
            data = dm['comment']
#             print(data)
            # Tokenize with multiprocessing
            # List in list out with same order
            # Multiple workers
#             P = Pool(processes=4) 
#             data = P.map(self.tokenize, data)
#             P.close() #關閉進程池，阻止更多的任務提交到進程池Pool，待任務完成後，工作進程會退出
#             P.join() #等待工作線程的退出，必須在close()或terminate()之後使用，因被終止的進程需要被父進程調用wait（join等價於wait）,否則進程會成為僵屍進程
#             self.data = data
            
            #===tokenize self_method===
#             self.data = np.load('all_tokens.npy',allow_pickle=True)
            all_comment_tokens = []
            for comment in data:
                comment = deEmojify(comment)   
                token_list = self.tokenize(comment)
                all_comment_tokens.append(token_list)

            self.data = all_comment_tokens
        
        if label_dir!=None:
            # Read Label
            dm = pd.read_csv(label_dir)
            self.label = [int(i) for i in dm['label']]
            
    def tokenize(self, sentence):
        """ Use jieba to tokenize a sentence.
        Args:
            sentence (str): One string.
        Return:
            tokens (list of str): List of tokens in a sentence.
        """
        # TODO
        tokens = []
        for token in nlp(sentence):
            token = token.lemma_        # lemmatization (text normalize)
            token = str(token).lower()  # transform spacy to string
        
            lexeme = nlp.vocab[token] # remove stop words, check if stopwords
            # add new
            if lexeme.is_stop == False and not token.isdigit() and token not in ['$','//','v.','1/2','......','.....','....','--','1/3','him.why','.lol','.....','️','️-',':','.@user','=','-pron-',' ','  ','   ','#','!','?','...','..','.','"','/','@',"'",'’','%','&',';','-','(',')',',','+']:
                if len(token) > 1:
                    tokens.append(token) 
        
        return tokens
    
    def get_embedding(self, load=False):
        print("=== Get embedding")
        # Get Word2vec word embedding
        if load:
            embed = Word2Vec.load(self.save_name)
        else:
            embed = Word2Vec(self.data, size=self.embed_dim, window=self.wndw_size, min_count=self.word_cnt, iter=50, workers=8)
            embed.save(self.save_name)
        # Create word2index dictinonary
        # Create index2word list
        # Create word vector list
        for i, word in enumerate(embed.wv.vocab):
            print('=== get words #{}'.format(i+1), end='\r')
            #e.g. self.word2index['魯'] = 1 
            #e.g. self.index2word[1] = '魯'
            #e.g. self.vectors[1] = '魯' vector
            self.word2index[word] = len(self.word2index)
            self.index2word.append(word)
            self.vectors.append(embed[word])
        self.vectors = torch.tensor(self.vectors)
        # Add special tokens
        self.add_embedding(self.pad)
        self.add_embedding(self.unk)
        print("=== total words: {}".format(len(self.vectors)))
        return self.vectors
    
    def add_embedding(self, word):
        # Add random uniform vector
        vector = torch.empty(1, self.embed_dim)
        torch.nn.init.uniform_(vector)
        self.word2index[word] = len(self.word2index)
        self.index2word.append(word)
        self.vectors = torch.cat([self.vectors, vector], 0)

    def get_indices(self,test=False):
        # Transform each words to indices
        # e.g. if 機器=0,學習=1,好=2,玩=3 
        # [機器,學習,好,好,玩] => [0, 1, 2, 2,3]
        all_indices = []
        # Use tokenized data
        for i, sentence in enumerate(self.data):
            print('=== sentence count #{}'.format(i+1), end='\r')
            
            sentence_indices = []
            for word in sentence:
                # if word in word2index append word index into sentence_indices
                # if word not in word2index append unk index into sentence_indices
                # TODO
                if word in self.word2index:
                    sentence_indices.append(self.word2index[word])
                else:
                    sentence_indices.append(self.word2index['<UNK>'])

            
            # pad all sentence to fixed length
            sentence_indices = self.pad_to_len(sentence_indices, self.seq_len, self.word2index[self.pad])
            all_indices.append(sentence_indices)
            
        if test:
            return torch.LongTensor(all_indices)         
        else:
            return torch.LongTensor(all_indices), torch.LongTensor(self.label)    
            
    def pad_to_len(self, arr, padded_len, padding=0):
        """ 
        if len(arr) < padded_len, pad arr to padded_len with padding.
        If len(arr) > padded_len, truncate arr to padded_len.
        Example:
            pad_to_len([1, 2, 3], 5, 0) == [1, 2, 3, 0, 0]
            pad_to_len([1, 2, 3, 4, 5, 6], 5, 0) == [1, 2, 3, 4, 5]
        Args:
            arr (list): List of int.
            padded_len (int)
            padding (int): Integer used to pad.
        Return:
            arr (list): List of int with size padded_len.
        """
        # TODO
        if len(arr) < padded_len:
            arr.extend([0]*(padded_len-len(arr)))
            return arr
        elif len(arr) > padded_len:
            return arr[:padded_len]
        else:
            return arr

In [10]:
## Main function
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    preprocess = Preprocess(train_X, train_Y)
    print('===preprocess finish===\n')
    
    
    # Get word embedding vectors
    embedding = preprocess.get_embedding(load=False)
    print('===embedding===\n',embedding)
    
    # Get word indices
    data, label = preprocess.get_indices()
#     print('data,label:',data,label)
#     print(data[:10],label[:10])
    # Split train and validation set and create data loader
    #===TODO===
    
    
    train_set, valid_set = train_test_split(data, label)
    
    
#     train_dataset = hw3_dataset(train_set,transform)
    train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

#     valid_dataset = hw3_dataset(valid_set,transform)
    valid_loader = DataLoader(valid_set, batch_size=128, shuffle=False)
    

#     train_loader = None
#     val_loader = None
    # Get model
    model = LSTM_Net(embedding, word_dim, hidden_dim, num_layers)
    model = model.to(device)

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
        
    # Start training
    training(train_loader, valid_loader, model, device) #===todo: see operation===
    
    

    return preprocess,embedding
    
## Argparse
if __name__ == "__main__":

    model_dir = 'data'
    train_X = 'data/train_x.csv'
    train_Y = 'data/train_y.csv'
    lr = 0.001
    batch = 128
    epoch = 100
    num_layers = 1
    seq_len = 30
    word_dim = 100 # word2vector size
    hidden_dim = 100
    wndw = 3
    cnt = 1 # word2vector min_count
    
    
    
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('model_dir', type=str, help='[Output] Your model checkpoint directory')
#     parser.add_argument('jieba_lib',type=str, help='[Input] Your jieba dict.txt.big')
    parser.add_argument('train_X',type=str, help='[Input] Your train_x.csv')
    parser.add_argument('train_Y',type=str, help='[Input] Your train_y.csv')

    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--batch', default=128, type=int)
    parser.add_argument('--epoch', default=10, type=int)
    parser.add_argument('--num_layers', default=1, type=int)
    parser.add_argument('--seq_len', default=30, type=int)
    parser.add_argument('--word_dim', default=100, type=int)
    parser.add_argument('--hidden_dim', default=100, type=int)
    parser.add_argument('--wndw', default=3, type=int)
    parser.add_argument('--cnt', default=3, type=int)
    args = parser.parse_args()
    main(args)
    '''

    preprocess_object,global_embedding = main() # preprocess object

cuda
===preprocess finish===

=== Get embedding
=== get words #4628



=== get words #4629=== get words #4630=== get words #4631=== get words #4632=== get words #4633=== get words #4634=== get words #4635=== get words #4636=== get words #4637=== get words #4638=== get words #4639=== get words #4640=== get words #4641=== get words #4642=== get words #4643=== get words #4644=== get words #4645=== get words #4646=== get words #4647=== get words #4648=== get words #4649=== get words #4650=== get words #4651=== get words #4652=== get words #4653=== get words #4654=== get words #4655=== get words #4656=== get words #4657=== get words #4658=== get words #4659=== get words #4660=== get words #4661=== get words #4662=== get words #4663=== get words #4664=== get words #4665=== get words #4666=== get words #4667=== get words #4668=== get words #4669=== get words #4670=== get words #4671=== get words #4672=== get words #4673=== get words #4674=== get words #4675=== get words #4676=== get words #4677=== get words #4678

=== get words #8933=== get words #8934=== get words #8935=== get words #8936=== get words #8937=== get words #8938=== get words #8939=== get words #8940=== get words #8941=== get words #8942=== get words #8943=== get words #8944=== get words #8945=== get words #8946=== get words #8947=== get words #8948=== get words #8949=== get words #8950=== get words #8951=== get words #8952=== get words #8953=== get words #8954=== get words #8955=== get words #8956=== get words #8957=== get words #8958=== get words #8959=== get words #8960=== get words #8961=== get words #8962=== get words #8963=== get words #8964=== get words #8965=== get words #8966=== get words #8967=== get words #8968=== get words #8969=== get words #8970=== get words #8971=== get words #8972=== get words #8973=== get words #8974=== get words #8975=== get words #8976=== get words #8977=== get words #8978=== get words #8979=== get words #8980=== get words #8981=== get words #8982

=== get words #13588=== get words #13589=== get words #13590=== get words #13591=== get words #13592=== get words #13593=== get words #13594=== get words #13595=== get words #13596=== get words #13597=== get words #13598=== get words #13599=== get words #13600=== get words #13601=== get words #13602=== get words #13603=== get words #13604=== get words #13605=== get words #13606=== get words #13607=== get words #13608=== get words #13609=== get words #13610=== get words #13611=== get words #13612=== get words #13613=== get words #13614=== get words #13615=== get words #13616=== get words #13617=== get words #13618=== get words #13619=== get words #13620=== get words #13621=== get words #13622=== get words #13623=== get words #13624=== get words #13625=== get words #13626=== get words #13627=== get words #13628=== get words #13629=== get words #13630=== get words #13631=== get words #13632=== get words #13633=== get words #13634=== get words

=== total words: 16447
===embedding===
 tensor([[ 1.3176, -3.1597,  1.5045,  ..., -1.2440,  1.9542,  0.2700],
        [-0.5555, -0.2545, -0.0964,  ..., -0.0141, -0.3422,  0.0968],
        [ 0.0746, -0.3593,  0.2356,  ...,  0.4048, -0.0708,  0.0406],
        ...,
        [ 0.0791, -0.0138,  0.1861,  ..., -0.0136,  0.0309, -0.0891],
        [ 0.9534,  0.6612,  0.4687,  ...,  0.2758,  0.6176,  0.0306],
        [ 0.2353,  0.0128,  0.7774,  ...,  0.0223,  0.0676,  0.0795]])


=== sentence count #1=== sentence count #2=== sentence count #3=== sentence count #4=== sentence count #5=== sentence count #6=== sentence count #7=== sentence count #8=== sentence count #9=== sentence count #10=== sentence count #11=== sentence count #12=== sentence count #13=== sentence count #14=== sentence count #15=== sentence count #16=== sentence count #17=== sentence count #18=== sentence count #19=== sentence count #20=== sentence count #21=== sentence count #22=== sentence count #23=== sentence count #24=== sentence count #25=== sentence count #26=== sentence count #27=== sentence count #28=== sentence count #29=== sentence count #30=== sentence count #31=== sentence count #32=== sentence count #33=== sentence count #34=== sentence count #35=== sentence count #36=== sentence count #37=== sentence count #38=== sentence count #39=== sentence count #40=== sentence count #41=== sentence count #42=== sentence count #43=== sentence count #

=== sentence count #9348=== sentence count #9349=== sentence count #9350=== sentence count #9351=== sentence count #9352=== sentence count #9353=== sentence count #9354=== sentence count #9355=== sentence count #9356=== sentence count #9357=== sentence count #9358=== sentence count #9359=== sentence count #9360=== sentence count #9361=== sentence count #9362=== sentence count #9363=== sentence count #9364=== sentence count #9365=== sentence count #9366=== sentence count #9367=== sentence count #9368=== sentence count #9369=== sentence count #9370=== sentence count #9371=== sentence count #9372=== sentence count #9373=== sentence count #9374=== sentence count #9375=== sentence count #9376=== sentence count #9377=== sentence count #9378=== sentence count #9379=== sentence count #9380=== sentence count #9381=== sentence count #9382=== sentence count #9383=== sentence count #9384=== sentence count #9385=== sentence count #9386=== sentence count #9387


=== start training, parameter total:1725601, trainable:80901
[ Epoch1 == 94/94 ] loss:0.635 acc:49.219 
Train | Loss:0.64123 Acc: 66.431 
Valid | Loss:0.63184 Acc: 65.000 
save model with acc 65.000
[ Epoch2 == 24/94 ] loss:0.687 acc:61.719 

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


[ Epoch2 == 94/94 ] loss:0.622 acc:50.781 
Train | Loss:0.63459 Acc: 66.556 
Valid | Loss:0.61589 Acc: 65.000 
[ Epoch3 == 94/94 ] loss:0.620 acc:47.656 
Train | Loss:0.60524 Acc: 68.243 
Valid | Loss:0.56847 Acc: 69.375 
save model with acc 69.375
[ Epoch4 == 94/94 ] loss:0.603 acc:52.344 
Train | Loss:0.56802 Acc: 72.249 
Valid | Loss:0.54860 Acc: 71.953 
save model with acc 71.953
[ Epoch5 == 94/94 ] loss:0.505 acc:55.469 
Train | Loss:0.53705 Acc: 74.318 
Valid | Loss:0.53804 Acc: 72.812 
save model with acc 72.812
[ Epoch6 == 94/94 ] loss:0.473 acc:60.938 
Train | Loss:0.51389 Acc: 75.665 
Valid | Loss:0.53432 Acc: 72.969 
save model with acc 72.969
[ Epoch7 == 94/94 ] loss:0.536 acc:54.688 
Train | Loss:0.50369 Acc: 76.521 
Valid | Loss:0.52610 Acc: 73.125 
save model with acc 73.125
[ Epoch8 == 94/94 ] loss:0.497 acc:58.594 
Train | Loss:0.49211 Acc: 77.227 
Valid | Loss:0.52328 Acc: 73.594 
save model with acc 73.594
[ Epoch9 == 94/94 ] loss:0.381 acc:62.500 
Train | Loss:0.480

[ Epoch73 == 94/94 ] loss:0.228 acc:67.969  
Train | Loss:0.08449 Acc: 97.224 
Valid | Loss:1.31160 Acc: 66.875 
[ Epoch74 == 94/94 ] loss:0.078 acc:73.438  
Train | Loss:0.07939 Acc: 97.257 
Valid | Loss:1.56065 Acc: 65.938 
[ Epoch75 == 94/94 ] loss:0.137 acc:71.094  
Train | Loss:0.06831 Acc: 97.631 
Valid | Loss:1.59114 Acc: 67.500 
[ Epoch76 == 94/94 ] loss:0.096 acc:73.438  
Train | Loss:0.06442 Acc: 97.781 
Valid | Loss:1.58198 Acc: 67.891 
[ Epoch77 == 94/94 ] loss:0.092 acc:74.219  
Train | Loss:0.06855 Acc: 97.565 
Valid | Loss:1.52259 Acc: 66.797 
[ Epoch78 == 94/94 ] loss:0.089 acc:73.438  
Train | Loss:0.06902 Acc: 97.573 
Valid | Loss:1.73794 Acc: 64.453 
[ Epoch79 == 94/94 ] loss:0.042 acc:74.219  
Train | Loss:0.07164 Acc: 97.465 
Valid | Loss:1.55112 Acc: 66.484 
[ Epoch80 == 94/94 ] loss:0.159 acc:71.875  
Train | Loss:0.07001 Acc: 97.532 
Valid | Loss:1.40879 Acc: 65.938 
[ Epoch81 == 94/94 ] loss:0.118 acc:71.875  
Train | Loss:0.07074 Acc: 97.415 
Valid | Loss:1.67

In [8]:
#===find most similar word===
res = Word2Vec.load(preprocess_object.save_name).most_similar('trump')
for item in res: 
      print(item[0] + ':' + str(item[1]))

du:0.6380078196525574
takeastand:0.6009941697120667
desert:0.5972704887390137
croatian:0.5920993089675903
trumps:0.5760598182678223
hillary:0.572769284248352
undone:0.5706384181976318
beloved:0.5701844692230225
bitterly:0.5632200837135315
surprise:0.5582342147827148


  


In [186]:
#===save tokenized comments===
# import numpy as np
# np.save('all_training_tokens', preprocess_object.data)

In [9]:
#===test word2vecotr
# len(preprocess_object.data) #13240
for i in preprocess_object.data:
    print(i)

['trump', 'league', 'billionaire', 'negotiator', 'extraordinaire', 'jan', 'resister', 'ill', 'trump', 'maga', 'trump2020', 'url']
['@user', 'ask', 'tough', 'gun', 'control', 'law', 'work', 'chicago']
['@user', 'gosh', 'dangit', 'reasonable', 'replica', 'gun', 'control', 'craziness', 'happen']
['@user', '@user', '@user', 'mean', 'dramatic', 'promise', 'come', 'true', 'sure', 'follow', 'start', 'realize', 'crap']
['dead', 'mass', 'shooting', 'fifth', 'bank', 'building', 'downtown', 'cincinnati', 'deep', 'state', 'fake', 'news', 'crisis', 'actor', 'alert', 'gun', 'control', 'bullshit', "f'off", 'feinstein', 'obama', 'url']
['@user', 'thank', 'god', 'single', 'day', 'maga', 'kag']
['@user', 'bless', 'little', 'entitlement', 'self', 'watch', 'game']
['@user', 'definitely', 'man', 'god']
['@user', 'yes', 'infuriating']
['@user', 'sexual', 'assault', 'agree', 'feel', 'pick', 'pace', 'foot', 'turn', 'push', 'cock', 'big', 'toe', 'toe', 'lift', 'gland', 'mm']
['@user', '@user', '@user', '@user'

['@user', '@user', '@user', '@user', '@user', '@user', '@user', '@user', '@user', '@user', '@user', '@user', '@user', 'want', 'corrupt', 'amp', 'cushioney', 'jobs', 'thry', 'want', 'business', 'usual', '         ', "don't", 'think']
['@user', '@user', 'think', '@user', 'corrupt', 'biter', 'snake']
['@user', '@user', 'brandon', 'lewis', 'express', 'url']
['bizpac', 'review', 'news', '    ', 'priceless', 'moment', 'rachel', 'maddow', 'invoke', 'bill', 'clinton', 'ask', 'hillary', 'kavanaugh', 'process', 'url', 'maga', '2a', 'prolife']
['@user', 'know', 'fire', 'think', 'hammer', 'pant', 'walk', 'find', 'coochie', 'lol']
['@user', 'real', 'question', 'ted', 'cruz', 'drop', 'bomb', 'life', 'disqualify', 'public', 'office', 'grab', 'pussy', 'literally', 'amp', 'verbally', 'lot', 'offensive']
['@user', '@user', '@user', 'singular', 'wipe', 'number', 'lib', 'dem', 'council', 'north', 'bed', 'conservatives', 'lose', 'local', 'election', 'seat', 'try', 'turn', 'democratic', 'vote', 'brexit', 'u

['@user', '@user', '@user', 'truly', 'little', 'goddess']
['@user', 'democrats', 'thug']
['@user', 'people', 'actually', 'watch', 'emmys', 'low', 'intellect', 'liberal', 'crackhead']
['@user', '@user', 'poor', 'comparison', 'jay', 'literally', 'fuck', 'nas', 'baby', 'mom', 'leave', 'condom', 'daughter', 'baby', 'seat', 'cardi', 'nicki', 'fight']
['@user', 'good', 'antifa', 'violent', 'fascism']
['@user', '@user', 'want', 'corrupt', 'bitch', 'gitmo', 'birthday', 'url']
['soros', 'money', 'pocket', 'maga', 'url']
['@user', '@user', 'people', 'benefit', 'medical', 'research', 'hope', 'work', 'matt', 'maga']
['@user', '@user', '@user', 'nice']
['@user', 'california', 'gun', 'control', 'law', 'prevent']
['@user', 'ask', 'tho']
['@user', '@user', '@user', 'surprised', 'promote', 'antifa', 'violence', 'public', 'suddenly', 'accuse', 'violence', 'domestic', 'violence', 'etc']
['@user', '@user', 'good', 'people', 'bad', 'white', 'suprematist', 'people', 'antifa', 'nuance']
['@user', 'need', 'st

In [200]:
#===finish test code===

# use_gpu = torch.cuda.is_available()
# model = LSTM_Net(global_embedding, word_dim, hidden_dim, num_layers) # model architecture
# if use_gpu:
#     model.cuda()
    
#===load model===
# model.load_state_dict(torch.load('model_81_best.pth'))
# model.load_state_dict(torch.load('data/ckpt_68.389'))
model = torch.load('data/ckpt_72.656')
device = torch.device('cuda')
model = model.to(device)
model.eval()



LSTM_Net(
  (embedding): Embedding(16447, 100)
  (lstm): LSTM(100, 100, batch_first=True)
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=100, out_features=1, bias=True)
    (2): Sigmoid()
  )
)

In [201]:
#===Preprocess test data===
class Preprocess_test_data():
    def __init__(self, data_dir):
        # Load jieba library
#         jieba.load_userdict(args.jieba_lib)
        self.embed_dim = word_dim
        self.seq_len = seq_len
        self.wndw_size = wndw
        self.word_cnt = cnt
        self.save_name = 'word2vec'
        self.index2word = []
        self.word2index = {}
        self.vectors = []
        self.unk = "<UNK>"
        self.pad = "<PAD>"
        # Load corpus
        if data_dir!=None:
            # Read data
            dm = pd.read_csv(data_dir)            
            data = dm['comment']
#             print(data)
            # Tokenize with multiprocessing
            # List in list out with same order
            # Multiple workers
#             P = Pool(processes=4) 
#             data = P.map(self.tokenize, data)
#             P.close() #關閉進程池，阻止更多的任務提交到進程池Pool，待任務完成後，工作進程會退出
#             P.join() #等待工作線程的退出，必須在close()或terminate()之後使用，因被終止的進程需要被父進程調用wait（join等價於wait）,否則進程會成為僵屍進程
#             self.data = data
            
            #===tokenize self_method===
#             self.data = np.load('all_tokens.npy',allow_pickle=True)
            all_comment_tokens = []
            for comment in data:
                comment = deEmojify(comment)
                token_list = self.tokenize(comment)
                all_comment_tokens.append(token_list)

            self.data = all_comment_tokens
            

    def tokenize(self, sentence):
        """ Use jieba to tokenize a sentence.
        Args:
            sentence (str): One string.
        Return:
            tokens (list of str): List of tokens in a sentence.
        """
        # TODO
        tokens = []
        for token in nlp(sentence):
            token = token.lemma_ # lemmatization (text normalize)
            token = str(token).lower()      # transform spacy to string
            
            lexeme = nlp.vocab[token] # remove stop words, check if stopwords
            if lexeme.is_stop == False and not token.isdigit() and token not in ['//','v.','1/2','......','.....','....','--','1/3','him.why','.lol','.....','️','️-',':','.@user','=','-pron-',' ','  ','   ','#','!','?','...','..','.','"','/','@',"'",'’','%','&',';','-','(',')',',','+']:
#                 if len(token) > 1:
                tokens.append(token) 

        return tokens
    
    def get_embedding(self, load=True):
        print("=== Get embedding===")
        # Get Word2vec word embedding
        if load:
            embed = Word2Vec.load('test_data_word2vector')
        else:
            embed = Word2Vec(self.data, size=self.embed_dim, window=self.wndw_size, min_count=self.word_cnt, iter=16, workers=8)
            embed.save('test_data_word2vector')
        # Create word2index dictinonary
        # Create index2word list
        # Create word vector list
        for i, word in enumerate(embed.wv.vocab):
            print('=== get words #{}'.format(i+1), end='\r')
            #e.g. self.word2index['魯'] = 1 
            #e.g. self.index2word[1] = '魯'
            #e.g. self.vectors[1] = '魯' vector
            self.word2index[word] = len(self.word2index)
            self.index2word.append(word)
            self.vectors.append(embed[word])
        self.vectors = torch.tensor(self.vectors)
        # Add special tokens
        self.add_embedding(self.pad)
        self.add_embedding(self.unk)
        print("=== total words: {}".format(len(self.vectors)))
        return self.vectors

    def add_embedding(self, word):
        # Add random uniform vector
        vector = torch.empty(1, self.embed_dim)
        torch.nn.init.uniform_(vector)
        self.word2index[word] = len(self.word2index)
        self.index2word.append(word)
        self.vectors = torch.cat([self.vectors, vector], 0)

    def get_indices(self,test = True):
        # Transform each words to indices
        # e.g. if 機器=0,學習=1,好=2,玩=3 
        # [機器,學習,好,好,玩] => [0, 1, 2, 2,3]
        all_indices = []
        # Use tokenized data
        for i, sentence in enumerate(self.data):
            print('=== sentence count #{}'.format(i+1), end='\r')
            
            sentence_indices = []
            for word in sentence:
                # if word in word2index append word index into sentence_indices
                # if word not in word2index append unk index into sentence_indices
                # TODO
                if word in self.word2index:
                    sentence_indices.append(self.word2index[word])
                else:
                    sentence_indices.append(self.word2index['<UNK>'])

            
            # pad all sentence to fixed length
            sentence_indices = self.pad_to_len(sentence_indices, self.seq_len, self.word2index[self.pad])
            all_indices.append(sentence_indices)
            
        if test:
            return torch.LongTensor(all_indices)         
        else:
            return torch.LongTensor(all_indices), torch.LongTensor(self.label)    
            
    def pad_to_len(self, arr, padded_len, padding=0):
        """ 
        if len(arr) < padded_len, pad arr to padded_len with padding.
        If len(arr) > padded_len, truncate arr to padded_len.
        Example:
            pad_to_len([1, 2, 3], 5, 0) == [1, 2, 3, 0, 0]
            pad_to_len([1, 2, 3, 4, 5, 6], 5, 0) == [1, 2, 3, 4, 5]
        Args:
            arr (list): List of int.
            padded_len (int)
            padding (int): Integer used to pad.
        Return:
            arr (list): List of int with size padded_len.
        """
        # TODO
        if len(arr) < padded_len:
            arr.extend([0]*(padded_len-len(arr)))
            return arr
        elif len(arr) > padded_len:
            return arr[:padded_len]
        else:
            return arr

In [202]:
#===load test data===


preprocess = Preprocess_test_data('data/test_x.csv')
# print(preprocess)
    

# Get word embedding vectors
embedding = preprocess.get_embedding(load=False)
# print(embedding)

# Get word indices
data = preprocess.get_indices()
print('data:',data,len(data))


#===create data loader===


# train_set, valid_set = train_test_split(data, label)

# #     train_dataset = hw3_dataset(train_set,transform)
# train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

# #     valid_dataset = hw3_dataset(valid_set,transform)
# valid_loader = DataLoader(valid_set, batch_size=128, shuffle=False)
    


# test_set = load_test_data('data/test_img/test_img/')

# test_set = load_test_data(sys.argv[1])
# test_dataset = hw3_test_dataset(test_set,transform)
test_loader = DataLoader(data, batch_size=128, shuffle=False)








=== Get embedding===
=== get words #4392



=== total words: 4394
=== sentence count #1=== sentence count #2=== sentence count #3=== sentence count #4=== sentence count #5=== sentence count #6=== sentence count #7=== sentence count #8=== sentence count #9=== sentence count #10=== sentence count #11=== sentence count #12=== sentence count #13=== sentence count #14=== sentence count #15=== sentence count #16=== sentence count #17=== sentence count #18=== sentence count #19=== sentence count #20=== sentence count #21=== sentence count #22=== sentence count #23=== sentence count #24=== sentence count #25=== sentence count #26=== sentence count #27=== sentence count #28=== sentence count #29=== sentence count #30=== sentence count #31=== sentence count #32=== sentence count #33=== sentence count #34=== sentence count #35=== sentence count #36=== sentence count #37=== sentence count #38=== sentence count #39=== sentence count #40=== sentence count #41=== sentence count #42=== sentence count #4

In [203]:
#===get prediction

prediction = []
pred_output = []
with torch.no_grad():

    for i, (inputs) in enumerate(test_loader):
        inputs = inputs.to(device, dtype=torch.long)
#         labels = labels.to(device, dtype=torch.float)
        outputs = model(inputs)
#         outputs = outputs.squeeze()
        print(len(outputs))
        pred_output.extend(outputs)
        _, pred_label = torch.max(outputs, 1)
#         print(len(pred_label))
        prediction.extend(pred_label)
    
#         loss = criterion(outputs, labels)
#         correct = evaluation(outputs, labels)
#         total_acc += (correct / batch_size)
#         total_loss += loss.item()

#     print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
#     if total_acc > best_acc:
#         best_acc = total_acc
#         torch.save(model, "{}/ckpt_{:.3f}".format(model_dir,total_acc/v_batch*100))
#         print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
# model.train()

128
128
128
128
128
128
92


In [204]:
result = []
for i in range(len(pred_output)):
    if pred_output[i].item() >= 0.5:
        result.append(1)
    else:
        result.append(0)
#     print(pred_output[i].item())
print(result[:30])

[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [205]:
#===============================
# 開啟輸出的 CSV 檔案
import csv
with open('ans_72.656.csv', 'w', newline='') as csvFile:
  # 建立 CSV 檔寫入器
    writer = csv.writer(csvFile)

  # 寫入標題
    writer.writerow(['id','label'])
    
  # 寫入資料
    for i, answer in zip(range(0,860), result): 
        writer.writerow([i, answer])
print('complete!')

complete!


In [134]:
#===transform token space type to str===

spacy_tokens_2_str = []
for comment in preprocess_object.data:
    spacy_tokens_2_str.append([str(token) for token in comment])
print(spacy_tokens_2_str)



In [24]:
preprocess_object.data

[['trump',
  'league',
  'billionaire',
  'negotiator',
  'extraordinaire',
  'Jan',
  'Resister',
  '🤔',
  'Trump',
  'maga',
  'Trump2020'],
 ['ask', 'tough', 'gun', 'control', 'law', 'work', 'chicago'],
 ['gosh',
  'dangit',
  'reasonable',
  'replica',
  'gun',
  'control',
  'craziness',
  'happen',
  '😉'],
 ['mean',
  'dramatic',
  'promise',
  'come',
  'true',
  'sure',
  'follow',
  'start',
  'realize',
  'crap'],
 ['dead',
  'mass',
  'shooting',
  'Fifth',
  'Bank',
  'building',
  'downtown',
  'Cincinnati',
  'deep',
  'State',
  'Fake',
  'News',
  'Crisis',
  'Actor',
  'Alert',
  'Gun',
  'Control',
  'bullshit',
  "f'off",
  'Feinstein',
  'Obama'],
 ['thank', 'God', 'single', 'day', 'MAGA', 'kag'],
 ['bless', 'little', 'entitlement', 'self', 'watch', 'game'],
 ['definitely', 'man', 'God'],
 ['yes', 'infuriating'],
 ['sexual',
  'assault',
  'agree',
  'feel',
  'pick',
  'pace',
  'foot',
  'turn',
  'push',
  'cock',
  'big',
  'toe',
  'toe',
  'lift',
  'gland',
 

In [200]:
# if '<UNK>' in preprocess_object.word2index:
#     print('true')


In [201]:
#===get_indices function test===
#def get_indices(self,test=False):
# Transform each words to indices
# e.g. if 機器=0,學習=1,好=2,玩=3 
# [機器,學習,好,好,玩] => [0, 1, 2, 2,3]

test=False
all_indices = []
# Use tokenized data
for i, sentence in enumerate(preprocess_object.data):
    print('=== sentence count #{}'.format(i+1), end='\r')

    sentence_indices = []
    for word in sentence:
        # if word in word2index append word index into sentence_indices
        # if word not in word2index append unk index into sentence_indices
        # TODO
        if word in preprocess_object.word2index:
            sentence_indices.append(preprocess_object.word2index[word])
        else:
            sentence_indices.append(preprocess_object.word2index['<UNK>'])
#     print(sentence_indices)

    # pad all sentence to fixed length
    
    sentence_indices = pad_to_len(sentence_indices, preprocess_object.seq_len, preprocess_object.word2index[preprocess_object.pad])
    all_indices.append(sentence_indices)

# print(all_indices)



if test:
    print(torch.LongTensor(all_indices))   
#     return torch.LongTensor(all_indices)         
else:
    print(torch.LongTensor(all_indices), torch.LongTensor(preprocess_object.label))
#     return torch.LongTensor(all_indices), torch.LongTensor(preprocess_object.label)   


=== sentence count #1=== sentence count #2=== sentence count #3=== sentence count #4=== sentence count #5=== sentence count #6=== sentence count #7=== sentence count #8=== sentence count #9=== sentence count #10=== sentence count #11=== sentence count #12=== sentence count #13=== sentence count #14=== sentence count #15=== sentence count #16=== sentence count #17=== sentence count #18=== sentence count #19=== sentence count #20=== sentence count #21=== sentence count #22=== sentence count #23=== sentence count #24=== sentence count #25=== sentence count #26=== sentence count #27=== sentence count #28=== sentence count #29=== sentence count #30=== sentence count #31=== sentence count #32=== sentence count #33=== sentence count #34=== sentence count #35=== sentence count #36=== sentence count #37=== sentence count #38=== sentence count #39=== sentence count #40=== sentence count #41=== sentence count #42=== sentence count #43=== sentence count #

=== sentence count #4266=== sentence count #4267=== sentence count #4268=== sentence count #4269=== sentence count #4270=== sentence count #4271=== sentence count #4272=== sentence count #4273=== sentence count #4274=== sentence count #4275=== sentence count #4276=== sentence count #4277=== sentence count #4278=== sentence count #4279=== sentence count #4280=== sentence count #4281=== sentence count #4282=== sentence count #4283=== sentence count #4284=== sentence count #4285=== sentence count #4286=== sentence count #4287=== sentence count #4288=== sentence count #4289=== sentence count #4290=== sentence count #4291=== sentence count #4292=== sentence count #4293=== sentence count #4294=== sentence count #4295=== sentence count #4296=== sentence count #4297=== sentence count #4298=== sentence count #4299=== sentence count #4300=== sentence count #4301=== sentence count #4302=== sentence count #4303=== sentence count #4304=== sentence count #4305

=== sentence count #8953=== sentence count #8954=== sentence count #8955=== sentence count #8956=== sentence count #8957=== sentence count #8958=== sentence count #8959=== sentence count #8960=== sentence count #8961=== sentence count #8962=== sentence count #8963=== sentence count #8964=== sentence count #8965=== sentence count #8966=== sentence count #8967=== sentence count #8968=== sentence count #8969=== sentence count #8970=== sentence count #8971=== sentence count #8972=== sentence count #8973=== sentence count #8974=== sentence count #8975=== sentence count #8976=== sentence count #8977=== sentence count #8978=== sentence count #8979=== sentence count #8980=== sentence count #8981=== sentence count #8982=== sentence count #8983=== sentence count #8984=== sentence count #8985=== sentence count #8986=== sentence count #8987=== sentence count #8988=== sentence count #8989=== sentence count #8990=== sentence count #8991=== sentence count #8992

tensor([[   0,    1,    2,  ...,   13,    9,   14],
        [  16,   17,   18,  ...,    0,    0,    0],
        [  16,   34, 7626,  ...,    0,    0,    0],
        ...,
        [  16,   16, 3137,  ...,    0,    0,    0],
        [  16, 1337,  576,  ..., 1457,   68, 2366],
        [  16,  187,  139,  ...,    0,    0,    0]]) tensor([0, 0, 0,  ..., 0, 1, 1])


In [190]:
def pad_to_len(arr, padded_len, padding=0):
    """ 
    if len(arr) < padded_len, pad arr to padded_len with padding.
    If len(arr) > padded_len, truncate arr to padded_len.
    Example:
        pad_to_len([1, 2, 3], 5, 0) == [1, 2, 3, 0, 0]
        pad_to_len([1, 2, 3, 4, 5, 6], 5, 0) == [1, 2, 3, 4, 5]
    Args:
        arr (list): List of int.
        padded_len (int)
        padding (int): Integer used to pad.
    Return:
        arr (list): List of int with size padded_len.
    """
    # TODO
    if len(arr) < padded_len:
        arr.extend([0]*(padded_len-len(arr)))
        return arr
    elif len(arr) > padded_len:
        return arr[:padded_len]
    else:
        return arr
    
    
    

In [162]:
# testarr = [1,1]
# testarr.extend([0]*10)
testarr[:3]

[1, 1, 0]

In [117]:

print("=== Get embedding===")
# Get Word2vec word embedding
load = True

if load:
    embed = Word2Vec.load(test.save_name) # load word2vector model
else:
    embed = Word2Vec(comment_2_str, size=test.embed_dim, window=test.wndw_size, min_count=test.word_cnt, iter=16, workers=8)
    embed.save(test.save_name) # save word2vector model
    print('save word2vector model',embed)
    
# Create word2index dictinonary
# Create index2word list
# Create word vector list
for i, word in enumerate(embed.wv.vocab):
    print('=== get words #{}'.format(i+1), end='\r')
    #e.g. test.word2index['魯'] = 1 
    #e.g. test.index2word[1] = '魯'
    #e.g. test.vectors[1] = '魯' vector
    test.word2index[word] = len(test.word2index)
    test.index2word.append(word)
    test.vectors.append(embed[word])
        
test.vectors = torch.tensor(test.vectors)
# Add special tokens
test.add_embedding(test.pad)
test.add_embedding(test.unk)

print("=== total words: {}".format(len(test.vectors)))
print(test.vectors)

=== Get embedding===
=== get words #1=== get words #2=== get words #3=== get words #4=== get words #5=== get words #6=== get words #7=== get words #8=== get words #9=== get words #10=== get words #11=== get words #12=== get words #13=== get words #14=== get words #15=== get words #16=== get words #17=== get words #18=== get words #19=== get words #20=== get words #21=== get words #22=== get words #23=== get words #24=== get words #25=== get words #26=== get words #27=== get words #28=== get words #29=== get words #30=== get words #31=== get words #32=== get words #33=== get words #34=== get words #35=== get words #36=== get words #37=== get words #38=== get words #39=== get words #40=== get words #41=== get words #42=== get words #43=== get words #44=== get words #45=== get words #46=== get words #47=== get words #48=== get words #49=== get words #50=== get words #51=== get words #52=== get words #53=== get words #54=== get words #5



=== get words #3477=== get words #3478=== get words #3479=== get words #3480=== get words #3481=== get words #3482=== get words #3483=== get words #3484=== get words #3485=== get words #3486=== get words #3487=== get words #3488=== get words #3489=== get words #3490=== get words #3491=== get words #3492=== get words #3493=== get words #3494=== get words #3495=== get words #3496=== get words #3497=== get words #3498=== get words #3499=== get words #3500=== get words #3501=== get words #3502=== get words #3503=== get words #3504=== get words #3505=== get words #3506=== get words #3507=== get words #3508=== get words #3509=== get words #3510=== get words #3511=== get words #3512=== get words #3513=== get words #3514=== get words #3515=== get words #3516=== get words #3517=== get words #3518=== get words #3519=== get words #3520=== get words #3521=== get words #3522=== get words #3523=== get words #3524=== get words #3525=== get words #3526

=== total words: 53377
tensor([[ 1.1421,  1.3184,  0.7643,  ...,  0.6333,  0.1408,  0.6814],
        [-0.0816,  1.0924, -0.0502,  ...,  0.7942,  0.4352, -0.3065],
        [ 0.8795,  1.6503, -0.1953,  ..., -1.9755, -0.7535,  0.3954],
        ...,
        [ 0.0596, -0.1722, -0.0529,  ..., -0.2195,  0.0097,  0.1034],
        [ 0.6344,  0.6148,  0.2815,  ...,  0.5621,  0.2532,  0.4599],
        [ 0.2246,  0.3435,  0.4203,  ...,  0.6584,  0.7291,  0.5905]])


In [105]:
#===find most similar word===
# res = Word2Vec.load(test.save_name).most_similar('Trump')
# for item in res: 
#       print(item[0] + ':' + str(item[1]))

Kavanaugh:0.7416951656341553
Obama:0.7115967273712158
trump:0.6636394262313843
Judge:0.6272079944610596
Hillary:0.6259853839874268
America:0.6091712713241577
Canada:0.5998584032058716
elected:0.5847217440605164
President:0.5690690279006958
Mueller:0.56206214427948


  """Entry point for launching an IPython kernel.


In [61]:
# all_tokens = np.load('all_tokens.npy',allow_pickle=True)
all_tokens

array([Trump... an Ive league billionaire and negotiator extraordinaire!! Jan... #Resister !! 🤔 I’ll take Trump!!! #MAGA #Trump2020 URL,
       @user Just ask them how does having some of the toughest gun control laws working out for chicago,
       @user Gosh dangit!!!    IF we only had reasonable replica gun control this craziness wouldn’t happen!!   😉,
       ..., @user @user Need help packing?,
       @user White liberals like you and your colored ilk are easily worse than bigots. You are nothing but a coward and victim-hood enabler who wants to keep my kind impoverished via welfare and identity politics. Thank God more people are seeing your evil for what it is,
       @user She is a liar], dtype=object)

In [49]:
#===save tokenized comments===
# import numpy as np
# np.save('all_tokens', test.data)