See links for instructions on installation if not already installed.
 - [pandas](https://pypi.org/project/pandas/) 1.3.3
 - [nltk](https://www.nltk.org/install.html) 3.6.1
 - [spacy](https://spacy.io/usage) 2.2.2
    (and download the pipeline with `en_core_web_sm`)
 - [tokenizers](https://pypi.org/project/tokenizers/) 0.10.3
 

## Tokenization

In [1]:
import pandas as pd

corpus = pd.read_csv('dataset/IRAhandle_tweets_1.csv')['content'].tolist()

In [2]:
from collections import Counter

def print_stats(tokens):
    '''
    Prints out tokenization stats of the corpus
    
    Input:
        tokens (list): a list of all tokens in the entire corpus
    
    Output:
        - number of types (e.g., vocab size)
        - number of tokens
        - type/token ratio of all tweets
        - top 10 tokens in terms of frequency in the vocabulary
    '''
    
    num_type = len(set(tokens))
    num_token = len(tokens)
    ratio = num_type/num_token
    
    counter = Counter()
    for token in tokens:
        counter[token] += 1
            
    top_tokens = sorted(counter, key = counter.get, reverse = True)[:10]
    
    return num_type, num_token, ratio, top_tokens

def tokenize_texts(corpus, tokenize_func, single_func):
    '''
    Input:
        corpus (list): a list containing the contents of the IRAhandle_tweets_1.csv
        tokenize_func (function): the function to be used for tokenizing the corpus
        single_func (function): tokenization function for individual tweets
        
    Output:
        - prints tokenization status using print_stats()
        - prints tokenization results of the first 3 tweets of the corpus
    '''
    
    tokens = tokenize_func(corpus)
    
    stats = print_stats(tokens)
    print("The number of types is ", stats[0])
    print("The number of tokens is ", stats[1])
    print("The type/token ratio is ", stats[2])
    print("The top 10 tokens in terms of frequency are ", stats[3])
    print("\n")
    
    print("The tokenization results of the first tweet are:\n", single_func(corpus[0].lower()), "\n")
    print("The tokenization results of the second tweet are:\n", single_func(corpus[1].lower()), "\n")
    print("The tokenization results of the second tweet are:\n", single_func(corpus[2].lower()), "\n")


In [3]:
def tokenize_simple(text):
    '''
    A simple tokenizer
    
    Input:
        text (string): tweets from the corpus
    
    Output: a list of tokens
    '''
    
    split_list = []
    
    for tweet in text:
        split_list += tweet.split()
    
    return [token.lower() for token in split_list]

def single_simple(text):
    '''
    A simple tokenizer for individual tweets
    
    Input:
        text (string): tweet from the corpus
        
    Output: a list of tokens
    '''
    
    return text.split()

print('Tokenize by space:\n')
tokenize_texts(corpus, tokenize_simple, single_simple)


Tokenize by space:

The number of types is  554069
The number of tokens is  3267403
The type/token ratio is  0.16957473565397352
The top 10 tokens in terms of frequency are  ['the', 'to', 'a', 'in', 'of', 'is', 'for', 'and', 'в', 'rt']


The tokenization results of the first tweet are:
 ['"we', 'have', 'a', 'sitting', 'democrat', 'us', 'senator', 'on', 'trial', 'for', 'corruption', 'and', "you've", 'barely', 'heard', 'a', 'peep', 'from', 'the', 'mainstream', 'media."', '~', '@nedryun', 'https://t.co/gh6g0d1oic'] 

The tokenization results of the second tweet are:
 ['marshawn', 'lynch', 'arrives', 'to', 'game', 'in', 'anti-trump', 'shirt.', 'judging', 'by', 'his', 'sagging', 'pants', 'the', 'shirt', 'should', 'say', 'lynch', 'vs.', 'belt', 'https://t.co/mlh1i30lzz'] 

The tokenization results of the second tweet are:
 ['daughter', 'of', 'fallen', 'navy', 'sailor', 'delivers', 'powerful', 'monologue', 'on', 'anthem', 'protests,', 'burns', 'her', 'nfl', 'packers', 'gear.', '#boycottnfl', 

### NLTK tokenization

In [4]:
# TODO: write a short script, along with any necessary functions, to implement the NLTK tokenizer.
# You can follow the example in 1.1, but you will need to think about the input parameters to the tokenizer function
# You may find partial in functools a useful thing to use
from nltk.tokenize import WordPunctTokenizer
from functools import partial

def tokenize_nltk_wpt(text):
    '''
    A tokenizer using the tokenize method of nltk WordPunctTokenizer
    
    Input:
        text (string): tweets from the corpus
    
    Output: a list of tokens
    '''
    
    tk = WordPunctTokenizer()
    split_list = []
    
    for tweet in text:
        split_list += tk.tokenize(tweet)

    return [token.lower() for token in split_list]

def wpt_single(text):
    '''
    An nltk tokenizer for individual tweets
    
    Input:
        text (string): tweet from the corpus
        
    Output: a list of tokens
    '''
    
    tk = WordPunctTokenizer()
    
    return tk.tokenize(text)

print("Tokenize with WordPunctTokenizer:\n")
tokenize_texts(corpus, tokenize_nltk_wpt, wpt_single)


Tokenize with WordPunctTokenizer:

The number of types is  417701
The number of tokens is  5605127
The type/token ratio is  0.0745212374313731
The top 10 tokens in terms of frequency are  ['.', 't', '/', 'co', '://', 'https', '#', 'the', ',', "'"]


The tokenization results of the first tweet are:
 ['"', 'we', 'have', 'a', 'sitting', 'democrat', 'us', 'senator', 'on', 'trial', 'for', 'corruption', 'and', 'you', "'", 've', 'barely', 'heard', 'a', 'peep', 'from', 'the', 'mainstream', 'media', '."', '~', '@', 'nedryun', 'https', '://', 't', '.', 'co', '/', 'gh6g0d1oic'] 

The tokenization results of the second tweet are:
 ['marshawn', 'lynch', 'arrives', 'to', 'game', 'in', 'anti', '-', 'trump', 'shirt', '.', 'judging', 'by', 'his', 'sagging', 'pants', 'the', 'shirt', 'should', 'say', 'lynch', 'vs', '.', 'belt', 'https', '://', 't', '.', 'co', '/', 'mlh1i30lzz'] 

The tokenization results of the second tweet are:
 ['daughter', 'of', 'fallen', 'navy', 'sailor', 'delivers', 'powerful', 'mon

In [5]:
from nltk.tokenize import TreebankWordTokenizer

def tokenize_nltk_twt(text):
    '''
    A tokenizer using the tokenize method of nltk TreebankWordTokenizer
    
    Input:
        text (string): tweets from the corpus
    
    Output: a list of tokens
    '''
    
    tk = TreebankWordTokenizer()
    split_list = []
    
    for tweet in text:
        split_list += tk.tokenize(tweet)

    return [token.lower() for token in split_list]

def twt_single(text):
    '''
    An nltk tokenizer for individual tweets
    
    Input:
        text (string): tweet from the corpus
        
    Output: a list of tokens
    '''
    
    tk = TreebankWordTokenizer()
    
    return tk.tokenize(text)

print("Tokenize with TreebankWordTokenizer:\n")
tokenize_texts(corpus, tokenize_nltk_twt, twt_single)


Tokenize with TreebankWordTokenizer:

The number of types is  464905
The number of tokens is  4367567
The type/token ratio is  0.10644484675335261
The top 10 tokens in terms of frequency are  [':', 'https', '#', 'the', ',', '@', 'to', '!', 'a', 'in']


The tokenization results of the first tweet are:
 ['``', 'we', 'have', 'a', 'sitting', 'democrat', 'us', 'senator', 'on', 'trial', 'for', 'corruption', 'and', 'you', "'ve", 'barely', 'heard', 'a', 'peep', 'from', 'the', 'mainstream', 'media.', "''", '~', '@', 'nedryun', 'https', ':', '//t.co/gh6g0d1oic'] 

The tokenization results of the second tweet are:
 ['marshawn', 'lynch', 'arrives', 'to', 'game', 'in', 'anti-trump', 'shirt.', 'judging', 'by', 'his', 'sagging', 'pants', 'the', 'shirt', 'should', 'say', 'lynch', 'vs.', 'belt', 'https', ':', '//t.co/mlh1i30lzz'] 

The tokenization results of the second tweet are:
 ['daughter', 'of', 'fallen', 'navy', 'sailor', 'delivers', 'powerful', 'monologue', 'on', 'anthem', 'protests', ',', 'burn

### spaCy tokenization

In [6]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

def tokenize_spacy(text):
    '''
    A tokenizer using the spaCy tokenization toolkit
    
    Input:
        text (string): tweets from the corpus
    
    Output: a list of tokens
    '''
    
    split_list = []
    
    for tweet in text:
        doc = nlp(tweet)
        split_list += [token.text for token in doc]
    
    return [token.lower() for token in split_list]

def spacy_single(text):
    '''
    A tokenizer using the spaCy tokenization toolkit for individual tweets
    
    Input:
        text (string): tweet from the corpus
    
    Output: a list of tokens
    '''
    
    doc = nlp(text)
    
    return [token.text for token in doc]

print("Tokenize with spaCy:\n")
tokenize_texts(corpus, tokenize_spacy, spacy_single)


Tokenize with spaCy:

The number of types is  427416
The number of tokens is  4062629
The type/token ratio is  0.10520675158868802
The top 10 tokens in terms of frequency are  ['#', '.', 'the', ',', ':', 'to', ' ', '!', 'a', 'in']


The tokenization results of the first tweet are:
 ['"', 'we', 'have', 'a', 'sitting', 'democrat', 'us', 'senator', 'on', 'trial', 'for', 'corruption', 'and', 'you', "'ve", 'barely', 'heard', 'a', 'peep', 'from', 'the', 'mainstream', 'media', '.', '"', '~', '@nedryun', 'https://t.co/gh6g0d1oic'] 

The tokenization results of the second tweet are:
 ['marshawn', 'lynch', 'arrives', 'to', 'game', 'in', 'anti', '-', 'trump', 'shirt', '.', 'judging', 'by', 'his', 'sagging', 'pants', 'the', 'shirt', 'should', 'say', 'lynch', 'vs.', 'belt', 'https://t.co/mlh1i30lzz'] 

The tokenization results of the second tweet are:
 ['daughter', 'of', 'fallen', 'navy', 'sailor', 'delivers', 'powerful', 'monologue', 'on', 'anthem', 'protests', ',', 'burns', 'her', 'nfl', 'packers

### BPE model

In [7]:
train_data = pd.read_csv('dataset/IRAhandle_tweets_1.csv', usecols = ['content'])
train_data = train_data['content'].str.lower()
train_data.to_csv('dataset/IRA_BPE_train.csv', index = False)

In [8]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files = ['dataset/IRA_BPE_train.csv'], trainer = trainer)







In [9]:
def tokenize_bpe(text):
    '''
    A tokenizer that implements the BPE tokenizer
    
    Input:
        text (string): tweets from the corpus
        
    Output: a list of tokens
    '''
    
    split_list = []
    
    for tweet in text:
        output = tokenizer.encode(tweet)
        split_list += output.tokens
    
    return [token.lower() for token in split_list]

def bpe_single(text):
    '''
    A tokenizer that implements the BPE tokenizer for individual tweets
    
    Input:
        text (string): tweet from the corpus
        
    Output: a list of tokens
    '''
    
    output = tokenizer.encode(text)
    return output.tokens

print("Tokenize with BPE:\n")
tokenize_texts(corpus, tokenize_bpe, bpe_single)


Tokenize with BPE:

The number of types is  27739
The number of tokens is  7308449
The type/token ratio is  0.0037954701469490996
The top 10 tokens in terms of frequency are  ['.', 't', '/', 'co', '://', 'https', '#', ',', "'", ':']


The tokenization results of the first tweet are:
 ['"', 'we', 'have', 'a', 'sitting', 'democrat', 'us', 'senator', 'on', 'trial', 'for', 'corruption', 'and', 'you', "'", 've', 'barely', 'heard', 'a', 'peep', 'from', 'the', 'mainstream', 'media', '."', '~', '@', 'ned', 'ry', 'un', 'https', '://', 't', '.', 'co', '/', 'gh', '6', 'g0', 'd1', 'o', 'ic'] 

The tokenization results of the second tweet are:
 ['mar', 'shawn', 'lynch', 'arrives', 'to', 'game', 'in', 'anti', '-', 'trump', 'shirt', '.', 'judging', 'by', 'his', 'sag', 'ging', 'pants', 'the', 'shirt', 'should', 'say', 'lynch', 'vs', '.', 'belt', 'https', '://', 't', '.', 'co', '/', 'ml', 'h1', 'i', '30', 'l', 'zz'] 

The tokenization results of the second tweet are:
 ['daughter', 'of', 'fallen', 'navy