In [1]:
# Basic Python package
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string 
import re

In [2]:
# Read the train and test splits of the dataset (tab-separated files).
# NOTE(review): these are absolute paths on one machine; anyone else running
# the notebook has to adjust them.
train_file_path = r'C:\Users\prits\Downloads\Data\ghc_train.tsv'
test_file_path = r'C:\Users\prits\Downloads\Data\ghc_test.tsv'

# Both files are parsed the same way, so read them in one pass
train_df, test_df = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_file_path, test_file_path)
]

In [3]:
# List the column labels of the training frame
train_df.columns

Index(['text', 'hd', 'cv', 'vo'], dtype='object')

In [4]:
# Preview the first 10 rows of the training data
train_df.head(10)

Unnamed: 0,text,hd,cv,vo
0,He most likely converted to islam due to his n...,0,0,0
1,So Ford lied about being a psychologist. Recor...,0,0,0
2,Jobs. Education. Ending abuse of Nation. CA43.,0,0,0
3,"I share a lot of your values, & like many who ...",0,0,0
4,I am so ready to get back to blogging! www.ben...,0,0,0
5,taking a look at new opportunity called FX Pro...,0,0,0
6,Reflecting back when I was in school with Spec...,0,0,0
7,"Let's be honest everyone, last year there were...",0,0,0
8,2007 Nuke plant in Syria,0,0,0
9,NBC's Chuck Todd Thinks He's Figured It All Ou...,0,0,0


In [5]:
# Count the rows of the training data that pandas flags as duplicates
train_df.duplicated().sum()

72

In [6]:
# Remove the duplicates, keeping just the first occurrence of each row.
# train_df is rebound to the de-duplicated frame for all later cells.
train_df = train_df.drop_duplicates()

In [7]:
pip install pandas nltk


Note: you may need to restart the kernel to use updated packages.


# WHITESPACE TOKENISATION


Whitespace tokenization is a simple method of breaking text into tokens at whitespace characters such as spaces, tabs, and newlines. It is straightforward and is often used as a preliminary step in text-processing tasks.

In [29]:
#whitespacetokenisation
from nltk.tokenize import WhitespaceTokenizer

# One shared tokenizer instance, created once and reused for every row
whitespace_tokenizer = WhitespaceTokenizer()


def whitespace_tokenize(text):
    """Tokenize ``text`` with the shared ``whitespace_tokenizer``.

    Returns whatever ``WhitespaceTokenizer.tokenize`` produces for ``text``.
    """
    tokens = whitespace_tokenizer.tokenize(text)
    return tokens
# Tokenize every row of the 'text' column and store the result in a 'tokens' column
train_df['tokens'] = train_df['text'].apply(whitespace_tokenize)
# Display the first rows to inspect the new column
train_df.head()


Unnamed: 0,text,hd,cv,vo,tokens
0,He most likely converted to islam due to his n...,0,0,0,"[He, most, likely, converted, to, islam, due, ..."
1,So Ford lied about being a psychologist. Recor...,0,0,0,"[So, Ford, lied, about, being, a, psychologist..."
2,Jobs. Education. Ending abuse of Nation. CA43.,0,0,0,"[Jobs., Education., Ending, abuse, of, Nation...."
3,"I share a lot of your values, & like many who ...",0,0,0,"[I, share, a, lot, of, your, values,, &, like,..."
4,I am so ready to get back to blogging! www.ben...,0,0,0,"[I, am, so, ready, to, get, back, to, blogging..."


# Punctuation Tokenisation

Punctuation tokenization, also known as punctuation-aware tokenization, is a method of splitting text into tokens while taking punctuation marks into account. Unlike simple whitespace tokenization, it treats punctuation marks as separate tokens rather than leaving them attached to words.

In [9]:
# Punctuation tokenisation
import pandas as pd
import nltk
# Fetch the NLTK 'punkt' resource (the download is skipped when it is already present).
# NOTE(review): the RegexpTokenizer used in this cell is regex-based and does not
# look like it needs 'punkt'; confirm whether another cell relies on it.
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer


# Tokenizer whose pattern matches either a run of word characters (\w+) or a
# run of characters that are neither word characters nor whitespace ([^\w\s]+)
punctuation_tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')


def punctuation_tokenize(text):
    """Tokenize ``text`` with the shared ``punctuation_tokenizer``.

    Returns whatever ``RegexpTokenizer.tokenize`` produces for ``text``, so
    punctuation runs come back as tokens of their own instead of staying
    attached to the neighbouring word.
    """
    tokens = punctuation_tokenizer.tokenize(text)
    return tokens

# Apply the punctuation_tokenize function to the 'text' column.
# Note: this overwrites the 'tokens' column written by the whitespace
# tokenisation cell, so train_df only keeps the punctuation-aware tokens.
train_df['tokens'] = train_df['text'].apply(punctuation_tokenize)

# Keep only the text and tokens columns so they can be compared side by side
result_df = train_df[['text', 'tokens']]

print(result_df)

                                                    text  \
0      He most likely converted to islam due to his n...   
1      So Ford lied about being a psychologist. Recor...   
2         Jobs. Education. Ending abuse of Nation. CA43.   
3      I share a lot of your values, & like many who ...   
4      I am so ready to get back to blogging! www.ben...   
...                                                  ...   
22031  I'm a fan of western civilization, and one bed...   
22032  Or ... is she saying that Muslims don't know h...   
22033  Thank you to all my followers that follow me e...   
22034  Wednesday music. https://www.youtube.com/watch...   
22035  This is a really Big Surprise!  https://www.wn...   

                                                  tokens  
0      [He, most, likely, converted, to, islam, due, ...  
1      [So, Ford, lied, about, being, a, psychologist...  
2      [Jobs, ., Education, ., Ending, abuse, of, Nat...  
3      [I, share, a, lot, of, your, values,

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prits\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# SUBWORD TOKENISATION


Subword tokenization is a process in which words are broken down into smaller units. This helps handle out-of-vocabulary words and capture subword-level information.

In [23]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from tokenizers.processors import BertProcessing
import pandas as pd

# Assuming train_df is already defined and loaded
texts = train_df['text'].values

# `mask` is not defined by any cell of this notebook, so this line raised a
# NameError on a fresh kernel. Define it explicitly: keep only the rows whose
# text is present, because the tokenizer can only train on and encode strings.
# NOTE(review): confirm this is the filter that was originally intended.
mask = train_df['text'].notna().values
texts = texts[mask]


# Define a function to train a subword tokenizer
def train_bpe_tokenizer(texts, vocab_size=5000, min_frequency=2):
    """Train a byte-pair-encoding (BPE) subword tokenizer on ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Corpus handed to ``Tokenizer.train_from_iterator``.
    vocab_size : int, default 5000
        Target vocabulary size passed to ``BpeTrainer``.
    min_frequency : int, default 2
        Minimum pair frequency passed to ``BpeTrainer``.

    Returns
    -------
    tokenizers.Tokenizer
        The trained tokenizer, configured to wrap every encoding as
        ``<s> ... </s>``.
    """
    unk_token = "<unk>"

    # Register the unknown token on the model itself. Listing "<unk>" only as a
    # trainer special token puts it in the vocabulary, but the model is never
    # told to emit it for symbols it did not see during training.
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[unk_token, "<s>", "</s>", "<pad>", "<mask>"],
    )
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # BertProcessing takes (sep, cls): "</s>" closes and "<s>" opens each encoding.
    # The ids must be looked up after training, once the vocabulary exists.
    tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    # NOTE(review): a ByteLevel decoder is paired here with a Whitespace
    # pre-tokenizer rather than a ByteLevel one; this only matters for
    # tokenizer.decode(), which this notebook does not call. Confirm before
    # relying on decoded text.
    tokenizer.decoder = decoders.ByteLevel()

    return tokenizer

# Train the BPE tokenizer on the texts prepared earlier in this cell
tokenizer = train_bpe_tokenizer(texts)

# Tokenize texts using the trained tokenizer
def tokenize_texts(texts, tokenizer):
    """Encode each text with ``tokenizer`` and collect the token strings.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode, in order.
    tokenizer : object
        Anything whose ``encode(text)`` returns an object with a ``tokens``
        attribute.

    Returns
    -------
    list
        One ``tokens`` value per input text, in input order.
    """
    tokenized_texts = []
    for text in texts:
        encoding = tokenizer.encode(text)
        tokenized_texts.append(encoding.tokens)
    return tokenized_texts

# Encode every text in the corpus with the trained tokenizer
tokenized_texts = tokenize_texts(texts, tokenizer)

# Show the token lists of the first five texts
print(tokenized_texts[:5])


[['<s>', 'He', 'most', 'likely', 'con', 'ver', 'ted', 'to', 'islam', 'due', 'to', 'his', 'nature', 'being', 'suit', 'able', 'for', 'islam', 'ic', 'doctr', 'ine', '.', '"', 'Pro', 'phe', 't', '"', 'Mu', 'ham', 'mad', 'was', 'a', 'psych', 'op', 'ath', '.', '</s>'], ['<s>', 'So', 'Ford', 'lied', 'about', 'being', 'a', 'psych', 'olog', 'ist', '.', 'Re', 'cord', 's', 'seem', 'to', 'ind', 'ic', 'ate', 'she', 'was', 'just', 'a', 'student', ',', 'no', 'work', '.', '</s>'], ['<s>', 'J', 'ob', 's', '.', 'E', 'du', 'cation', '.', 'En', 'ding', 'abuse', 'of', 'Nation', '.', 'CA', '43', '.', '</s>'], ['<s>', 'I', 'share', 'a', 'lot', 'of', 'your', 'values', ',', '&', 'like', 'many', 'who', 'do', ',', 'I', 'don', "'", 't', 'call', 'myself', 'alt', 'right', ';', 'I', "'", 'm', 'a', 'national', 'ist', ',', '&', 'not', 'civ', 'ic', '.', 'I', "'", 'd', 'always', 'thought', "'", 'alt', 'right', "'", 'is', 'an', 'um', 'bre', 'll', 'a', 'term', 'th', 'o', ',', 'where', 'many', 'are', 'really', 'alt', 'l', 

# CHARACTER TOKENISATION

Character tokenization is a method of breaking down text into individual characters, rather than words or subwords. This technique is particularly useful for languages with complex morphology, or in cases where handling out-of-vocabulary words is important. Character tokenization allows models to learn representations at the character level, which can capture finer-grained information and can be useful in various NLP tasks like language modeling, text generation, and spelling correction.

In [26]:
# Character tokenisation
import pandas as pd

# Raw values of the 'text' column; this rebinds the `texts` name that the
# subword tokenisation cell also assigns
texts = train_df['text'].values

# Define a function to perform character tokenization
def character_tokenization(texts):
    """Split every text in ``texts`` into a list of its individual characters.

    Parameters
    ----------
    texts : iterable of str
        Texts to split, in order.

    Returns
    -------
    list of list of str
        One list of single-character strings per input text; whitespace and
        punctuation characters are kept as tokens.
    """
    return [list(text) for text in texts]
# Tokenize the texts; this rebinds `tokenized_texts`, replacing the subword
# result stored under the same name
tokenized_texts = character_tokenization(texts)

# Show the character lists of the first five texts
print(tokenized_texts[:5])

[['H', 'e', ' ', 'm', 'o', 's', 't', ' ', 'l', 'i', 'k', 'e', 'l', 'y', ' ', 'c', 'o', 'n', 'v', 'e', 'r', 't', 'e', 'd', ' ', 't', 'o', ' ', 'i', 's', 'l', 'a', 'm', ' ', 'd', 'u', 'e', ' ', 't', 'o', ' ', 'h', 'i', 's', ' ', 'n', 'a', 't', 'u', 'r', 'e', ' ', 'b', 'e', 'i', 'n', 'g', ' ', 's', 'u', 'i', 't', 'a', 'b', 'l', 'e', ' ', 'f', 'o', 'r', ' ', ' ', 'i', 's', 'l', 'a', 'm', 'i', 'c', ' ', 'd', 'o', 'c', 't', 'r', 'i', 'n', 'e', '.', ' ', '"', 'P', 'r', 'o', 'p', 'h', 'e', 't', '"', ' ', 'M', 'u', 'h', 'a', 'm', 'm', 'a', 'd', ' ', 'w', 'a', 's', ' ', 'a', ' ', 'p', 's', 'y', 'c', 'h', 'o', 'p', 'a', 't', 'h', '.'], ['S', 'o', ' ', 'F', 'o', 'r', 'd', ' ', 'l', 'i', 'e', 'd', ' ', 'a', 'b', 'o', 'u', 't', ' ', 'b', 'e', 'i', 'n', 'g', ' ', 'a', ' ', 'p', 's', 'y', 'c', 'h', 'o', 'l', 'o', 'g', 'i', 's', 't', '.', ' ', 'R', 'e', 'c', 'o', 'r', 'd', 's', ' ', 's', 'e', 'e', 'm', ' ', 't', 'o', ' ', 'i', 'n', 'd', 'i', 'c', 'a', 't', 'e', ' ', 's', 'h', 'e', ' ', 'w', 'a', 's', '