In [64]:
import re
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
  }

def transform_to_dataset(tagged_sentences):
  X, y = [], []
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [],[]
    for index in range(len(sentence)):
        sent_word_features.append(extract_features(sentence, index)),
        sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X, y

In [65]:
#This cell loads the Penn Treebank corpus from nltk into a list variable named penn_treebank.

#No need to install nltk in google colab since it is preloaded in the environments.
#!pip install nltk
# import nltk
# nltk.download('treebank')

# #Ensure that the treebank corpus is downloaded

# #Load the treebank corpus class
# from nltk.corpus import treebank

# #Now we iterate over all samples from the corpus (the fileids - that are equivalent to sentences) 
# #and retrieve the word and the pre-labeled PoS tag. This will be added as a list of tuples with 
# #a list of words and a list of their respective PoS tags (in the same order).
# penn_treebank = []
# for fileid in treebank.fileids():
#   tokens = []
#   tags = []
#   for word, tag in treebank.tagged_words(fileid):
#     tokens.append(word)
#     tags.append(tag)
#   penn_treebank.append((tokens, tags))

In [66]:
from pathlib import Path
data_dir = Path("../../data/train_50M_multimodal_clean/")
paths = [str(f) for f in data_dir.glob("*") if f.is_file() and not f.name.endswith(".DS_Store") and f.suffix in [".train"]]
print("Paths: ", paths)

Paths:  ['../../data/train_50M_multimodal_clean/open_subtitles.train', '../../data/train_50M_multimodal_clean/childes.train', '../../data/train_50M_multimodal_clean/cc_3M_captions_reduced.train', '../../data/train_50M_multimodal_clean/bnc_spoken.train', '../../data/train_50M_multimodal_clean/gutenberg.train', '../../data/train_50M_multimodal_clean/simple_wiki.train', '../../data/train_50M_multimodal_clean/switchboard.train', '../../data/train_50M_multimodal_clean/local_narr_captions.train']


In [67]:
from nltk.tag import pos_tag
from pathlib import Path

def process_text_file(file_path):
    words = []
    tags = []
    f = open(file_path, 'rb')
    n = sum(1 for _ in f)  # count the number of lines in the file
    print("Total lines in file: ", n)
    f.close()
    pattern = r"\b\w+(?:'\w+)?\b|\b\w+(?:-\w+)*\b|\d+(?:\.\d+)?|\S"  # Only consider the words.
    k = 0
    sentences_list = []
    with open(file_path, 'r') as file:
        for line in file:
            sentence = line.strip()

            # Split the sentence using the refined regex pattern
            tokens = re.findall(pattern, sentence)
            tagged_sentence = pos_tag(tokens)
            # print(tagged_sentence)
            
            for word, tag in tagged_sentence:
                words.append(word)
                tags.append(tag)
            k += 1
            sentences_list.append((words, tags))
            
            print("Completed line {0} out of {1}".format(k, n), end="\r")
    return sentences_list

In [69]:
# Example usage:
global_list = []
for file_path in paths:
    # file_path = paths[i]
    print("File path: ", file_path)
    result = process_text_file(file_path)
    global_list.extend(result)
    print("Global list: ",global_list)

# global_list_set = set(global_list)  # Remove duplicates

File path:  ../../data/train_50M_multimodal_clean/open_subtitles.train
Total lines in file:  1734740
Global list:  [(['I', 'cry', 'as', 'I', 'look', 'up', 'to', 'the', 'sky', 'Where', 'our', 'promise', 'shines', 'brightly', 'Try', '!', 'Comforting', 'words', 'are', 'like', 'breezes', 'that', "don't", 'reach', 'my', 'heart', 'My', 'misgivings', 'grow', 'But', 'I', 'make', 'my', 'own', 'decisions', 'so', 'I', 'will', 'probably', 'be', 'all', 'right', 'on', 'my', 'own', 'Even', 'if', 'I', 'stumble', ',', 'even', 'if', 'I', 'get', 'lost', ',', "there's", 'always', 'tomorrow', 'd', 'You', 'can', 'do', 'what', 'you', 'wanna', 'do', 'd', 'd', 'In', 'living', 'colord'], ['PRP', 'VBP', 'IN', 'PRP', 'VBP', 'RB', 'TO', 'DT', 'NN', 'WRB', 'PRP
, 'NN', 'NNS', 'RB', 'VB', '.', 'VBG', 'NNS', 'VBP', 'IN', 'NNS', 'WDT', 'VBP', 'VB', 'PRP
, 'NN', 'PRP
, 'NNS', 'VB', 'CC', 'PRP', 'VBP', 'PRP
, 'JJ', 'NNS', 'RB', 'PRP', 'MD', 'RB', 'VB', 'RB', 'RB', 'IN', 'PRP
, 'JJ', 'RB', 'IN', 'PRP', 'VBP', ',', 'RB', 

In [71]:
global_list[:2]

[(['I',
   'cry',
   'as',
   'I',
   'look',
   'up',
   'to',
   'the',
   'sky',
   'Where',
   'our',
   'promise',
   'shines',
   'brightly',
   'Try',
   '!',
   'Comforting',
   'words',
   'are',
   'like',
   'breezes',
   'that',
   "don't",
   'reach',
   'my',
   'heart',
   'My',
   'misgivings',
   'grow',
   'But',
   'I',
   'make',
   'my',
   'own',
   'decisions',
   'so',
   'I',
   'will',
   'probably',
   'be',
   'all',
   'right',
   'on',
   'my',
   'own',
   'Even',
   'if',
   'I',
   'stumble',
   ',',
   'even',
   'if',
   'I',
   'get',
   'lost',
   ',',
   "there's",
   'always',
   'tomorrow',
   'd',
   'You',
   'can',
   'do',
   'what',
   'you',
   'wanna',
   'do',
   'd',
   'd',
   'In',
   'living',
   'colord'],
  ['PRP',
   'VBP',
   'IN',
   'PRP',
   'VBP',
   'RB',
   'TO',
   'DT',
   'NN',
   'WRB',
   'PRP$',
   'NN',
   'NNS',
   'RB',
   'VB',
   '.',
   'VBG',
   'NNS',
   'VBP',
   'IN',
   'NNS',
   'WDT',
   'VBP',
   'VB',
  

In [74]:
import pandas as pd

df = pd.DataFrame(global_list, columns=["Word", "Tag"])
df.to_csv("pos_tagging_dataset_with_duplicates.csv", index=False, compression='gzip')

In [76]:
test = pd.read_csv("pos_tagging_dataset_with_duplicates_test.csv")

In [81]:
import ast
ast.literal_eval(test['Word'][0])[1]

'cry'

# Train a tagger using the saved training data.

In [1]:
import pandas as pd
import re
import nltk

In [2]:
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
  }

def transform_to_dataset(tagged_sentences):
  X, y = [], []
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [],[]
    for index in range(len(sentence)):
        sent_word_features.append(extract_features(sentence, index)),
        sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X, y

In [56]:
# Load the dataset.
with_duplicates = False
if with_duplicates:
    df = pd.read_csv("pos_tagging_dataset_with_duplicates.csv", compression='gzip')
else:
    df = pd.read_csv("pos_tagging_dataset_no_duplicates.csv", compression='gzip')

In [57]:
# testing, will delete this cell.
print(df["Tag"].value_counts())

Tag
NNP     206205
NN      121238
JJ       50827
NNS      37642
VB       23447
VBP      16526
VBN      15223
RB       13977
VBZ      13889
VBD      13429
CD       11658
VBG      10499
IN        5359
SYM       4021
NNPS      3278
FW        2490
JJR       1743
CC        1562
RP        1146
JJS       1010
PRP        878
PDT        771
MD         740
RBR        662
UH         639
DT         613
EX         372
$          357
WDT        338
WP         314
''         306
WRB        270
PRP$       248
RBS        169
LS         114
POS         52
TO          14
``           3
:            3
.            3
(            2
)            2
WP$          1
#            1
,            1
Name: count, dtype: int64


In [58]:
# Remove all the $ tags because they do NOT exist in the PENN Treebank dataset.
df = df[~df["Tag"].isin(["$"])]

In [59]:
# From the dataset, remove all the elements where the TAG is a punctuation.
searchfor = ['.', ',', '!', '?', ':', ';', '(', ')', '[', ']', '{', '}', '<', '>', '"', "'", "``", "''", "#"]
#  '#', '$', '&', '*', '+', '-', '/', '<', '=', '>', '@', '^', '_', '`', '|', '~',
df["TrueFalse"] = df['Tag'].apply(lambda x: 1 if any(i in x for i in searchfor) else 0)
# df_no_puncts = df[~df["Tag"].str.contains('|'.join(searchFor))]

In [60]:
# Remove all the rows where the TrueFalse column is 1.
df_no_puncts = df[df["TrueFalse"] == 0]

In [61]:
print(df_no_puncts["Tag"].value_counts())
len(df_no_puncts["Tag"].value_counts())  # The count must be 36 according to this link: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

Tag
NNP     206205
NN      121238
JJ       50827
NNS      37642
VB       23447
VBP      16526
VBN      15223
RB       13977
VBZ      13889
VBD      13429
CD       11658
VBG      10499
IN        5359
SYM       4021
NNPS      3278
FW        2490
JJR       1743
CC        1562
RP        1146
JJS       1010
PRP        878
PDT        771
MD         740
RBR        662
UH         639
DT         613
EX         372
WDT        338
WP         314
WRB        270
PRP$       248
RBS        169
LS         114
POS         52
TO          14
WP$          1
Name: count, dtype: int64


36

In [63]:
df_no_puncts.head()

Unnamed: 0,Word,Tag,TrueFalse
0,Kar'sene,NNP,0
1,subtracted,VBD,0
2,foolish,NN,0
3,llgotallythe,NN,0
4,comforted,JJ,0


In [62]:
# Create train test split.
train_size = int(0.8*len(df_no_puncts))
training = df_no_puncts[:train_size]
testing = df_no_puncts[train_size:]
X_train, y_train = transform_to_dataset(training)
X_test, y_test = transform_to_dataset(testing)

ValueError: too many values to unpack (expected 2)

In [19]:
pos_tag(['I','am','going','to','school', '.'])

[('I', 'PRP'),
 ('am', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('school', 'NN'),
 ('.', '.')]

In [32]:
# Refined regex pattern
text = "Here's an example sentence: with numbers 123 and punctuations, hyphens - and more! high-speed and it's 3.14 and example@example.com"

# Refined regex pattern
pattern = r"[A-Za-z0-9]+(?:'[A-Za-z]+)?|[A-Za-z]+(?:-[A-Za-z]+)*|[0-9]+(?:\.[0-9]+)?|[^\w\s]"

# Split the sentence using the refined regex pattern
tokens = re.findall(pattern, text)

In [40]:
features = [extract_features(tokens, i) for i in range(len(tokens))]

In [41]:
features[-2]

{'word': '.',
 'is_first': False,
 'is_last': False,
 'is_capitalized': True,
 'is_all_caps': True,
 'is_all_lower': True,
 'is_alphanumeric': 0,
 'prefix-1': '.',
 'prefix-2': '.',
 'prefix-3': '.',
 'suffix-1': '.',
 'suffix-2': '.',
 'suffix-3': '.',
 'prev_word': 'example',
 'next_word': '',
 'has_hyphen': False,
 'is_numeric': False,
 'capitals_inside': False,
 'punctuation': True}

In [33]:
tokens

["Here's",
 'an',
 'example',
 'sentence',
 ':',
 'with',
 'numbers',
 '123',
 'and',
 'punctuations',
 ',',
 'hyphens',
 '-',
 'and',
 'more',
 '!',
 'high',
 '-',
 'speed',
 'and',
 "it's",
 '3',
 '.',
 '14',
 'and',
 'example',
 '@',
 'example',
 '.',
 'com']