# Accessing Dataset & Project Checkpoint

### Applying Tokenization and Text Representation Methods

In [1]:
import os

from functions_variables import *

# Notebook settings
sample_size = 1  # second argument handed to view_apply_function in the preview cells
limit = 10  # row cap for the (currently commented-out) split limiting; 25000 for the full dataset

In [2]:
# Load the 'imdb' dataset; the shape printed below lists three splits
# (train, test, unsupervised), each with two columns.
ds = load_dataset('imdb')
ds.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

In [3]:
# Limit the dataset samples (currently disabled, so every split keeps its
# full size, as the shape printed below shows). Uncomment the loop to keep
# only the first `limit` rows of each split named in `set_names`.
# for set_name in set_names:
#     ds[set_name] = ds[set_name].select(range(limit))
ds.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

In [4]:
# Display a DatasetDict rebuilt from every split listed in `set_names`
# (train, test and unsupervised in the output below)
DatasetDict({split: ds[split] for split in set_names})

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
# Checking the first n rows of the train dataset before any preprocessing
# (the printed review still contains raw `<br />` tags).
# NOTE(review): unlike the later calls, no sample size or function is passed
# here, so the helper's defaults decide what is shown — confirm against
# functions_variables.
view_apply_function(ds)


train DataSet:
 {'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes 

### Basic Text Preprocessing (NLP)

In [6]:
# Removing HTML tags
# Passes remove_html_tags to view_apply_function and prints a sample; in the
# printed review the `<br />` tags of the raw text no longer appear.
view_apply_function(ds, sample_size, fun=remove_html_tags)


train DataSet:
 {'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.  The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.  What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betw

In [7]:
# Removing URLs
# The visible part of the printed sample shows no URL being removed.
# NOTE(review): the output already has the HTML tags stripped, which suggests
# view_apply_function keeps each step's result in `ds`, making these cells
# order-dependent — confirm in functions_variables.
view_apply_function(ds, sample_size, fun=remove_urls)


train DataSet:
 {'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.  The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.  What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betw

In [8]:
# Removing punctuation
# In the printed sample punctuation marks show up as spaces
# (e.g. "CURIOUS-YELLOW" -> "CURIOUS YELLOW", "U.S." -> "U S ").
view_apply_function(ds, sample_size, fun=remove_punctuation)


train DataSet:
 {'text': ['I rented I AM CURIOUS YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967  I also heard that at first it was seized by U S  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  I really had to see this for myself   The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life  In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States  In between asking politicians and ordinary denizens of Stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men   What kills me about I AM CURIOUS YELLOW is that 40 years ago  this was considered pornographic  Really  the sex and nudity scenes are few and far betw

In [9]:
# Chat words conversion
# Passes preprocess_chat_text to view_apply_function; the visible part of the
# printed sample is unchanged by this step.
# NOTE(review): the sample is printed without punctuation, so the earlier
# steps appear to have been persisted in `ds` — confirm.
view_apply_function(ds, sample_size, fun=preprocess_chat_text)


train DataSet:
 {'text': ['I rented I AM CURIOUS YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967  I also heard that at first it was seized by U S  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  I really had to see this for myself   The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life  In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States  In between asking politicians and ordinary denizens of Stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men   What kills me about I AM CURIOUS YELLOW is that 40 years ago  this was considered pornographic  Really  the sex and nudity scenes are few and far betw

In [10]:
# Removing stopwords
# view_apply_function(ds, sample_size, fun=remove_stopwords)

In [11]:
# Emoji removal
# Passes remove_emojis to view_apply_function; the visible part of the
# printed sample is unchanged by this step.
view_apply_function(ds, sample_size, fun=remove_emojis)


train DataSet:
 {'text': ['I rented I AM CURIOUS YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967  I also heard that at first it was seized by U S  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  I really had to see this for myself   The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life  In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States  In between asking politicians and ordinary denizens of Stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men   What kills me about I AM CURIOUS YELLOW is that 40 years ago  this was considered pornographic  Really  the sex and nudity scenes are few and far betw

In [12]:
# Convert to lowercase
# Passes to_lower_case to view_apply_function; the printed sample is fully
# lower-cased (e.g. "I AM CURIOUS YELLOW" -> "i am curious yellow").
view_apply_function(ds, sample_size, fun=to_lower_case)


train DataSet:
 {'text': ['i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967  i also heard that at first it was seized by u s  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  i really had to see this for myself   the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life  in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states  in between asking politicians and ordinary denizens of stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men   what kills me about i am curious yellow is that 40 years ago  this was considered pornographic  really  the sex and nudity scenes are few and far betw

In [13]:
# Spelling correction
# view_apply_function(ds, sample_size, fun=correct_spelling)

In [14]:
# Tokenization
# Apply the tokenizer using `map`
# view_apply_function(ds, sample_size, tokenize_text)

In [15]:
# Tokenize text using regular expressions
# view_apply_function(ds, sample_size, tokenize_regex)

In [16]:
# Save the preprocessed data: one CSV file per split under `path`
path = '../data/preprocessed/'
# Create the target folder up front so the export does not depend on the
# directory already existing (e.g. on a fresh checkout).
os.makedirs(path, exist_ok=True)
for set_name in set_names:
    # Build the file name once and reuse it for the export and the message.
    # This also avoids reusing the f-string's own quote character inside a
    # replacement field, which is a SyntaxError before Python 3.12.
    csv_path = f'{path}{set_name}.csv'
    ds[set_name].to_csv(csv_path, index=False)
    print(f"Dataset saved successfully to '{csv_path}'")

Creating CSV from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Dataset saved successfully to '../data/preprocessed/train.csv'


Creating CSV from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Dataset saved successfully to '../data/preprocessed/test.csv'


Creating CSV from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Dataset saved successfully to '../data/preprocessed/unsupervised.csv'


In [17]:
# Load the data from the file (currently disabled).
# NOTE(review): `file_out` is not defined in any visible cell of this
# notebook; it may come from functions_variables — confirm before re-enabling.
# ds = load_dataset(file_out)
# print(f'Dataset loaded successfully from \'{file_out}\'')

# Shape of the in-memory dataset; nothing was reloaded from disk above.
ds.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

In [18]:
# # Stemming
# stemmer = PorterStemmer()
# print(stemmer.stem('running'))
# print(stemmer.stem('easily'))

In [19]:
# # Lemmatization
# lemmatizer = WordNetLemmatizer()
# print(lemmatizer.lemmatize('running', pos='v'))
# print(lemmatizer.lemmatize('better', pos='a'))

In [20]:
# # Sentence segmentation
# text = 'This is a sentence. This is another sentence.'
# sentences = nltk.sent_tokenize(text)
# print(sentences)