In [88]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [131]:
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /Users/yijun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yijun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [132]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [133]:
train= pd.read_csv('train.csv')
test= pd.read_csv("test.csv")

In [134]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

# proprocessing without optimization

In [135]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

# fix sentence length
def trunc_padding(sentence):
    modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))

In [136]:
%lprun -f processing processing(train, stopwords, lemma)

Timer unit: 1e-06 s

Total time: 2.91044 s
File: /var/folders/tp/8cvkqhbd14x93r7vzb73wjw40000gn/T/ipykernel_14986/3006866641.py
Function: processing at line 45

Line #      Hits         Time  Per Hit   % Time  Line Contents
    45                                           def processing(df, stopwords, lemma):
    46         1      12039.0  12039.0      0.4      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    47         1     274281.0 274281.0      9.4      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    48                                               # tokenization
    49         1      26956.0  26956.0      0.9      df['text'] = df['text'].apply(lambda sentence: sentence.split())
    50                                               # remove stopwords
    51         1     496703.0 496703.0     17.1      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    52                                          

In [137]:
%lprun -f processing processing(test, stopwords, lemma)

Timer unit: 1e-06 s

Total time: 1.03425 s
File: /var/folders/tp/8cvkqhbd14x93r7vzb73wjw40000gn/T/ipykernel_14986/3006866641.py
Function: processing at line 45

Line #      Hits         Time  Per Hit   % Time  Line Contents
    45                                           def processing(df, stopwords, lemma):
    46         1       4323.0   4323.0      0.4      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    47         1     121556.0 121556.0     11.8      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    48                                               # tokenization
    49         1      12452.0  12452.0      1.2      df['text'] = df['text'].apply(lambda sentence: sentence.split())
    50                                               # remove stopwords
    51         1     197353.0 197353.0     19.1      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    52                                          

# Preprocessing with itertools

Optimzation: 
1. Combine function clean_sentence,remove_stopwords,lemmatize to reduce function call overhead 
2. Use itertools to reduce memory usage:
    1) Using itertools.filterfalse to efficiently remove stopwords from the tokenized sentence
    2) Using itertools.chain.from_iterable and itertools.islice to efficiently fix sentence length by padding or truncating the sentence.

In [138]:
train= pd.read_csv('train.csv')
test= pd.read_csv("test.csv")

In [139]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import itertools

nltk.download('stopwords')
nltk.download('wordnet')


def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

stopwords = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

# combine func clean_sentence,remove_stopwords,lemmatize 
def clean_sentence(sentence): 
    sentence = re.sub(r'http\S+', ' ', sentence) # remove urls
    sentence = remove_emoji(sentence) # remove emojis
    sentence = re.sub("[^0-9A-Za-z ]", "", sentence) # remove punctuations
    sentence = sentence.replace('  ',"") # remove double spaces
    sentence = sentence.lower().split() # lowercase and tokenize
    sentence = itertools.filterfalse(lambda x: x in stopwords, sentence) # remove stopwords
    sentence = list(itertools.chain.from_iterable(map(lambda x: [lemma.lemmatize(x, pos='v')], sentence))) # lemmatize
    return sentence

#fix sentence length
def trunc_padding(sentence):
    sentence = itertools.islice(itertools.chain(sentence, itertools.repeat('0')), 0, MAX_LENGTH) # fix sentence length
    return list(sentence)


def processing(df):
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    df['length'] = df['text'].apply(lambda x: len(x))
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    df['length_padding'] = MAX_LENGTH



[nltk_data] Downloading package stopwords to /Users/yijun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yijun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [140]:
%lprun -f processing processing(train)

Timer unit: 1e-06 s

Total time: 2.31342 s
File: /var/folders/tp/8cvkqhbd14x93r7vzb73wjw40000gn/T/ipykernel_14986/1653265414.py
Function: processing at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                           def processing(df):
    40         1    2275029.0 2275029.0     98.3      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    41         1       6544.0   6544.0      0.3      df['length'] = df['text'].apply(lambda x: len(x))
    42         1      31165.0  31165.0      1.3      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    43         1        679.0    679.0      0.0      df['length_padding'] = MAX_LENGTH

In [141]:
%lprun -f processing processing(test)

Timer unit: 1e-06 s

Total time: 1.00814 s
File: /var/folders/tp/8cvkqhbd14x93r7vzb73wjw40000gn/T/ipykernel_14986/1653265414.py
Function: processing at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                           def processing(df):
    40         1     975215.0 975215.0     96.7      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    41         1       3575.0   3575.0      0.4      df['length'] = df['text'].apply(lambda x: len(x))
    42         1      28569.0  28569.0      2.8      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    43         1        776.0    776.0      0.1      df['length_padding'] = MAX_LENGTH

# Comparison
processing training data without optimization takes:  2.91044 s <br>
processing training data with itertool optimization takes:  2.31342 s <br>
The speed up is : 2.91044 / 2.31342 ≈ 1.26 <br><br>
processing testing data without optimization takes:  1.03425 s <br>
processing testing data with itertool optimization takes:  1.00814 s  <br>
The speed up is : 1.03425 / 1.00814 ≈ 1.025<br>
