In [1]:
%load_ext Cython

In [2]:
!pip install line_profiler==3.3.1
%load_ext line_profiler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# ! unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/

In [5]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

In [6]:
seed_everything(42)

42

In [7]:
MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [8]:
train= pd.read_csv('/content/drive/MyDrive/Disaster-Tweets-Prediction-main/train.csv')
test= pd.read_csv("/content/drive/MyDrive/Disaster-Tweets-Prediction-main/test.csv")

In [9]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

# Preprocessing without optimization

In [10]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens



In [11]:
# fix sentence length
def trunc_padding(sentence):
    modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

In [12]:
stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))

In [13]:
%lprun -f processing processing(train, stopwords, lemma)

<pre>
Timer unit: 1e-06 s

Total time: 9.03075 s
File: <ipython-input-12-0655b95bc61f>
Function: processing at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     4                                           def processing(df, stopwords, lemma):
     5         1      10659.0  10659.0      0.1      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
     6         1     124528.0 124528.0      1.4      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
     7                                               # tokenization
     8         1      13497.0  13497.0      0.1      df['text'] = df['text'].apply(lambda sentence: sentence.split())
     9                                               # remove stopwords
    10         1     310121.0 310121.0      3.4      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    11                                               # lemmalization 
    12         1    8515944.0 8515944.0     94.3      df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    13                                               # sentence length before padding
    14         1      16401.0  16401.0      0.2      df['length'] = df['text'].apply(lambda x: len(x))
    15                                               # fix sentence length
    16         1      30916.0  30916.0      0.3      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    17                                               # sentence length after padding
    18         1       8683.0   8683.0      0.1      df['length_padding'] = df['text'].apply(lambda x: len(x))

In [14]:
%lprun -f processing processing(test, stopwords, lemma)

<pre>
Timer unit: 1e-06 s

Total time: 1.45814 s
File: <ipython-input-12-0655b95bc61f>
Function: processing at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     4                                           def processing(df, stopwords, lemma):
     5         1       7101.0   7101.0      0.5      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
     6         1      70602.0  70602.0      4.8      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
     7                                               # tokenization
     8         1      22389.0  22389.0      1.5      df['text'] = df['text'].apply(lambda sentence: sentence.split())
     9                                               # remove stopwords
    10         1     203591.0 203591.0     14.0      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    11                                               # lemmalization 
    12         1    1107045.0 1107045.0     75.9      df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    13                                               # sentence length before padding
    14         1       4772.0   4772.0      0.3      df['length'] = df['text'].apply(lambda x: len(x))
    15                                               # fix sentence length
    16         1      32201.0  32201.0      2.2      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    17                                               # sentence length after padding
    18         1      10436.0  10436.0      0.7      df['length_padding'] = df['text'].apply(lambda x: len(x))

# Preprocessing with Cython optimization

In [15]:
train= pd.read_csv('/content/drive/MyDrive/Disaster-Tweets-Prediction-main/train.csv')
test= pd.read_csv("/content/drive/MyDrive/Disaster-Tweets-Prediction-main/test.csv")

In [16]:
%%cython --annotate
# cython: linetrace=True
# cython: binding=True
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
# cython: profile=True
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")

cpdef seed_everything(int seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
cdef int MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
cpdef remove_emoji(str string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

cpdef clean_sentence(str sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

cpdef remove_stopwords(list tokens, list stopwords):
    cdef list clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
cdef lemmatize(list tokens, object lemma):
    cdef list lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

# fix sentence length
cpdef trunc_padding(list sentence):
    cdef list modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

cdef processing(object df, list stopwords, object lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
%reload_ext line_profiler
%lprun -f processing processing(train, stopwords, lemma)

<pre>
Timer unit: 1e-06 s

Total time: 3.28486 s
File: <ipython-input-12-0655b95bc61f>
Function: processing at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     4                                           def processing(df, stopwords, lemma):
     5         1      21043.0  21043.0      0.6      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
     6         1     300018.0 300018.0      9.1      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
     7                                               # tokenization
     8         1      63784.0  63784.0      1.9      df['text'] = df['text'].apply(lambda sentence: sentence.split())
     9                                               # remove stopwords
    10         1     407134.0 407134.0     12.4      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    11                                               # lemmalization 
    12         1    2454691.0 2454691.0     74.7      df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    13                                               # sentence length before padding
    14         1       7978.0   7978.0      0.2      df['length'] = df['text'].apply(lambda x: len(x))
    15                                               # fix sentence length
    16         1      21464.0  21464.0      0.7      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    17                                               # sentence length after padding
    18         1       8747.0   8747.0      0.3      df['length_padding'] = df['text'].apply(lambda x: len(x))

In [18]:
%reload_ext line_profiler
%lprun -f processing processing(test, stopwords, lemma)


<pre>Timer unit: 1e-06 s

Total time: 1.6507 s
File: <ipython-input-12-0655b95bc61f>
Function: processing at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     4                                           def processing(df, stopwords, lemma):
     5         1      10458.0  10458.0      0.6      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
     6         1      90952.0  90952.0      5.5      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
     7                                               # tokenization
     8         1      15359.0  15359.0      0.9      df['text'] = df['text'].apply(lambda sentence: sentence.split())
     9                                               # remove stopwords
    10         1     482046.0 482046.0     29.2      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    11                                               # lemmalization 
    12         1    1032163.0 1032163.0     62.5      df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    13                                               # sentence length before padding
    14         1       3638.0   3638.0      0.2      df['length'] = df['text'].apply(lambda x: len(x))
    15                                               # fix sentence length
    16         1      11533.0  11533.0      0.7      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    17                                               # sentence length after padding
    18         1       4556.0   4556.0      0.3      df['length_padding'] = df['text'].apply(lambda x: len(x))

# Comparison
processing training data without optimization takes:  9.03075 s <br>
processing training data with Cython optimization takes:  3.28486 s <br>
The speed up is : 9.03075 / 3.28486 ≈ 2.75 <br><br>
processing testing data without optimization takes:  1.45814 s s <br>
processing testing data with Cython optimization takes:  1.6507 s  s <br>
The speed up is : 1.45814 / 1.6507 ≈ 0.8834<br>
The possible reason for this is the size of test data is too small and using
Cython has overhead, so in this case directly processing it the python way is faster.

