In [1]:
%load_ext Cython

In [2]:
!pip install line_profiler==3.3.1
%load_ext line_profiler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
! unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/

Archive:  /root/nltk_data/corpora/wordnet.zip
   creating: /root/nltk_data/corpora/wordnet/
  inflating: /root/nltk_data/corpora/wordnet/lexnames  
  inflating: /root/nltk_data/corpora/wordnet/data.verb  
  inflating: /root/nltk_data/corpora/wordnet/index.adv  
  inflating: /root/nltk_data/corpora/wordnet/adv.exc  
  inflating: /root/nltk_data/corpora/wordnet/index.verb  
  inflating: /root/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /root/nltk_data/corpora/wordnet/data.adj  
  inflating: /root/nltk_data/corpora/wordnet/index.adj  
  inflating: /root/nltk_data/corpora/wordnet/LICENSE  
  inflating: /root/nltk_data/corpora/wordnet/citation.bib  
  inflating: /root/nltk_data/corpora/wordnet/noun.exc  
  inflating: /root/nltk_data/corpora/wordnet/verb.exc  
  inflating: /root/nltk_data/corpora/wordnet/README  
  inflating: /root/nltk_data/corpora/wordnet/index.sense  
  inflating: /root/nltk_data/corpora/wordnet/data.noun  
  inflating: /root/nltk_data/corpora/wordnet/data.adv  


In [4]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

In [5]:
seed_everything(42)

42

In [6]:
MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
train= pd.read_csv('/content/drive/MyDrive/Disaster-Tweets-Prediction-main/train.csv')
test= pd.read_csv("/content/drive/MyDrive/Disaster-Tweets-Prediction-main/test.csv")

In [8]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

# Preprocessing without optimization

In [9]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens



In [10]:
# fix sentence length
def trunc_padding(sentence):
    modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

In [11]:
stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))

In [12]:
%lprun -f processing processing(train, stopwords, lemma)

<pre>
Timer unit: 1e-06 s

Total time: 7.58081 s
File: <ipython-input-11-0655b95bc61f>
Function: processing at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     4                                           def processing(df, stopwords, lemma):
     5         1      10126.0  10126.0      0.1      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
     6         1     137293.0 137293.0      1.8      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
     7                                               # tokenization
     8         1      13126.0  13126.0      0.2      df['text'] = df['text'].apply(lambda sentence: sentence.split())
     9                                               # remove stopwords
    10         1     141971.0 141971.0      1.9      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    11                                               # lemmalization 
    12         1    7250607.0 7250607.0     95.6      df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    13                                               # sentence length before padding
    14         1       5602.0   5602.0      0.1      df['length'] = df['text'].apply(lambda x: len(x))
    15                                               # fix sentence length
    16         1      17704.0  17704.0      0.2      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    17                                               # sentence length after padding
    18         1       4379.0   4379.0      0.1      df['length_padding'] = df['text'].apply(lambda x: len(x))

In [13]:
%lprun -f processing processing(test, stopwords, lemma)

<pre>
Timer unit: 1e-06 s

Total time: 0.59442 s
File: <ipython-input-11-0655b95bc61f>
Function: processing at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     4                                           def processing(df, stopwords, lemma):
     5         1       5410.0   5410.0      0.9      df['text'] = df['text'].apply(lambda sentence: sentence.lower())
     6         1      52004.0  52004.0      8.7      df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
     7                                               # tokenization
     8         1       5951.0   5951.0      1.0      df['text'] = df['text'].apply(lambda sentence: sentence.split())
     9                                               # remove stopwords
    10         1      72079.0  72079.0     12.1      df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    11                                               # lemmalization 
    12         1     446276.0 446276.0     75.1      df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    13                                               # sentence length before padding
    14         1       2922.0   2922.0      0.5      df['length'] = df['text'].apply(lambda x: len(x))
    15                                               # fix sentence length
    16         1       6969.0   6969.0      1.2      df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    17                                               # sentence length after padding
    18         1       2809.0   2809.0      0.5      df['length_padding'] = df['text'].apply(lambda x: len(x))

# Preprocessing with Cython optimization

In [6]:
train= pd.read_csv('/content/drive/MyDrive/Disaster-Tweets-Prediction-main/train.csv')
test= pd.read_csv("/content/drive/MyDrive/Disaster-Tweets-Prediction-main/test.csv")

In [None]:
train['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [3]:
%%cython 
# cython: linetrace=True
# cython: binding=True
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
# cython: profile=True
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import itertools
nltk.download('stopwords')
nltk.download("wordnet")
cdef seed_everything(int seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
cdef int MAX_LENGTH = 30
cdef object device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

cdef clean_sentence(object sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    cdef object emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    sentence = emoji_pattern.sub(r' ', sentence)

    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

cdef remove_stopwords(list tokens, object stopwords):
    cdef list clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
cdef lemmatize(list tokens, object lemma):
    cdef list lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

cdef trunc_padding(list sentence):
    sentence = list(itertools.islice(itertools.chain(sentence, itertools.repeat('0')), 0, MAX_LENGTH)) # fix sentence length
    return sentence

cdef object stopwords = nltk.corpus.stopwords.words('english')
cdef object lemma = WordNetLemmatizer()

def processing(df: pd.DataFrame, object stopwords, object lemma):
    df['text'] = df['text'].map(lambda sentence: sentence.lower())
    df['text'] = df['text'].map(lambda sentence: clean_sentence(sentence))
    # tokenization 123123
    df['text'] = df['text'].map(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].map(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].map(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].map(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].map(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].map(lambda x: len(x))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download("wordnet")

stopwords = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
%lprun -f processing processing(train, stopwords, lemma)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<pre>
Timer unit: 1e-06 s

Total time: 6.21616 s
File: /root/.cache/ipython/cython/_cython_magic_8cec306dd1bca0d781798f8b4263af96.pyx
Function: processing at line 76

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    76                                           def processing(df: pd.DataFrame, object stopwords, object lemma):
    77         1      28608.0  28608.0      0.5      df['text'] = df['text'].map(lambda sentence: sentence.lower())
    78         1     585561.0 585561.0      9.4      df['text'] = df['text'].map(lambda sentence: clean_sentence(sentence))
    79                                               # tokenization 123123
    80         1      17020.0  17020.0      0.3      df['text'] = df['text'].map(lambda sentence: sentence.split())
    81                                               # remove stopwords
    82         1      20294.0  20294.0      0.3      df['text'] = df['text'].map(lambda sentence: remove_stopwords(sentence, stopwords))
    83                                               # lemmalization 
    84         1    5542541.0 5542541.0     89.2      df['text'] = df['text'].map(lambda sentence: lemmatize(sentence, lemma))
    85                                               # sentence length before padding
    86         1       4007.0   4007.0      0.1      df['length'] = df['text'].map(lambda x: len(x))
    87                                               # fix sentence length
    88         1      14625.0  14625.0      0.2      df['text'] = df['text'].map(lambda sentence: trunc_padding(sentence))
    89                                               # sentence length after padding
    90         1       3506.0   3506.0      0.1      df['length_padding'] = df['text'].map(lambda x: len(x))

In [8]:
%reload_ext line_profiler
%lprun -f processing processing(test, stopwords, lemma)


<pre>Timer unit: 1e-06 s

Total time: 0.517537 s
File: /root/.cache/ipython/cython/_cython_magic_8cec306dd1bca0d781798f8b4263af96.pyx
Function: processing at line 76

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    76                                           def processing(df: pd.DataFrame, object stopwords, object lemma):
    77         1       4426.0   4426.0      0.9      df['text'] = df['text'].map(lambda sentence: sentence.lower())
    78         1      40446.0  40446.0      7.8      df['text'] = df['text'].map(lambda sentence: clean_sentence(sentence))
    79                                               # tokenization 123123
    80         1       5095.0   5095.0      1.0      df['text'] = df['text'].map(lambda sentence: sentence.split())
    81                                               # remove stopwords
    82         1       6172.0   6172.0      1.2      df['text'] = df['text'].map(lambda sentence: remove_stopwords(sentence, stopwords))
    83                                               # lemmalization 
    84         1     451740.0 451740.0     87.3      df['text'] = df['text'].map(lambda sentence: lemmatize(sentence, lemma))
    85                                               # sentence length before padding
    86         1       1992.0   1992.0      0.4      df['length'] = df['text'].map(lambda x: len(x))
    87                                               # fix sentence length
    88         1       5861.0   5861.0      1.1      df['text'] = df['text'].map(lambda sentence: trunc_padding(sentence))
    89                                               # sentence length after padding
    90         1       1805.0   1805.0      0.3      df['length_padding'] = df['text'].map(lambda x: len(x))

# Comparison
processing training data without optimization takes:  7.58081 s <br>
processing training data with Cython optimization takes:  6.21616 s <br>
The speed up is : 7.58081 / 6.21616 ≈ 1.22 <br>
The relative speed up is 22 %
<br><br>
processing testing data without optimization takes:  0.59442 s <br>
processing testing data with Cython optimization takes:  0.517537 s <br>
The speed up is : 0.59442 / 0.517537 ≈ 1.15<br>
The relative speed up is 15 %

