In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd
from contractions import contractions_dict
from pattern.en import tag
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
wnl = WordNetLemmatizer()

In [3]:
stopwords_list = nltk.corpus.stopwords.words('english')

In [4]:
data = pd.read_csv('train.csv')

In [5]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
data.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [7]:
data.drop(['id','keyword','location'],axis = 1,inplace = True)

In [8]:
data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
def removed_url(text):
    """function for removed url"""
    url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
    filtered_text = re.sub(url_pattern,'',text)
    return filtered_text

In [10]:
removed_url_text = data['text'].apply(lambda x:removed_url(x))

In [11]:
def removed_html(text):
    """function for removed html"""
    html_pattern = re.compile(r'<.*?>')
    filtered_text = re.sub(html_pattern,'',text)
    return filtered_text

In [12]:
removed_html_text = removed_url_text.apply(lambda x:removed_html(x))

In [13]:
def removed_unwanted_characters(text):
    """function for removing forward_slash_followed_by_string between the text"""
    backslash_pattern = re.compile(r'(\r)+(\n)*(\r)+(\n)*($)(\w)+|(\r)+(\n)*(\r)+(\n)*(#)(\w)+|(#)(\w)+|(&)(\w)+|(@)(\w)+|($)(\w)+|[?/(\r)+(\n)*(#)*(_)*(\d)+]+')
    filtered_text = re.sub(backslash_pattern,' ', text)
    return filtered_text

In [14]:
removed_unwanted_characters_text = removed_html_text.apply(lambda x:removed_unwanted_characters(x))

In [15]:
def tokenized_text(text):
    """function for tokenized tweets text"""
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

In [16]:
def expand_contraction(text,contractions_dict):
    """function for removed the contraction words"""
    contraction_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                    flags = re.IGNORECASE|re.DOTALL)
    """function for expand match"""
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expand_contraction = contractions_dict.get(match) if contractions_dict.get(match) else contractions_dict.get(match.lower())
        
        #expanded_contraction = first_char + contraction[1:]
        return expand_contraction
    
    expanded_text = contraction_pattern.sub(expand_match,text)
    expanded_text = re.sub("'","",expanded_text)
    return expanded_text

In [17]:
expand_contraction_text = removed_unwanted_characters_text.apply(lambda x:expand_contraction(x,contractions_dict))

In [18]:
def removed_special_characters(text):
    """function for removed special characters from the text"""
    tokens = tokenized_text(text)
    special_characters_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filter_tokens = filter(None,[special_characters_pattern.sub('',token) for token in tokens])
    filtered_text = ' '.join(filter_tokens)
    return filtered_text

In [19]:
removed_special_characters_text = expand_contraction_text.apply(lambda x:removed_special_characters(x))

In [20]:
def removed_unicode(text):
    """function for removed unicode from the text"""
    unicode_pattern = re.compile(r'\w+[^\x00-\x7F]\w+|[^\x00-\x7F]')
    filtered_text = re.sub(unicode_pattern,'',text)
    return filtered_text

In [21]:
removed_unicode_text = removed_special_characters_text.apply(lambda x:removed_unicode(x))

In [22]:
def removed_stopwords(text):
    """function for removed stopwords from the text"""
    tokens = tokenized_text(text)
    filtered_text = [token for token in tokens if token not in stopwords_list]
    filtered_text = ' '.join(filtered_text)
    return filtered_text

In [23]:
removed_stopwords_text = removed_unicode_text.apply(lambda x:removed_stopwords(x))

In [24]:
def removed_unwanted_string(text):
    """function for removed two lenght string from text"""
    string_pattern = re.compile(r'\b(\w{1})\b|\b(\w{2})\b|\b(\w{3})\b')
    filered_text = re.sub(string_pattern,'',text)
    return filered_text

In [25]:
removed_unwanted_string_text = removed_stopwords_text.apply(lambda x:removed_unwanted_string(x))

In [26]:
removed_unwanted_string_text_tokens = removed_unwanted_string_text.apply(lambda x:tokenized_text(x))

In [27]:
def removed_repeated_characters(tokens):
    
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word

    correct_tokens = [replace(word) for word in tokens]
    return " ".join(correct_tokens)
    #return correct_tokens

In [28]:
removed_repeated_characters_text = removed_unwanted_string_text_tokens.apply(lambda x:removed_repeated_characters(x))

In [29]:
"""Annotate text token with POS tags"""
def pos_tag_text(text):
    """Convert Penn treebank tag to wordnet tag"""
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return wn.NOUN
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(),penn_to_wn_tags(pos_tag)) for word , pos_tag in tagged_text]
    
    return tagged_lower_text

In [30]:
def lemmatize_text(text):
    """function for lemmatize text based on POS tags"""
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word,pos_tag) if pos_tag else word for word , pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [33]:
lemmatized_text = removed_repeated_characters_text.apply(lambda x:lemmatize_text(x))

In [34]:
def convert_to_set(text):
    """function for convert text to vocabulary """
    return set(text.split())

In [35]:
convert_to_set_text = lemmatized_text.apply(lambda x:convert_to_set(x))

In [36]:
unique_words = list(convert_to_set_text)

In [37]:
vocab = set()

for s in unique_words:
    
    vocab = vocab.union(s)

In [38]:
len(vocab)

10140