In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd
from contractions import contractions_dict
from pattern.en import tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

In [2]:
wnl = WordNetLemmatizer()

In [3]:
stopwords_list = nltk.corpus.stopwords.words('english')

In [4]:
data = pd.read_csv('tweets.csv')

In [5]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [6]:
data.drop(['keyword','location','id'],axis = 1, inplace = True)

In [7]:
data.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership https:...,1
3,Arsonist sets cars ablaze at dealership https:...,1
4,"""Lord Jesus, your love brings freedom and pard...",0


In [8]:
data.tail()

Unnamed: 0,text,target
11365,Media should have warned us well in advance. T...,0
11366,i feel directly attacked 💀 i consider moonbin ...,0
11367,i feel directly attacked 💀 i consider moonbin ...,0
11368,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0
11369,Jake Corway wrecked while running 14th at IRP.,1


In [15]:
data['text'][4446]

'This is the thing that has infuriated me. If Black Democrats were paying freaking attention, it should have infuriated t…'

In [46]:
def removed_url(text):
    """function for removed url"""
    url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
    filtered_text = re.sub(url_pattern,'',text)
    return filtered_text

In [47]:
removed_url_text = data['text'].apply(lambda x : removed_url(x))

In [53]:
removed_url_text[1111]

'I became lost in a Himalayan blizzard at 4000 meters and was saved by a horse. '

In [54]:
def removed_html(text):
    """function for removed html"""
    html_pattern = re.compile(r'<.*?>')
    filtered_text = re.sub(html_pattern,'',text)
    return filtered_text

In [55]:
removed_html_text = removed_url_text.apply(lambda x : removed_html(x))

In [81]:
removed_html_text[111]

'We continue to update our 30-day aftershock forecast scenarios for #PuertoRico. Forecasts are posted in both English …'

In [82]:
def removed_unwanted_characters(text):
    """function for removing forward_slash_followed_by_string between the text"""
    backslash_pattern = re.compile(r'(\r)+(\n)*(\r)+(\n)*($)(\w)+|(\r)+(\n)*(\r)+(\n)*(#)(\w)+|(#)(\w)+|(&)(\w)+|(@)(\w)+|($)(\w)+|[?/(\r)+(\n)*(#)*(_)*(\d)+]+')
    filtered_text = re.sub(backslash_pattern,' ', text)
    return filtered_text

In [83]:
removed_unwanted_characters_text = removed_html_text.apply(lambda x : removed_unwanted_characters(x))

In [86]:
removed_unwanted_characters_text[111]

'We continue to update our  -day aftershock forecast scenarios for  . Forecasts are posted in both English …'

In [87]:
def tokenized_text(text):
    """function for tokenized tweets text"""
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

In [88]:
def expand_contraction(text,contractions_dict):
    """function for removed the contraction words"""
    contraction_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                    flags = re.IGNORECASE|re.DOTALL)
    """function for expand match"""
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expand_contraction = contractions_dict.get(match) if contractions_dict.get(match) else contractions_dict.get(match.lower())
        
        #expanded_contraction = first_char + contraction[1:]
        return expand_contraction
    
    expanded_text = contraction_pattern.sub(expand_match,text)
    expanded_text = re.sub("'","",expanded_text)
    return expanded_text

In [89]:
expand_contraction_text = removed_unwanted_characters_text.apply(lambda x : expand_contraction(x,contractions_dict))

In [90]:
expand_contraction_text[111]

'We continue to update our  -day aftershock forecast scenarios for  . Forecasts are posted in both English …'

In [91]:
def removed_special_characters(text):
    """function for removed special characters from the text"""
    tokens = tokenized_text(text)
    special_characters_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filter_tokens = filter(None,[special_characters_pattern.sub('',token) for token in tokens])
    filtered_text = ' '.join(filter_tokens)
    return filtered_text

In [92]:
removed_special_characters_text = expand_contraction_text.apply(lambda x:removed_special_characters(x) )

In [93]:
removed_special_characters_text[111]

'We continue to update our day aftershock forecast scenarios for Forecasts are posted in both English …'

In [94]:
def removed_emoji(text):
    """Function for remove Emoji from the text"""
    pattern = re.compile(r'[^\u1F600-\u1F6FF\s]')
    filtered_text = re.sub(pattern,'',text)
    return filtered_text

In [95]:
removed_emoji_text = removed_special_characters_text.apply(lambda x:removed_emoji(x))

In [96]:
removed_emoji_text[111]

'We continue to update our day aftershock forecast scenarios for Forecasts are posted in both English '

In [97]:
def removed_unicode(text):
    """function for removed unicode from the text"""
    unicode_pattern = re.compile(r'\w+[^\x00-\x7F]\w+|[^\x00-\x7F]')
    filtered_text = re.sub(unicode_pattern,'',text)
    return filtered_text

In [98]:
removed_unicode_text = removed_emoji_text.apply(lambda x:removed_unicode(x))

In [99]:
removed_unicode_text[111]

'We continue to update our day aftershock forecast scenarios for Forecasts are posted in both English '

In [100]:
def removed_stopwords(text):
    """function for removed stopwords from the text"""
    tokens = tokenized_text(text)
    filtered_text = [token for token in tokens if token not in stopwords_list]
    filtered_text = ' '.join(filtered_text)
    return filtered_text

In [101]:
removed_stopwords_text = removed_unicode_text.apply(lambda x:removed_stopwords(x))

In [102]:
removed_stopwords_text[111]

'We continue update day aftershock forecast scenarios Forecasts posted English'

In [103]:
def removed_unwanted_string(text):
    """function for removed two lenght string from text"""
    string_pattern = re.compile(r'\b(\w{1})\b|\b(\w{2})\b|\b(\w{3})\b')
    filered_text = re.sub(string_pattern,'',text)
    return filered_text

In [104]:
removed_unwanted_string_text = removed_stopwords_text.apply(lambda x:removed_unwanted_string(x))

In [105]:
removed_unwanted_string_text[111]

' continue update  aftershock forecast scenarios Forecasts posted English'

In [106]:
"""Annotate text token with POS tags"""
def pos_tag_text(text):
    """Convert Penn treebank tag to wordnet tag"""
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return wn.NOUN
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(),penn_to_wn_tags(pos_tag)) for word , pos_tag in tagged_text]
    
    return tagged_lower_text

In [107]:
def lemmatize_text(text):
    """function for lemmatize text based on POS tags"""
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word,pos_tag) if pos_tag else word for word , pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [110]:
lemmatized_text = removed_unwanted_string_text.apply(lambda x:lemmatize_text(x))

In [113]:
lemmatized_text[11322]

'reality bend around chromatic form effortlessly float wreckage check retribution rhads'

In [117]:
def convert_to_set(text):
    """function for convert text to vocabulary """
    return set(text.split())

In [118]:
convert_to_set_text = lemmatized_text.apply(lambda x:convert_to_set(x))

In [119]:
unique_words = list(convert_to_set_text)

In [120]:
vocab = set()

for s in unique_words:
    
    vocab = vocab.union(s)

In [122]:
vocab

{'doityourself',
 'brainless',
 'babysitting',
 'deepika',
 'larry',
 'toffler',
 'virus',
 'jaish',
 'poten',
 'gerard',
 'loosely',
 'unfollowing',
 'yes',
 'veles',
 'getter',
 'pernambuco',
 'sanghi',
 'nonaccidental',
 'lapina',
 'thrash',
 'astagfirullah',
 'peek',
 'fang',
 'cooker',
 'bastard',
 'ikonic',
 'faryab',
 'hack',
 'pollen',
 'hacker',
 'eastbound',
 'construct',
 'mallam',
 'expressway',
 'partial',
 'hanga',
 'strang',
 'muscular',
 'alexander',
 'steak',
 'within',
 'hapo',
 'company',
 'bridal',
 'ogiggling',
 'chavez',
 'sauce',
 'dread',
 'diet',
 'take',
 'though',
 'parac',
 'schenker',
 'antitrans',
 'bailout',
 'plate',
 'serpentes',
 'closer',
 'appearing',
 'weaponsyielding',
 'recover',
 'qasem',
 'counselling',
 'malignant',
 'uptick',
 'defective',
 'graphqljit',
 'limitless',
 'aircraft',
 'taehyungs',
 'pste',
 'ewan',
 'unsaid',
 'kaabi',
 'nara',
 'bigand',
 'volted',
 'valiasr',
 'involvement',
 'bag',
 'coast',
 'excrement',
 'onthegr',
 'evacuat

In [36]:
def vocabulary_tokens(text):
    """function for converting whole text to vocabulary"""    
    vocabulary  = set()
    for text in lemmatized_text:
        for word in text.split():
            vocabulary.add(word)
    return vocabulary