In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /home/imerit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/imerit/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/imerit/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/imerit/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# Load the training data; the bare `df.shape` on the last line displays (rows, columns).
df = pd.read_csv('train.csv')
df.shape

(7613, 5)

In [3]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
df['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
damage                   41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [5]:
# Double brackets select a one-column DataFrame, so every row of the resulting
# array is a one-element sequence; preprocess_text unwraps it with `[0]`.
text_array = df[["text"]].to_numpy()

In [6]:
#remove all hashtags
text = """Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"""
def remove_hashtags(text):
    """Return ``text`` with every ``#`` character deleted.

    Only the symbol is removed; the word that followed it is kept.
    """
    return text.replace('#', '')

print(remove_hashtags(text))

Our Deeds are the Reason of this earthquake May ALLAH Forgive us all


In [7]:
#remove all brackets and links
text = """During the 1960s the oryx a symbol of the [Arabian Peninsula] were annihilated by hunters. \nhttp://t.co/yangEQBUQW http://t.co/jQ2eH5KGLt """
def remove_brackets_links(text):
    """Strip bracket characters and ``http...`` links from ``text``.

    Round, square and curly bracket characters are deleted first (their
    contents are kept); afterwards every run of non-whitespace characters
    that starts with ``http`` is deleted.
    """
    without_brackets = re.sub(r'[\([{})\]]', '', text)
    without_links = re.sub(r'http\S+', '', without_brackets)
    return without_links

print(remove_brackets_links(text))

During the 1960s the oryx a symbol of the Arabian Peninsula were annihilated by hunters. 
  


In [8]:
# Remove newlines ('\n'), tabs ('\t'), hyphens ('-') and backslashes ('\'); also drop words that are purely digits or that end with ':'
text = """During the 1960s the or-yx a symbol of the [Arabian Peninsula] were annihilated by hunters. \nhttp://t.co/yangEQBUQW http://t.co/jQ2eH5KGLt """
def remove_escape(text):
    """Remove line breaks, tabs, hyphens and backslashes, then drop noise words.

    Newlines and tabs are turned into single spaces so that the words on
    either side of them are not fused together. Hyphens and backslashes are
    deleted outright (``or-yx`` becomes ``oryx``). After that, words that
    consist only of digits and words that end with ``:`` are discarded.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving words joined by single spaces, with no leading or
        trailing whitespace. An empty or all-noise input gives ``""``.
    """
    # One translation table handles every character-level rule in a single pass.
    char_rules = str.maketrans({'\n': ' ', '\t': ' ', '-': None, '\\': None})
    cleaned = text.translate(char_rules)

    # str.split() never yields empty strings, so endswith() is always safe here.
    kept_words = [
        word for word in cleaned.split()
        if not word.isdigit() and not word.endswith(':')
    ]
    return ' '.join(kept_words)

print(remove_escape(text))

 During the 1960s the oryx a symbol of the [Arabian Peninsula] were annihilated by hunters. http://t.co/yangEQBUQW http://t.co/jQ2eH5KGLt


In [9]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rules are applied one after another in the order listed. The two
    irregular forms come first so the generic ``n't`` rule does not mangle
    them, and ``n't`` is handled before the bare ``'t`` rule. Matching is
    case-sensitive.
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [10]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree


def get_continuous_chunks(text):
    """Return the distinct named-entity strings found in ``text``.

    The text is tokenised, POS-tagged and passed through ``ne_chunk``. Every
    subtree in the result is one named entity; its tokens are joined with
    single spaces. Each entity string is reported once, in order of first
    appearance.

    Parameters
    ----------
    text : str
        Text to scan for named entities.

    Returns
    -------
    list of str
        Unique entity strings, e.g. ``['Arabian Peninsula']``.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    for node in chunked:
        # Plain (token, tag) tuples are ordinary words; only subtrees are entities.
        if not isinstance(node, Tree):
            continue
        # Build the entity from this subtree alone, so a repeated entity can
        # never leak into the text of the next one.
        named_entity = " ".join(token for token, _pos in node.leaves())
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk

print(get_continuous_chunks(text))

['Arabian Peninsula']


In [11]:
"""

Text chunking, also referred to as shallow parsing, is a task that
follows part-of-speech tagging and adds more structure to the sentence:
it groups some phrases and named entities into single units.
After chunking, join the words of each phrase/named entity with "_",
and remove the phrase/named entity if it is a "Person".
You can use nltk.ne_chunk to get these.
An example is given below; please go through it.

useful links: 
https://www.nltk.org/book/ch07.html
https://stackoverflow.com/a/31837224/4084039
http://www.nltk.org/howto/tree.html
https://stackoverflow.com/a/44294377/4084039

"""
def chunking(text):
    """Remove PERSON entities and underscore-join multi-word GPE entities.

    Each sentence of ``text`` is tokenised, POS-tagged and run through
    ``nltk.ne_chunk``. For every named-entity subtree the entity string and
    its label are taken from that same subtree, so a label can never be
    applied to a different entity. Then:

    * entities labelled ``PERSON`` are deleted from the text;
    * entities labelled ``GPE`` with more than one word have their words
      joined by ``_`` (``New York`` becomes ``New_York``).

    Replacement is done with ``str.replace``, so every occurrence of the
    entity string in the text is affected.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The text after the removals and replacements described above.
    """
    # (entity string, label) pairs in order of first appearance.
    entities = []
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Only subtrees carry a label; plain (token, tag) tuples do not.
            if not hasattr(chunk, 'label'):
                continue
            entity = " ".join(token for token, _tag in chunk.leaves())
            pair = (entity, chunk.label())
            if pair not in entities:
                entities.append(pair)

    for entity, label in entities:
        if label == 'PERSON':
            text = text.replace(entity, '')
        elif label == 'GPE' and len(entity.split()) > 1:
            text = text.replace(entity, "_".join(entity.split()))

    return text
print(chunking(text))

During the 1960s the or-yx a symbol of the [] were annihilated by hunters. 
http://t.co/yangEQBUQW http://t.co/jQ2eH5KGLt 


In [12]:
def keep_only_alphnumeric(text):
    """Delete every non-word character from each whitespace-separated word.

    Word characters (letters, digits and ``_``) are kept. The cleaned words
    are joined with single spaces; a word made up entirely of non-word
    characters leaves an empty slot, which shows up as two adjacent spaces.
    """
    cleaned_words = (re.sub(r'\W+', '', word) for word in text.split())
    return ' '.join(cleaned_words)

keep_only_alphnumeric('Hello there')

'Hello there'

In [13]:
def remove_extras(text):
    """Filter and normalise the words of ``text``.

    For every whitespace-separated word (newlines count as whitespace, so
    words on adjacent lines are not fused):

    1. skip it if it has 2 or fewer, or 15 or more, characters;
    2. skip it if it consists only of digits;
    3. peel underscores from both ends (at most two per side);
    4. if an underscore remains within the first three characters, drop
       everything up to and including it (``a_storm`` becomes ``storm``);
    5. re-apply the length rule from step 1;
    6. lower-case it;
    7. keep it only if it is alphabetic, where underscore-joined alphabetic
       parts such as ``new_york`` also count as alphabetic.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving words joined by single spaces (``""`` if none survive).
    """
    kept_words = []
    for word in text.split():
        if len(word) <= 2 or len(word) >= 15:
            continue
        if word.isdigit():
            continue

        # startswith/endswith are safe on an empty string, so a word made only
        # of underscores (e.g. "___") cannot raise IndexError while peeling.
        if word.startswith('_') and word.endswith('_'):
            word = word[1:-1]
        if word.startswith('_'):
            word = word[1:]
        if word.endswith('_'):
            word = word[:-1]

        # A one- or two-letter prefix before an underscore is treated as noise.
        underscore_at = word.find('_')
        if underscore_at != -1 and underscore_at <= 2:
            word = word[underscore_at + 1:]

        if len(word) <= 2 or len(word) >= 15:
            continue
        word = word.lower()

        # Every underscore-separated part must be purely alphabetic; this
        # rejects digits and stray underscores but keeps joined phrases.
        if not all(part.isalpha() for part in word.split('_')):
            continue

        kept_words.append(word)
    return ' '.join(kept_words)

In [14]:
def preprocess_text(text_array):
    """Run the cleaning pipeline on every row of ``text_array``.

    Each row is expected to be a sequence whose first element is the raw
    text string. The steps are applied in this order: hashtag symbols,
    brackets and links, escape characters, named-entity chunking,
    lower-casing, contraction expansion, non-word character removal and the
    final word filter.

    Lower-casing is deliberately done after ``chunking``: NLTK's tagger and
    named-entity chunker use capitalisation as a cue, so lower-casing first
    would leave almost nothing for ``chunking`` to find. ``decontracted``
    runs after lower-casing because its patterns are lower-case.

    Parameters
    ----------
    text_array : iterable
        Rows whose element ``[0]`` is the text to clean.

    Returns
    -------
    list of str
        One cleaned string per input row, in the same order.
    """
    new_text_array = []
    for row in text_array:
        each_text = row[0]
        each_text = remove_hashtags(each_text)
        each_text = remove_brackets_links(each_text)
        each_text = remove_escape(each_text)
        # Entity detection must see the original capitalisation.
        each_text = chunking(each_text)
        each_text = each_text.lower()
        each_text = decontracted(each_text)
        each_text = keep_only_alphnumeric(each_text)
        each_text = remove_extras(each_text)
        new_text_array.append(each_text)
    return new_text_array

In [15]:
# Run the cleaning pipeline and keep only the columns used downstream.
new_texts = preprocess_text(text_array)
df['new_texts'] = new_texts
# Use the `columns=` keyword: passing the axis positionally (`df.drop('text', 1)`)
# is rejected by newer pandas releases.
df = df.drop(columns='text')
df = df[['keyword', 'location', 'new_texts', 'target']]

In [16]:
# Persist the cleaned frame. No `index` argument is passed, so pandas also
# writes the row index as the first, unnamed column of the file.
df.to_csv("text_preprocessed.csv")