In [151]:
import re, string, unicodedata
import pandas as pd
import nltk
from contractions import CONTRACTION_MAP
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [154]:
df = pd.read_csv('dataset.csv')
test = df.copy()

In [155]:
test.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6
1,Our Deeds are the Reason of this #earthquake M...,Relevant,our deeds are the reason of this earthquake m...,0.0,69,13
2,"Heard about #earthquake is different cities, s...",Relevant,heard about earthquake is different cities s...,0.25,64,9
3,"there is a forest fire at spot pond, geese are...",Relevant,there is a forest fire at spot pond geese are...,0.0,96,19
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7


In [156]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [157]:
test['contracted'] = test['text'].apply(lambda x: expand_contractions(x))

In [158]:
# Tokenizing
def tokenize(text):
    tweet_tokens = re.split('\W+', text)
    return tweet_tokens

test['tokens'] = test['contracted'].apply(lambda x: tokenize(x)).values

In [159]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        lemmas.append(lemma)
    return lemmas

In [160]:
def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = lemmatize_verbs(words)
    return words

In [161]:
test['normalize'] = test['tokens'].apply(lambda x: normalize(x))

In [163]:
test.tail(25)

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count,contracted,tokens,normalize
10851,RT @LivingSafely: #NWS issues Severe #Thunders...,Relevant,rt nws issues severe thunderstorm warnings...,0.5,99,16,RT @LivingSafely: #NWS issues Severe #Thunders...,"[RT, LivingSafely, NWS, issues, Severe, Thunde...","[rt, livingsafely, nw, issue, severe, thunders..."
10852,#??? #?? #??? #??? MH370: Aircraft debris foun...,Relevant,mh370 aircraft debris foun...,-0.2,100,12,#??? #?? #??? #??? MH370: Aircraft debris foun...,"[, MH370, Aircraft, debris, found, on, La, Reu...","[mh370, aircraft, debris, found, la, reunion, ..."
10853,Father-of-three Lost Control of Car After Over...,Relevant,father of three lost control of car after over...,0.0,92,12,Father-of-three Lost Control of Car After Over...,"[Father, of, three, Lost, Control, of, Car, Af...","[father, three, lose, control, car, overtake, ..."
10854,1.3 #Earthquake in 9Km Ssw Of Anza California ...,Relevant,1 3 earthquake in 9km ssw of anza california ...,0.5,109,18,1.3 #Earthquake in 9Km Ssw Of Anza California ...,"[1, 3, Earthquake, in, 9Km, Ssw, Of, Anza, Cal...","[one, three, earthquake, 9km, ssw, anza, calif..."
10855,Evacuation order lifted for town of Roosevelt:...,Relevant,evacuation order lifted for town of roosevelt,0.0,48,7,Evacuation order lifted for town of Roosevelt:...,"[Evacuation, order, lifted, for, town, of, Roo...","[evacuation, order, lift, town, roosevelt, htt..."
10856,See the 16yr old PKK suicide bomber who detona...,Relevant,see the 16yr old pkk suicide bomber who detona...,0.1,106,17,See the 16yr old PKK suicide bomber who detona...,"[See, the, 16yr, old, PKK, suicide, bomber, wh...","[see, 16yr, old, pkk, suicide, bomber, detonat..."
10857,To conference attendees! The blue line from th...,Relevant,to conference attendees the blue line from th...,0.0,129,23,To conference attendees! The blue line from th...,"[To, conference, attendees, The, blue, line, f...","[conference, attendee, blue, line, airport, de..."
10858,The death toll in a #IS-suicide car bombing on...,Relevant,the death toll in a is suicide car bombing on...,0.0,136,28,The death toll in a #IS-suicide car bombing on...,"[The, death, toll, in, a, IS, suicide, car, bo...","[death, toll, suicide, car, bombing, ypg, posi..."
10859,#breaking #LA Refugio oil spill may have been ...,Relevant,breaking la refugio oil spill may have been ...,0.0,77,12,#breaking #LA Refugio oil spill may have been ...,"[, breaking, LA, Refugio, oil, spill, may, hav...","[break, la, refugio, oil, spill, may, costlier..."
10860,a siren just went off and it wasn't the Forney...,Relevant,a siren just went off and it wasn't the forney...,0.0,65,12,a siren just went off and it was not the Forne...,"[a, siren, just, went, off, and, it, was, not,...","[siren, go, forney, tornado, warn]"


In [166]:
# Stitching together
tokens = []
for i,d in enumerate(test['normalize']):
    tokens.append(' '.join(d))

In [167]:
test['normalize_stitched'] = tokens

In [171]:
# Removing "Can't Decide" rows
test.drop(test[test['choose_one'] == "Can't Decide"].index, inplace=True)

In [174]:
test = test.reset_index(drop=True)

In [176]:
test.choose_one.value_counts()

Not Relevant    6187
Relevant        4673
Name: choose_one, dtype: int64

In [177]:
def encoder(x):
    if x == 'Relevant':
        return 1
    if x == 'Not Relevant':
        return 0

In [178]:
test['choose_one'] = test['choose_one'].apply(lambda x: encoder(x))

In [179]:
test.choose_one.value_counts()

0    6187
1    4673
Name: choose_one, dtype: int64

In [182]:
test.to_pickle('final-data2.pickle')