# Speech and natural language processing - TD3

In [1]:
import time
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
import time
from context2vec.common.model_reader import ModelReader

### Open file

In [2]:
# Put spaces between words starting with
# capital letters using Regex in Python
 
def putSpace(input_str):
    # regex [A-Z][a-z]* means any string starting 
    # with capital character followed by many 
    # lowercase letters 
    words = re.findall('[A-Z][a-z]*', input_str.group(0))
 
    # Change first letter of each word into lower
    # case
    result = []
    for word in words:
        word = chr( ord (word[0]) + 32) + word[1:]
        result.append(word)
    return ' '.join(result)
 

In [3]:
f = open('CorpusBataclan_en.1M.raw.txt')

### Merge raws with corresponding tweet

In [4]:
raw_list = list(f)
rtweet_list = []
for raw in raw_list:
    if raw[:2] == 'RT':
        rtweet_list.append(raw)
    else:
        rtweet_list[-1] = rtweet_list[-1] + raw

In [5]:
print 'Number of raws:', len(raw_list)
print 'Number of effective retweets:', len(rtweet_list)

Number of raws: 1000000
Number of effective retweets: 598663


In [6]:
rtweet_list[:1]

["RT @heartfeIts: It's disappointing how people nowadays think terrorism is linked to a religion..\n"]

### Clean tweets

In [7]:
start = time.time()
tweets_to_clean = list(rtweet_list)
rtweet_list = []
for tweet in tweets_to_clean:
    
    # Remove URL
    clean_tweet = re.sub(r"http\S+", "", tweet)
    
    # Remove 'RT'
    clean_tweet = re.sub(r"RT ", "", clean_tweet)
    
    # Remove tags
    clean_tweet = re.sub(r"@\S+ ", "", clean_tweet)
    
    # Remove \n
    clean_tweet = re.sub(r"\n", " ", clean_tweet)
    
    # Remove hex characters
    clean_tweet = re.sub(r'[^\x00-\x7f]',r'', clean_tweet) 
    
    # Deal with #
    clean_tweet = re.sub(r'#\S+',putSpace, clean_tweet)

    # Remove # and *
    clean_tweet = re.sub(r'#','', clean_tweet)
    clean_tweet = re.sub(r'\*','', clean_tweet)
    
    # Convert uppercase to lowercase
    clean_tweet = clean_tweet.lower() # need to remove uppercase to compute edit distance to the words in dictionary
    
    # Add to the list
    rtweet_list.append(clean_tweet)
    
stop = time.time()
print 'Time to clean tweets: ', stop-start

Time to clean tweets:  19.7470469475


In [8]:
print 'Number of effective retweets:', len(rtweet_list)

Number of effective retweets: 598663


In [9]:
for idx,tweet in enumerate(rtweet_list[:100]):
    print str(idx)+'.', tweet

0. it's disappointing how people nowadays think terrorism is linked to a religion.. 
1. israel killing muslims everyday and no one bats an eye. terrorist attack and muslims got the blame? how shallow can you be?   that bitch stops a show because someone spilled water on stage but puts on a show when terrorists attacki 
2. french president francois hollande condemns the "terrorist attacks of unprecedented proportions." pray for paris  
3. ny lights in blue, white &amp; red as we stand in solidarity with the people of france:   
4. my name is malik riaz. i am a muslim. i condemn the paris attack over 1.5 billion muslims do.   please remember this. good on you, paris!  facebook is asking people in paris to check in if they're safe  
5. prayers to the victims and survivors of the paris tragedy. we must not let the hate of a few come between us as humans on 
6. omni hotel in dallas is displaying french flag in solidarity with france.  
7. kkk murders ppl white people: ok but not all white p

### Correcting tweets

In [10]:
start = time.time()
model_param_file = './MODEL_DIR/context2vec.mscc.model.params'
model_reader = ModelReader(model_param_file)
word2index = model_reader.word2index
stop = time.time()
print "Time to import model context2vec:", stop-start

Reading config file: ./MODEL_DIR/context2vec.mscc.model.params
Config:  {'config_path': './MODEL_DIR/', 'model_file': 'context2vec.mscc.model', 'deep': 'yes', 'drop_ratio': '0.0', 'words_file': 'context2vec.mscc.words.targets', 'unit': '300'}
Time to import model context2vec: 46.0143020153


In [11]:
# setup tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [12]:
uncorrect_words = []
for idx,tweet in enumerate(rtweet_list):
    tokenize_tweet = tokenizer.tokenize(rtweet_list[1])
    for token in tokenize_tweet:
        if token not in word2index:
            if token not in uncorrect_words:
                uncorrect_words.append(token)

In [13]:
uncorrect_words

['muslims', 'terrorists', 'attacki']

In [26]:
from src.levenshtein_distance import levenshtein_distance

for uncorrect_word in uncorrect_words:
    min_dist = np.inf
    for word_dict in word2index:
        dist = levenshtein_distance(uncorrect_word, word_dict)
        if dist<min_dist:
            min_dist = dist
            correct_word = word_dict
    print uncorrect_word, min_dist, correct_word

muslims 1.0 muslins
terrorists 1.0 terrorist
attacki 1.0 attacks
