# Speech and natural language processing - TD3

In [1]:
import time
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
import time
from tqdm import tqdm
from context2vec.common.model_reader import ModelReader
from src.clean_utils import is_number, clean_tweet
from src.levenshtein_distance import levenshtein_distance
from context2vec.eval.explore_context2vec_func import explore_context2vec

### Open file

In [2]:
first_raw = 0
last_raw = 10
n_result = 1000  # number of search result to show
f = open('CorpusBataclan_en.1M.raw.txt')

### Merge raws with corresponding tweet

In [3]:
raw_list = list(f)
rtweet_list = []
for raw in raw_list:
    if raw[:2] == 'RT':
        rtweet_list.append(raw)
    else:
        rtweet_list[-1] = rtweet_list[-1] + raw

In [4]:
print 'Number of raws:', len(raw_list)
print 'Number of effective retweets:', len(rtweet_list)

Number of raws: 1000000
Number of effective retweets: 598663


### Clean tweets

In [5]:
start = time.time()
tweets_to_clean = list(rtweet_list)
rtweet_list = []
for tweet in tqdm(tweets_to_clean[first_raw:last_raw]):
    
    tweet_cleaned = clean_tweet(tweet)
    
    # Add to the list
    rtweet_list.append(tweet_cleaned)
    
stop = time.time()
print 'Time to clean tweets: ', stop-start

100%|██████████| 10/10 [00:00<00:00, 4345.98it/s]

Time to clean tweets:  0.0323951244354





In [6]:
#for idx,tweet in enumerate(rtweet_list[:100]):
#    print str(idx)+'.', tweet

### Correcting tweets

In [7]:
start = time.time()
model_param_file = './MODEL_DIR/context2vec.ukwac.model.params'
model_reader = ModelReader(model_param_file)
w = model_reader.w
word2index = model_reader.word2index
index2word = model_reader.index2word
model = model_reader.model
stop = time.time()
print "Time to import model context2vec:", stop-start

target_exp = re.compile('\[.*\]')
# setup tokenizer
tokenizer = RegexpTokenizer(r'\w+')

Reading config file: ./MODEL_DIR/context2vec.ukwac.model.params
Config:  {'config_path': './MODEL_DIR/', 'model_file': 'context2vec.ukwac.model', 'deep': 'yes', 'drop_ratio': '0.0', 'words_file': 'context2vec.ukwac.words.targets', 'unit': '300'}
Time to import model context2vec: 66.8794682026


In [62]:
normalised_rtweet_list = []
for idx,tweet in tqdm(enumerate(rtweet_list[first_raw:last_raw])):
    print idx, tweet
    tokenize_tweet = tokenizer.tokenize(tweet)
    
    for token in tokenize_tweet:
        if is_number(token):
            continue
            
        #find uncorrect words
        if token not in word2index:  
            print ">> uncorrect word:", token
            
            #generate context
            context = re.sub(token, '[]', tweet)
            context_proposition = explore_context2vec(context, w, word2index, index2word, model, target_exp, n_result)
            
            #find clother word in context
            min_dist = np.inf 
            for proposition in context_proposition:
                dist = levenshtein_distance(token,proposition)
                if dist < min_dist:
                    min_dist = dist
                    correct_word = proposition[0]
            print ">> correction:", correct_word
            print '>> Levenshtein distance:', min_dist
            correct_tweet = re.sub(token, correct_word, tweet)
        else:
            correct_tweet = tweet
    print correct_tweet, '\n'
    
    normalised_rtweet_list.append(correct_tweet)

0it [00:00, ?it/s]

0 it's disappointing how people nowadays think terrorism is linked to a religion.. 
it's disappointing how people nowadays think terrorism is linked to a religion..  

1 israel killing muslims everyday and no one bats an eye. terrorist attack and muslims got the blame? how shallow can you be?   that bitch stops a show because someone spilled water on stage but puts on a show when terrorists attacki 
>> uncorrect word: attacki


2it [00:00,  5.76it/s]

>> correction: tricia
>> Levenshtein distance: 7.0
israel killing muslims everyday and no one bats an eye. terrorist attack and muslims got the blame? how shallow can you be?   that bitch stops a show because someone spilled water on stage but puts on a show when terrorists tricia  

2 french president francois hollande condemns the "terrorist attacks of unprecedented proportions." pray for paris  
>> uncorrect word: hollande


10it [00:00, 13.81it/s]

>> correction: <BOS>
>> Levenshtein distance: 8.0
french president francois hollande condemns the "terrorist attacks of unprecedented proportions." pray for paris   

3 ny lights in blue, white &amp; red as we stand in solidarity with the people of france:   
ny lights in blue, white &amp; red as we stand in solidarity with the people of france:    

4 my name is malik riaz. i am a muslim. i condemn the paris attack over 1.5 billion muslims do.   please remember this. good on you, paris!  facebook is asking people in paris to check in if they're safe  
my name is malik riaz. i am a muslim. i condemn the paris attack over 1.5 billion muslims do.   please remember this. good on you, paris!  facebook is asking people in paris to check in if they're safe   

5 prayers to the victims and survivors of the paris tragedy. we must not let the hate of a few come between us as humans on 
prayers to the victims and survivors of the paris tragedy. we must not let the hate of a few come between us a




In [31]:
uncorrect_words = ['hollande']
for uncorrect_word in uncorrect_words:
    min_dist = np.inf
    for word_dict in word2index:
        dist = levenshtein_distance(uncorrect_word, word_dict)
        if dist < min_dist:
            min_dist = dist
            correct_word = word_dict
    print uncorrect_word, min_dist, correct_word

hollande 1.0 holland
