In [1]:
import re
import numpy as np
import pandas as pd
import generic_io as gi
from collections import defaultdict

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [3]:
import editdistance
from langdetect import detect

In [4]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score

In [5]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_selection import SelectPercentile, chi2

### Variable Init

In [124]:
tknzr = TweetTokenizer()

In [125]:
scoring = {'precision': make_scorer(precision_score, average='weighted'),
           'recall': make_scorer(recall_score, average='weighted')}

##### Load the word2vec model

In [130]:
model = gi.load_from_file(file_path='veri/models/w2v_model_earthquake_76516.bin', file_format='pickle')

### Read tweets

In [126]:
def modify_tweet(text):
    text = re.sub("https?[^\s]*", " http://someurl ", text)
    text = re.sub("(?<!\w)@\w{1,15}(?!\w)", " @someuser ", text)
#     text = re.sub("\s\d*(\.|\:)?\d*\s", " digit ", text)
    text = re.sub("(\d+(/|-)\d+(/|-)\d+)", " date ", text)
    text = re.sub("\s\d?\d(:|,|.)\d\d\s?(A|a|P|p)(M|m)", " clock ", text)
#     text = emoji_pattern.sub('[\U0001f600-\U0001f650]', "", text)
    text = re.sub('#.*(\s|\n)', "", text)
    tokenized_tweet = tknzr.tokenize(text.lower())
    return tokenized_tweet

In [127]:
def load_data():
    tweets = []
    for level in range(1,3):
        for step in range(1,5):
            T1 = gi.load_from_file(file_path='veri/training/tweets/SMERP-T' + str(step) + '-level' + str(level) + '-tweets.jsonl', file_format='jsonl')
            for t in T1:
                tokenized_tweet = modify_tweet(t['text'])
                row = {'text': tokenized_tweet, 'label': step}
                if row not in tweets:
                    tweets.append(row)    
    return pd.DataFrame(tweets)

In [148]:
tdf = load_data()

In [149]:
tdf.head(2)

Unnamed: 0,label,text
0,1,"[toronto's, italian, community, already, mobil..."
1,1,"[@someuser, friends, ,, sound, off, !, everyon..."


# SVM Classification Without Normalization

In [72]:
classifier = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, lowercase=True)),
    ('clf', LinearSVC(multi_class='ovr'))
])

In [73]:
scores = cross_validate(classifier, tdf['text'], tdf['label'], scoring=scoring, cv=10, return_train_score=False)

In [219]:
for x in scores:
    if 'test' in x:
        print(x[5:], sum(scores[x])/10)

recall 0.829619851087
precision 0.814832694457


# Contextual Normalization

##### Some numbers

len(model.wv.vocab): 43867

len(long_words): 43010

count_eng: 6903

##### Find word couples by contextual similarity and edit distance

In [40]:
long_words = []
word_couples = []
for word in model.wv.vocab:
    sim_frst = None
    sim_scnd = None
    try:
        if len(word) > 1:
            
            long_words.append(word)
            
            sim_list = model.wv.most_similar(word)
            
            len_first = len(sim_list[0][0])
            ed_first = editdistance.eval(word, sim_list[0][0])
            if sim_list[0][1] > 0.80:
                if len_first >= 6 and ed_first < 3:
                    sim_frst = sim_list[0]
                elif 6 > len_first > 1 and ed_first < 2:
                    sim_frst = sim_list[0]
                
            len_scnd = len(sim_list[1][0])
            ed_scnd = editdistance.eval(word, sim_list[1][0])
            if sim_list[1][1] > 0.80:
                if len_scnd >= 6 and ed_scnd < 3:
                    sim_scnd = sim_list[1]
                elif 6 > len_scnd > 1 and ed_scnd < 2:
                    sim_scnd = sim_list[1]
                
            if sim_frst or sim_scnd:
                
                if sim_frst and sim_scnd:
                    if ed_first > ed_scnd:
                        most_sim = sim_scnd
                    else:
                        most_sim = sim_frst
                elif sim_frst and not sim_scnd:
                    most_sim = sim_frst
                elif sim_scnd and not sim_frst:
                    most_sim = sim_scnd
                    
#                 print(word, most_sim[0], most_sim[1])
                couple = {'word': word, 'most_sim': most_sim[0], 'similarity': most_sim[1]}
                if couple not in word_couples:
                        word_couples.append(couple)
                    
    except Exception as e:
        pass
#         print(e)

##### Some examples

{'most_sim': 'fuck', 'similarity': 0.896193265914917, 'word': 'fucking'},
 
 {'most_sim': 'women', 'similarity': 0.9045039415359497, 'word': 'men'},
 
 {'most_sim': 'earthquake-hit', 'similarity': 0.884941041469574,'word': 'quake-hit'},
 
 {'most_sim': 'reading', 'similarity': 0.9149921536445618, 'word': 'hearing'},
 
 {'most_sim': 'horrible', 'similarity': 0.9077590703964233, 'word': 'terrible'},
 
 {'most_sim': 'levelling', 'similarity': 0.9155386686325073, 'word': 'leveling'},
 
 {'most_sim': 'didnt', 'similarity': 0.8618503212928772, 'word': "didn't"},
 
 {'most_sim': 'worrying', 'similarity': 0.8041735887527466, 'word': 'worring'},
 
 {'most_sim': 'sympathises', 'similarity': 0.9335030317306519, 'word': 'sympathizes'},
 
 {'most_sim': 'telling', 'similarity': 0.8421267867088318, 'word': 'calling'},
 ('#n3rdlife', '#nerdlife'),
 
 ('gurdwara', 'gurudwara'),
 ('gurdwaras', "gurudwara's"),
 ('gurdwaras', 'gurudwaras'),
 
 ('northeast', 'north-east'),
 
 ('richter', 'ritcher'),

 ('#emergency', '#emergenza'),

##### Save the couples to a file

##### Load couples from file

In [195]:
word_couples = gi.load_from_file(file_path='veri/normalization_lists/first_second_non-en_080_546.json')
len(word_couples)

546

##### Create the list of words to be replaced by each other (and convert the list into dataframe)

In [198]:
norm_list = []
for w in word_couples:
    if not re.match(r'.*\d+', w['most_sim']):
        if len(w['word']) < len(w['most_sim']):
            couple = (w['word'], w['most_sim'])
        else:
            couple = (w['most_sim'], w['word'])
        if couple not in norm_list and tuple(reversed(couple)) not in norm_list:
            norm_list.append(couple)
norm_list.sort()
norm_df = pd.DataFrame(norm_list, columns=["word", "most_sim"])
len(norm_df)

229

##### Load the training data

In [207]:
tdf = load_data()
tdf_list = tdf.to_dict(orient="records")

##### Normalization by Replacement

In [208]:
replaced_words = {}
for tweet_dict in tdf_list:
    for i, word in enumerate(tweet_dict['text']):
        if word in list(norm_df['word']):
            rep_word = norm_df.loc[norm_df['word'] == word]['most_sim'].iloc[0]
            couple_str = word + '-' + rep_word
            if couple_str not in replaced_words:
                replaced_words[couple_str] = 0
            replaced_words[couple_str] += 1
            tweet_dict['text'][i] = rep_word

In [211]:
len(replaced_words), sum(replaced_words.values()), replaced_words

(70,
 1146,
 {'#scary-#scared': 1,
  '<b>-</b>': 1,
  'abc-bbc': 16,
  'anguish-#anguish': 1,
  'ap-afp': 29,
  'brits-britons': 12,
  'centre-center': 5,
  'collapse-collapses': 6,
  'comment-comments': 1,
  'continue-continues': 18,
  'da-d:': 1,
  'damages-damaged': 25,
  'donating-donations': 19,
  'donation-donations': 13,
  'ear-eart': 3,
  'earthq-earthqu': 5,
  'earthqu-earthqua': 2,
  'eath-#death': 3,
  'effected-affected': 1,
  'effort-efforts': 9,
  'epicentre-epicenter': 5,
  'ever-never': 1,
  'favorite-favourite': 1,
  'fund-funds': 28,
  "he's-she's": 1,
  'he-she': 6,
  'heads-headed': 1,
  'hearing-reading': 3,
  'ita-ital': 3,
  'launch-launches': 1,
  'leveled-levelled': 3,
  'line-lines': 2,
  'magnitute-#magnitude': 1,
  'make-take': 8,
  'min-0min': 1,
  'monday-sunday': 1,
  "morning-morning's": 18,
  "mornings-morning's": 1,
  'ne-se': 3,
  'northeast-north-east': 4,
  'nytimes-latimes': 1,
  'offer-offers': 7,
  'part-parts': 5,
  'password-passwords': 18,
  '

##### Convert normalized data into dataframe

In [190]:
tdf_normalized = pd.DataFrame(tdf_list)

##### Define the classifier

In [191]:
classifier_norm = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, lowercase=True)),
    ('clf', LinearSVC(multi_class='ovr'))
])

##### Run 10-fold cross-validation with the normalized data

In [192]:
scores_norm = cross_validate(classifier_norm, tdf_normalized['text'], tdf_normalized['label'], scoring=scoring, cv=10, return_train_score=False)

##### Scores of SVM with contextually normalized data

In [221]:
for x in scores_norm:
    if 'test' in x:
        print(x[5:], sum(scores_norm[x])/10)

recall 0.830396518241
precision 0.81446420177


# Text Classification with Word2vec

##### Classifier class

In [222]:
# See: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 100 #len(word2vec.values().next())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, lowercase=True)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

##### Preprocessing for word embeddings

In [60]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

##### Load the training data

In [223]:
tdf = load_data()

##### Define the classifier

In [61]:
classifier_emb = Pipeline([
    ('tfidf', TfidfEmbeddingVectorizer(w2v)),
    ('clf', LinearSVC(multi_class='ovr'))
])

##### Run 10-fold cross-validation with word embeddings

In [62]:
scores_emb = cross_validate(classifier_emb, tdf['text'], tdf['label'], scoring=scoring, cv=10, return_train_score=False)

##### Scores of Text Classification with W2V

In [225]:
for x in scores_emb:
    if 'test' in x:
        print(x[5:], sum(scores_emb[x])/10)

recall 0.792158245338
precision 0.767850891193
