In [3]:
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Flatten, add, Embedding, SimpleRNN, GRU, LSTM, Masking
from keras.layers import Conv2D, Dense, Activation, Concatenate
from keras.layers.core import Activation, Reshape
from keras.layers import Dropout, BatchNormalization, MaxPooling2D, Conv2D
from keras.utils import np_utils, to_categorical
from keras.optimizers import SGD, Adam
from keras.regularizers import l2
from keras import regularizers
import pandas as pd
from itertools import repeat
from copy import deepcopy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# 0. Load Data

In [4]:
train_data_raw = np.genfromtxt('train.tsv',delimiter ='\t', dtype = "str")
val_data_raw = np.genfromtxt('val.tsv',delimiter ='\t', dtype = "str")
test_data_raw = np.genfromtxt('test.tsv',delimiter ='\t', dtype = "str")

In [5]:
train_data = []
val_data = []
test_data = []

for i in range(len(train_data_raw)):
    train_data.append([train_data_raw[i][0], ['<S>'] + list(train_data_raw[:,1][i]) + ['</S>']])
for i in range(len(val_data_raw)):
    val_data.append([val_data_raw[i][0], ['<S>'] + list(val_data_raw[:,1][i]) + ['</S>']])
for i in range(len(test_data_raw)):
    test_data.append([test_data_raw[i][0], ['<S>'] + list(test_data_raw[:,1][i]) + ['</S>']])

# find the vector len by calculating max length
vec_len = 0
for i in range(len(train_data)):
    if len(train_data[i][1]) > vec_len:
        vec_len = len(train_data[i][1])
for i in range(len(val_data)):
    if len(val_data[i][1]) > vec_len:
        vec_len = len(val_data[i][1])
for i in range(len(test_data)):
    if len(test_data[i][1]) > vec_len:
        vec_len = len(test_data[i][1])
print("Vector length =", vec_len)

# padding with </S>
for i in range(len(train_data)):
    if len(train_data[i][1]) < vec_len:
        pads = ['</S>' for c in range(vec_len - len(train_data[i][1]))]
        train_data[i][1] += pads
for i in range(len(test_data)):
    if len(test_data[i][1]) < vec_len:
        pads = ['</S>' for c in range(vec_len - len(test_data[i][1]))]
        test_data[i][1] += pads
for i in range(len(val_data)):
    if len(val_data[i][1]) < vec_len:
        pads = ['</S>' for c in range(vec_len - len(val_data[i][1]))]
        val_data[i][1] += pads

train_data_np = np.zeros((len(train_data), vec_len+1), dtype=object)
val_data_np = np.zeros((len(val_data), vec_len+1), dtype=object)
test_data_np = np.zeros((len(test_data), vec_len+1), dtype=object)

for i in range(len(train_data)):
    train_data_np[i] = np.append(np.array(train_data[i][0]), np.array(train_data[i][1]))
for i in range(len(val_data)):
    val_data_np[i] = np.append(np.array(val_data[i][0]), np.array(val_data[i][1]))
for i in range(len(test_data)):
    test_data_np[i] = np.append(np.array(test_data[i][0]), np.array(test_data[i][1]))

print('train data samples =', train_data_np.shape)
print('test data samples =', test_data_np.shape)
print('val data samples =', val_data_np.shape)
print()
print(train_data_np[1])

Vector length = 161
train data samples = (80175, 162)
test data samples = (14960, 162)
val data samples = (11759, 162)

['en' '<S>' 'I' ' ' 'h' 'a' 'v' 'e' ' ' 'g' 'a' 'i' 'n' 'e' 'd' ' ' 'l'
 'e' 'v' 'e' 'l' ' ' '3' '9' ' ' 'i' 'n' ' ' 'T' 'h' 'e' ' ' 'T' 'r' 'i'
 'b' 'e' 'z' ' ' 'a' 'n' 'd' ' ' 'C' 'a' 's' 't' 'l' 'e' 'z' '!' ' ' 'C'
 'a' 'n' ' ' 'y' 'o' 'u' ' ' 'a' 'c' 'h' 'i' 'e' 'v' 'e' ' ' 'i' 't' ' '
 'a' 's' ' ' 'w' 'e' 'l' 'l' '?' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>' '</S>'
 '</S>' '</S>' '<

In [6]:
# Vocab list initialized with start, stop and Out-Of-Vocab tokens respectively
vocab = {'<S>':0, '</S>':0, 'OOV':0}
total_char_count = 0

# for each sample
for i in range(len(train_data_np)):
    # for each char in tweet
    for c in list(train_data_np[i,1:]):
        if c in vocab:
            vocab[c] += 1
        else:
            total_char_count += 1
            vocab[c] = 1

# calculate OOV tokens
oov_chars = []
for c in vocab:
    if c != 'OOV' and vocab[c] < 10:
        vocab['OOV'] += 1
        oov_chars.append(c)

# remove OOV tokens
for c in oov_chars:
    vocab.pop(c)

In [7]:
# Vocabulory encodings
vocab_encoding = {}
count = 0
for c in vocab:
  vocab_encoding[c] = count
  count += 1

encode_char = np.vectorize(lambda c : vocab_encoding[c] if c in vocab else vocab_encoding['OOV'])
train_data_np[:,1:] = encode_char(train_data_np[:,1:])
val_data_np[:,1:] = encode_char(val_data_np[:,1:])
test_data_np[:,1:] = encode_char(test_data_np[:,1:])

np.array(train_data_np[:,1:], dtype=int)

array([[ 0,  3,  4, ...,  1,  1,  1],
       [ 0, 30, 10, ...,  1,  1,  1],
       [ 0, 42,  9, ...,  1,  1,  1],
       ...,
       [ 0, 49, 18, ...,  1,  1,  1],
       [ 0, 66, 22, ...,  1,  1,  1],
       [ 0, 49, 26, ...,  1,  1,  1]])

In [8]:
print(train_data_np[2,:])
print(val_data_np[2,:])
print(test_data_np[2,:])

['pt' 0 42 9 8 7 4 6 5 8 15 5 10 43 10 12 5 44 15 7 16 10 6 7 12 10 7 9 8
 18 7 10 15 5 8 31 28 10 15 5 12 15 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
['es' 0 34 5 6 5 13 7 13 9 28 12 10 162 10 162 10 162 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
['en' 0 53 28 22 13 10 9 6 7 14 9 8 7 15 9 28 8 10 28 57 10 55 20 92 10 40
 9 11 10 31 5 13 10 9 12 10 7 4 40 7 39 12 10 35 5 15 15 5 13 10 15 31 7 8
 10 15 31 5 10 13 5 7 4 9 15 39 16 10 17 28 12 10 40 31 5 8 10 12 31 5 10
 17 28 6 5 12 10 9 8 15 28 10 18 10 31 28 22 12 5 10 7 8 18 10 13 5 6 28
 1

In [10]:
x_train = np.array(train_data_np[:,1:], dtype=int)
y_train = np.array(train_data_np[:,0])

x_test = np.array(test_data_np[:,1:], dtype=int)
y_test = np.array(test_data_np[:,0])

x_val = np.array(val_data_np[:,1:], dtype=int)
y_val = np.array(val_data_np[:,0])


classes = np.unique(y_val)
print(len(classes))
print(classes)


i=-1
for c in classes:
    i=i+1
    y_train = np.where(y_train == c,i,y_train)
    y_test = np.where(y_test == c,i,y_test)
    y_val = np.where(y_val == c,i,y_val)


9
['ca' 'de' 'en' 'es' 'eu' 'fr' 'gl' 'it' 'pt']


In [11]:
np.savez('train_data.npz', X = x_train, y = y_train)
np.savez('val_data.npz', X = x_val, y = y_val)
np.savez('test_data.npz', X = x_test, y = y_test)

# 1. Warm-up: Perplexity of a Unigram Model

## a. Create vocabulary 

In [3]:
#Define your vocabulary to be the set of characters that occur at least 10 times in the training data.
#Everything else should be mapped to a special token for out-of-vocabulary tokens that should also be included 
#There are two additional special tokens that should be included in the vocabulary: 
#the start token <S> and the end token </S>. These are added to the beginning and the end of each Tweet.

vocab = {}
vocab_val = {}
vocab_test = {}
max_len = 0
max_len_val = 0
max_len_test = 0

for i in range(0, len(train)):
    
    tweet_train = train[i][1]
    chars_train = list(tweet_train)
    max_len = max(max_len, len(chars_train))
        
    for ch in chars_train:
        
        if ch in vocab.keys():
            vocab[ch] = vocab[ch] + 1
        else:
            vocab[ch] = 1

for i in range(0, len(val)):
    
    tweet_val = val[i][1]
    chars_val = list(tweet_val)
    max_len_val = max(max_len_val, len(chars_val))        
    for ch in chars_val:
        
        if ch in vocab_val.keys():
            vocab_val[ch] = vocab_val[ch] + 1
        else:
            vocab_val[ch] = 1
            
for i in range(0, len(test)):
    
    tweet_test = test[i][1]
    chars_test = list(tweet_test)
    max_len_test = max(max_len_test, len(chars_test))
        
    for ch in chars_test:
        
        if ch in vocab_test.keys():
            vocab_test[ch] = vocab_test[ch] + 1
        else:
            vocab_test[ch] = 1           

In [4]:
print(max_len)
print(max_len_val)
print(max_len_test)

152
159
152


In [8]:
vocab['<S>'] = len(train)
vocab['</S>'] = len(train)
vocab['Oov']  = 0
vocab_ = {}

for k,v in vocab.items():
    
    if (v < 10):
        vocab['Oov'] = vocab['Oov'] + 1
    else:
        vocab_[k] = v
        
vocab_['Oov'] = vocab['Oov']


In [9]:
print('Size of the dictionary: ', len(vocab_))
print('Percent of out of vocabulary words: %f%%' % (vocab_['Oov']/sum(vocab_.values())*100))

Size of the dictionary:  494
Percent of out of vocabulary words: 0.016351%


## b. Entropy and Perplexity 

In [10]:
#Compute relative frequency

total_nw = sum(vocab_.values()) - vocab_['<S>']
relative_freq_train = {}


for k,v in vocab_.items():
    
        if (v == 0):
            relative_freq_train[k] = 1
        else:
            relative_freq_train[k] = v/total_nw


In [11]:
vocab_val = deepcopy(vocab_)
vocab_val = dict.fromkeys(vocab_val, 0)

for i in range(0,len(val)):
    
    char_list = list(val[i][1])
    
    for j in char_list:
        
        if j in vocab_val.keys():
            vocab_val[j] = vocab_val[j] + 1

In [12]:
vocab_val['<S>'] = len(val)
vocab_val['</S>'] = len(val)

total_nw_val = sum(vocab_val.values()) - vocab_val['<S>']
relative_freq_val = {}


for k,v in vocab_val.items():
    
    if (k != '<S>'):
        
        if (v == 0):
            relative_freq_val[k] = 1
        else:
            relative_freq_val[k] = v/total_nw_val
        

In [13]:
#Compute entropy
entropy = 0

for k,v in relative_freq_val.items():
    
    entropy = entropy + (relative_freq_train[k]*np.log2(relative_freq_val[k]))

entropy = -1*entropy
perplexity = 2**(entropy)    

print('Entropy :', entropy)
print('Perplexity :', perplexity)


Entropy : 5.023521530828288
Perplexity : 32.526000522678245


# 2. Option B: Recurrent Neural Network

# 3. Preprocessing 

In [32]:
#In this problem, sequence length refers to the number of characters in a Tweet and batch
#size refers to the number of Tweets processed in parallel. Batch sizes greater than one
#are used to get better estimates of the gradients.
#Prepend all of the sequences with the start of sentence token <S> and append the end
#of sentence token </S>. It is convienent to define a maximum sequence length and then
#pad all the sequences to have that length. You can use the </S> token for padding.
#Keep track of the original (before padding) length of each sequence. You also need to
#create a table that can map characters to ids and a separate table that maps languages
#to ids.

batch_size = 32
max_tweet_len = 161
og_size = []
padded_tweets = []

      
for i in range(0, len(train)):
    
    tweet = train[i][1]
    og_size.append(len(tweet))
    tweet_char = list(tweet)
    tweet_char.insert(0, '<S>')
    
    if len(tweet_char) < max_tweet_len:
        tweet_char.extend(repeat('</S>', max_tweet_len - len(tweet_char)))
    padded_tweets.append(tweet_char)

padded_tweets= np.array(padded_tweets)


In [33]:
#np.savez('padded.npz', tweets = padded_tweets)
print(len(padded_tweets[0]))

161


In [27]:
#In this problem, sequence length refers to the number of characters in a Tweet and batch
#size refers to the number of Tweets processed in parallel. Batch sizes greater than one
#are used to get better estimates of the gradients.
#Prepend all of the sequences with the start of sentence token <S> and append the end
#of sentence token </S>. It is convienent to define a maximum sequence length and then
#pad all the sequences to have that length. You can use the </S> token for padding.
#Keep track of the original (before padding) length of each sequence. You also need to
#create a table that can map characters to ids and a separate table that maps languages
#to ids.

batch_size = 32
max_tweet_len = 161
og_size = []
padded_tweets_val = []

      
for i in range(0, len(val)):
    
    tweet = val[i][1]
    og_size.append(len(tweet))
    tweet_char = list(tweet)
    tweet_char.insert(0, '<S>')
    
    if len(tweet_char) < max_tweet_len:
        tweet_char.extend(repeat('</S>', max_tweet_len - len(tweet_char)))
    padded_tweets_val.append(tweet_char)

padded_tweets_val= np.array(padded_tweets_val)


In [15]:
batch_size = 32
max_tweet_len = 161
og_size = []
padded_tweets_test = []

      
for i in range(0, len(test)):
    
    tweet = test[i][1]
    og_size.append(len(tweet))
    tweet_char = list(tweet)
    tweet_char.insert(0, '<S>')
    
    if len(tweet_char) < max_tweet_len:
        tweet_char.extend(repeat('</S>', max_tweet_len - len(tweet_char)))
    padded_tweets_test.append(tweet_char)

padded_tweets_test= np.array(padded_tweets_test)

In [16]:
# padded = np.load('padded.npz')
# padded_tweets = padded['tweets']

padded_tweets_test.shape

(14960, 161)

In [28]:
char_idx = {}
lang_idx = {'en': 0, 'es': 1, 'pt':2, 'gl' :3, 'eu': 4, 
         'ca' :5, 'fr' : 6, 'it': 7, 'de':8}

for idx,k in enumerate(vocab_):
    char_idx[k] = idx


In [24]:
X = []
ln_idx = []
y = []

for i in range(0, len(train)):
    
    char_list = padded_tweets[i]
    #print(len(char_list))
    ln = train[i][0]
    seq1 = []
    seq2 = []
    seq3 = []
    
    for i in range(0, len(char_list)):
        
        if char_list[i] not in char_idx.keys():
            
            char_id = char_idx['Oov']
        else:
            char_id = char_idx[char_list[i]]
            
            
        
        dat1 = char_id        
        seq1.append(dat1)
    seq3.append(lang_idx[ln])

    
    X.append(seq1)
    ln_idx.append(seq3)
        
     
        

NameError: name 'padded_tweets' is not defined

In [37]:
X = np.array(X); y = np.array(y); ln_idx = np.array(ln_idx)
#X = np.reshape(X, (len(X),))
ln_idx.shape
# for i in range(0, len(X)):
    
#     if len(X[i]) != max_tweet_len:
#         print(i, len(X[i]))

(80175, 1)

In [23]:
print(ln_idx.shape)
np.savez('train_data.npz', X = X,  y= ln_idx)


NameError: name 'ln_idx' is not defined

In [48]:
data = np.load('train_data.npz')
X = data['X']
y = data['y']


In [29]:
X_val = []
ln_idx_val = []


for i in range(0, len(val)):
    
    char_list = padded_tweets_val[i]
    #print(len(char_list))
    ln = train[i][0]
    seq1 = []
    seq2 = []
    seq3 = []
    
    for i in range(0, len(char_list)):
        
        if char_list[i] not in char_idx.keys():
            
            char_id = char_idx['Oov']
        else:
            char_id = char_idx[char_list[i]]
            
            
        
        dat1 = char_id        
        seq1.append(dat1)
    seq3.append(lang_idx[ln])

    
    X_val.append(seq1)
    ln_idx_val.append(seq3)
        
     
        

In [30]:
X_val = np.array(X_val); ln_idx_val = np.array(ln_idx_val)
print(ln_idx_val.shape)
print(X_val.shape)
np.savez('val_data.npz', X = X_val,  y= ln_idx_val)

(11759, 1)
(11759, 161)


In [48]:
X_val[5]
(val[5][1])

'Viva la operación bikini ✌️ @coquetaLgancia'

In [43]:
max_tweet_len = 161
og_size = []
padded_tweets_test = []

      
for i in range(0, len(test)):
    
    tweet = test[i][1]
    og_size.append(len(tweet))
    tweet_char = list(tweet)
    tweet_char.insert(0, '<S>')
    
    if len(tweet_char) < max_tweet_len:
        tweet_char.extend(repeat('</S>', max_tweet_len - len(tweet_char)))
    padded_tweets_test.append(tweet_char)

padded_tweets_test= np.array(padded_tweets_test)
padded_tweets_test.shape

(14960, 161)

In [18]:
X_test = []
ln_idx_test = []


for i in range(0, len(test)):
    
    char_list = padded_tweets_test[i]
    #print(len(char_list))
    ln = train[i][0]
    seq1 = []
    seq2 = []
    seq3 = []
    
    for i in range(0, len(char_list)):
        
        if char_list[i] not in char_idx.keys():
            
            char_id = char_idx['Oov']
        else:
            char_id = char_idx[char_list[i]]
            
            
        
        dat1 = char_id        
        seq1.append(dat1)
    seq3.append(lang_idx[ln])

    
    X_test.append(seq1)
    ln_idx_test.append(seq3)
        
     
        

In [19]:
X_test = np.array(X_test); ln_idx_test = np.array(ln_idx_test)
print(ln_idx_test.shape)
print(X_test.shape)
np.savez('test_data.npz', X = X_test,  y= ln_idx_test)

(14960, 1)
(14960, 161)


NameError: name 'X_val' is not defined

In [54]:
len_vocab = len(vocab_)
len_training = len(train)
num_lang = 9


Vocab = Input(name = 'vocab', shape = [max_tweet_len])

char_embedding =  Embedding(name = 'char_embedding', input_dim = len_vocab, output_dim = 10)(Vocab)

reshaped_out = Reshape((161,10, 1))(char_embedding)
#print(reshaped_out.shape)

l1 = Conv2D(64, 4, activation='relu')(reshaped_out)
l2 = MaxPooling2D(pool_size=2)(l1)
#l3 = Dropout(0.25)(l2)
l3 = BatchNormalization()(l2)
l4 = Flatten()(l3)
#l5 = Dense(128, activation='relu')(l4)
out = Dense(num_lang, activation='softmax')(l4)




model = Model(inputs = Vocab, outputs=out)
model.summary()
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vocab (InputLayer)           (None, 161)               0         
_________________________________________________________________
char_embedding (Embedding)   (None, 161, 10)           4940      
_________________________________________________________________
reshape_4 (Reshape)          (None, 161, 10, 1)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 158, 7, 64)        1088      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 79, 3, 64)         0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 79, 3, 64)         256       
_________________________________________________________________
flatten_3 (Flatten)          (None, 15168)             0         
__________

In [55]:
#in_ = {'vocab': X, 'lang': ln_idx}
history = model.fit(X, ln_idx, epochs = 100, batch_size=128, verbose=1, validation_data = (X_val, ln_idx_val))



Train on 80175 samples, validate on 11759 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100

KeyboardInterrupt: 

In [56]:
len(vocab_)

494