In [None]:
import wikipedia
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 

### Getting text from any language from wikipedia

In [None]:
new_wiki_text = []
wikipedia.set_lang('tr')
for i in range(0, 5):
    print(i)
    random = wikipedia.random(1)
       
    try:
        new_wiki_text.append([wikipedia.page(random).summary])
    except wikipedia.exceptions.DisambiguationError as e:
        random = wikipedia.random(1)

In [None]:
new_wiki_text

### LabelEncoding 

In [None]:
Y = train_df['language'] # target column
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)
Y = tf.keras.utils.to_categorical(Y, num_classes=4) #equals to the number of languages

In [None]:
train_df['sentence_lower'] = train_df["sentence"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_lower'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")

In [None]:
max_features = 5000 #we set maximum number of words to 5000
maxlen = 400        #we set maximum sequence length to 400

### Tokenization

In [None]:
tokens = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step

In [None]:
tokens.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text ## FOR ONLY TRAINING DATA  

In [None]:
print(len(tokens.word_index))
vocab_size = len(tokens.word_index) + 1  # +1 is for padding, 0 is assigned for padding
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

In [None]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split #divide into train and test set
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [None]:
embedding_dim = 50 #this is the final dimension of the embedding space.

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras.optimizers import SGD

### Simple Model Architecture

In [None]:
### Longer definition

model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=embedding_dim,#embedding output
                           input_length=maxlen), #maximum length of an input sequence
  tf.keras.layers.Flatten(), #flatten layer

  tf.keras.layers.Dense(4, activation=tf.nn.softmax) #ouput layer a Dense layer with 4 probabilities
  #we also define our final activation function which is the softmax function typical for multiclass
  #classifiction problems

])

In [None]:
### Shorter definition

model = Sequential()
model.add(Embedding(input_dim=vocab_size,      #embedding input
                    output_dim=embedding_dim,  #embedding output
                    input_length=maxlen),      #maximum length of an input sequence
model.add(Flatten())
model.add(Dense(4, activation=tf.nn.softmax))

In [None]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy', #we recommend this loss function you
              metrics=['accuracy'])

In [None]:
model.summary() #here we show the architecture 

In [None]:
model.fit(np.array(X_train), np.array(y_train), epochs=3) #let's fit the model

### Evaluation

In [None]:
model.evaluate(np.array(X_test), np.array(y_test)) 

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix #we import this package from sklearn and output it
predictions = model.predict(X_test) #here we make predictions
cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1))#we generate the confusion matrix

### Testing with Brand New Data

In [None]:
new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de packt"]

In [None]:
test_text = tok.texts_to_sequences(new_text) #this is how we create sequences
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen) #let's execute pad step

In [None]:
np.set_printoptions(suppress=True)
predictions = model.predict(test_text)
print(predictions.argmax())
print(predictions) #spanish you can get confused with italian which makes sense since they are more similar languages

## TOKENIZATION TRICK

For this you add a OOV (out-of-vocabulary) token in your vocabulary. This will be used whenever you don’t find a word in your vocabulary. 

In [None]:
import keras.preprocessing.text as kpt

In [19]:
num_words = 3
tk = kpt.Tokenizer(oov_token='UNK', num_words=num_words)
texts = ["my name is far faraway asdasd", "my name is", "your name is"]
tk.fit_on_texts(texts)
print(tk.word_index)
print(tk.texts_to_sequences(texts))
tk.word_index = {key:value for key, value in tk.word_index.items() if value <= num_words} # <= because tokenizer is 1 indexed
tk.word_index[tk.oov_token] = num_words + 1
print(tk.word_index)
print(tk.texts_to_sequences(texts))

{'UNK': 1, 'name': 2, 'is': 3, 'my': 4, 'far': 5, 'faraway': 6, 'asdasd': 7, 'your': 8}
[[1, 2, 1, 1, 1, 1], [1, 2, 1], [1, 2, 1]]
{'UNK': 4, 'name': 2, 'is': 3}
[[4, 2, 4, 4, 4, 4], [4, 2, 4], [4, 2, 4]]


In [20]:
tk.word_index['UNK']

4

#{'your': 7, 'my': 3, 'name': 1, 'far': 4, 'faraway': 5, 'is': 2, 'UNK': 8, 'asdasd': 6} 
#[[3, 1, 2], [3, 1, 2], [1, 2]]  ## Wrong Behavior. Should not drop OOVs

#{'name': 1, 'my': 3, 'is': 2, 'UNK': 4}
#[[3, 1, 2, 4, 4, 4], [3, 1, 2], [4, 1, 2]] ## Correct behavior

### Burada bir problem var, normalde oov_token'in son kelime olarak dictionary'e eklenmesi lazim

### Ama benim kodda ilk kelime olarak ekleniyor ve aslinda benim sozlugumdeki kelimelerin indexi 2'den basliyor.