In [17]:

import tensorflow as tf
import numpy as np
from keras.preprocessing.text import Tokenizer

import nltk
from nltk.corpus import brown

from keras.models import Sequential 
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten , Reshape
from keras.metrics import categorical_accuracy



nltk.download('brown')
data = brown.sents(categories=brown.categories())


print(len(data))

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
punctuation = ['!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','{','|','}','~','``',"''",'--']

sentences=[]

for sentence in data:
    for word in stopwords:
        token=" "+word+" "
        sentence=[item.replace(token," ") for item in sentence]
    sentences.append(sentence)

for i in range(len(sentences)):
    sentences[i]=[item.lower() for item in sentences[i]]
    for pun in punctuation:
        sentences[i]=[item.replace(pun,"") for item in sentences[i]]
        sentences[i]=[item for item in sentences[i] if item]
print(len(sentences))
vocab_size=10000
vector_dim=64
maxlen=20
window_size=3


tokenizer = Tokenizer(num_words = vocab_size,oov_token ="<oov>")
tokenizer.fit_on_texts(sentences)


word_index = tokenizer.word_index
for word , index in word_index.items():
  if index>10000:
    word_index[word]=1
    
  
    


x_train=[]
y_train=[]

for sentence in sentences:
  sent_len=len(sentence)
  for i , context in enumerate(sentence):
    
    for j in range(i - window_size, i + window_size + 1):
      if j != i and j <= sent_len-1 and j >= 0:
          x_train.append(word_index[sentence[i]])
          y_train.append(word_index[sentence[j]])   
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

print(x_train.shape)
print(y_train.shape)
print(x_train[:15])
print(y_train[:15])
model=Sequential()
model.add(Embedding(vocab_size + 1, vector_dim, input_length=1))
model.add(Flatten())
model.add(Dense(vector_dim,activation = 'relu'))
model.add(Dense(vocab_size+1, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()


num_epochs = 2


model.fit(x_train , y_train , epochs=num_epochs ,batch_size = 10000, verbose=1)






[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
57340
57340
(5402772,)
(5402772,)
[   2    2    2 5437 5437 5437 5437  643  643  643  643  643 2297 2297
 2297]
[5437  643 2297    2  643 2297 1634    2 5437 2297 1634   55    2 5437
  643]
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1, 64)             640064    
_________________________________________________________________
flatten_7 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_12 (Dense)             (None, 10001)             650065    
Total params: 1,294,289
Trainable params: 1,294,289
Non-trainable params: 0
_______

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7f0141249be0>

In [18]:

e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10001, 64)


In [0]:

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])


In [0]:

import io

out_v = io.open('gajab.tsv', 'w', encoding='utf-8')
out_m = io.open('lajabab.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size+1):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [0]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('gajab.tsv')
  files.download('lajabab.tsv')