In [2]:
from __future__ import absolute_import

import collections
import math
import os
import random
import sqlite3

import numpy as np
import tensorflow as tf
import tqdm

In [3]:
# Open the SQLite database file 'berita.db' (path is relative to the working
# directory) and create a cursor for issuing queries.
# NOTE(review): the connection is never closed in the visible cells.
conn = sqlite3.connect('berita.db')
c    = conn.cursor()

In [4]:
# Load the Text and Url columns of every row in the `berita` table into memory.
# Each element of `q` is one row: index 0 is the text, index 1 is the url.
c.execute("SELECT Text,Url FROM berita")
q = c.fetchall()

In [5]:
def cleanWhitespace(text):
    """Collapse every run of consecutive space characters in `text` to one space.

    Only the space character is affected; tabs and newlines are left alone.
    Returns the cleaned string.
    """
    while True:
        squeezed = text.replace("  "," ")
        # A pass that removes nothing means no double space is left.
        if len(squeezed) == len(text):
            return squeezed
        text = squeezed

In [6]:
# Split the query rows into two parallel lists: the whitespace-normalised
# article text and the article url.
text, url = [], []
for row in q:
    text.append(cleanWhitespace(row[0]))
    url.append(row[1])

In [7]:
def createData(text,topWord,minCount=5):
    """Encode documents as lists of integer word ids.

    Parameters
    ----------
    text : list of str
        Documents; each one is tokenised by splitting on a single space.
    topWord : int
        Vocabulary budget. At most ``topWord - 1`` distinct words get their
        own id; id 0 is reserved for every other ("unknown") token.
    minCount : int, optional
        Words seen fewer than this many times in the whole corpus are treated
        as unknown even if they fit in the budget. Defaults to 5.

    Returns
    -------
    data : list of list of int
        One id list per document. Id ``k >= 1`` refers to ``w[k - 1]``.
    w : list of (str, int)
        Kept vocabulary as (word, corpus count), most frequent first.
    unk : int
        Total number of tokens that were mapped to id 0.
    """
    tokenized = [document.split(" ") for document in text]
    counter = collections.Counter()
    for tokens in tokenized:
        counter.update(tokens)
    # most_common() is sorted by descending count, so filtering on the count
    # is the same as cutting the list at the first too-rare word.
    w = [(word, count) for word, count in counter.most_common(topWord-1)
         if count >= minCount]
    # Ids start at 1 because 0 is the unknown-token id.
    keyMapping = {word: position for position, (word, _) in enumerate(w, 1)}
    data = [[keyMapping.get(token, 0) for token in tokens] for tokens in tokenized]
    unk = sum(ids.count(0) for ids in data)
    return data , w , unk

In [8]:
# Encode the corpus with a vocabulary budget of 50000:
#   hasil - one list of word ids per document (0 = unknown word)
#   keys  - kept vocabulary as (word, count) pairs; the word with id k is keys[k-1]
#   unk   - number of tokens that were mapped to id 0
hasil , keys , unk= createData(text,50000)

In [9]:
# Total number of tokens over all encoded documents.
total = sum(len(tokens) for tokens in hasil)
total

24131637

In [10]:
def discardProbability(p,count):
    """Randomly decide whether a token with corpus frequency `count` is dropped.

    The token is dropped with probability ``1 - sqrt(p / count)``, so tokens
    whose count does not exceed `p` are never dropped. One value is drawn from
    the `random` module per call. Returns True to drop, False to keep.
    """
    keep_root = (p/count)**(0.5)
    return random.random() < 1 - keep_root

In [11]:
def constructData(data,p,keys,unk,space):
    """Build (input id, label id) training pairs from encoded documents.

    Parameters
    ----------
    data : list of list of int
        Encoded documents; id 0 is the unknown token, id k >= 1 is keys[k-1].
    p : number
        Subsampling parameter passed to `discardProbability`.
    keys : list of (str, int)
        Vocabulary with corpus counts, used to look up each id's frequency.
    unk : int
        Corpus count of the unknown token (id 0).
    space : int
        Window length: every kept token is paired with up to ``space - 1``
        kept tokens that follow it.

    Returns
    -------
    numpy.ndarray
        One row ``[input, label]`` per pair. Each pair is emitted in both
        directions. The array is empty when no pair was produced.
    """
    # Plain tuples are collected and converted once at the end; building a
    # separate small ndarray for every pair is far slower and yields the same
    # final array.
    pairs = []
    for sentence in data:
        # Undersampling: frequent tokens are randomly dropped before windowing.
        left = []
        for token in sentence:
            count = unk if token == 0 else keys[token-1][1]
            if not discardProbability(p,count):
                left.append(token)
        for start in range(len(left)):
            window = left[start:start+space]
            center = window[0]
            for other in window[1:]:
                pairs.append((center, other))
                pairs.append((other, center))
    return np.array(pairs)

In [12]:
# Build the training pairs. p=100: tokens whose corpus count is at most 100
# are never dropped by the subsampling. space=5: each kept token is paired
# with up to the 4 kept tokens that follow it.
x = constructData(hasil,100,keys,unk,5)

In [13]:
# Number of (input, label) training pairs that were generated.
len(x)

39048316

In [None]:
# Hyper-parameters of the embedding model.
batch_size = 256
embedding_size = 200
# One id per kept word plus id 0 for the unknown token.
vocabulary_size = len(keys) + 1
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():
    # One input word id and one label word id per training pair in the batch.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

    # The embedding table and the NCE parameters are pinned to the CPU.
    with tf.device('/cpu:0'):
        # Embedding table, initialised uniformly in [-1, 1).
        embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Output-side weights and biases used by the NCE loss.
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Mean noise-contrastive-estimation loss over the batch.
    loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))
    # Plain SGD with a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Divide every embedding row by its L2 norm.
    # NOTE(review): `keep_dims` was renamed `keepdims` in later TF 1.x
    # releases - confirm against the installed TensorFlow version.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm


    init = tf.global_variables_initializer()

In [None]:
class batchInput:
    """Serve fixed-size batches from a shuffled private copy of `data`.

    `data` must support ``.copy()``, ``len()`` and slicing (the notebook
    passes a 2-D numpy array with one training pair per row). The copy is
    reshuffled whenever fewer than `batchSize` unread rows remain, so the
    rows left over at the end of a pass are skipped.
    """
    def __init__(self,data,batchSize):
        # Copy so that shuffling never reorders the caller's data.
        self.data = data.copy()
        self.batchSize = batchSize
        assert self.batchSize * 2 < len(data)
        self.initData()
    def initData(self):
        """Rewind to the start of the data and shuffle the rows in place."""
        self.index = 0
        # random.shuffle must not be used here: on a multi-dimensional numpy
        # array its element swap goes through row views, which overwrites
        # rows with duplicates of other rows. np.random.shuffle permutes the
        # first axis correctly.
        np.random.shuffle(self.data)
    def call(self):
        """Return the next `batchSize` rows, reshuffling first if too few remain."""
        if self.index + self.batchSize > len(self.data):
            self.initData()
        self.index += self.batchSize
        return self.data[self.index-self.batchSize:self.index]

In [None]:
# Number of SGD steps to run.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')
    batch = batchInput(x,batch_size)
    # NOTE(review): average_loss is accumulated below but never read or
    # reported anywhere in this cell.
    average_loss = 0
    # NOTE(review): the progress bar is never closed.
    t = tqdm.tqdm(total=num_steps)
    for step in range(num_steps):
        # n holds batch_size pairs: element 0 is the input id, element 1 the label id.
        n = batch.call()
        X = [I[0] for I in n]
        # The labels are copied into a (batch_size, 1) column to match train_labels.
        y = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        for I in range(len(n)):
            y[I,0] = n[I][1]
        feed_dict = {train_inputs: X, train_labels: y}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        t.update()
    # Pull the trained embeddings out of the session before it closes:
    # final_embeddings has every row divided by its L2 norm, cc is the raw table.
    final_embeddings = normalized_embeddings.eval()
    cc = embeddings.eval()

In [None]:
# Inspect the raw (un-normalised) embedding row for word id 1.
cc[1]

In [None]:
# Length of one normalised embedding row (should equal embedding_size).
len(final_embeddings[0])

In [None]:
def plot_with_labels(low_dim_embs, labels):
    """Draw a labelled scatter plot of 2-D points.

    Row ``i`` of `low_dim_embs` is plotted and annotated with ``labels[i]``.
    There must be at least as many rows as labels. The figure is 18x18 inches
    and is shown immediately. `plt` must already be imported when this runs.
    """
    label_count = len(labels)
    assert low_dim_embs.shape[0] >= label_count, 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for row, caption in zip(range(label_count), labels):
        px, py = low_dim_embs[row, :]
        plt.scatter(px, py)
        # The caption is offset a few points from its marker.
        plt.annotate(caption, xy=(px, py), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.show()

# Project the embeddings of the most frequent words to 2-D with t-SNE and plot them.
try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    # Never ask for more words than the vocabulary holds.
    plot_only = min(200, len(keys))
    # Row 0 of the embedding table is the unknown token and the word keys[i]
    # has id i + 1, so the rows matching keys[0:plot_only] are 1..plot_only.
    low_dim_embs = tsne.fit_transform(cc[1:plot_only + 1, :])
    labels = [keys[i][0] for i in range(plot_only)]
    plot_with_labels(low_dim_embs, labels)
except ImportError:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')

In [None]:
# Number of unknown tokens (id 0) in the first encoded document.
hasil[0].count(0)

In [None]:
def cosine_similarity(v1,v2):
    """Return the cosine of the angle between two equal-length vectors.

    Computes ``dot(v1, v2) / (|v1| * |v2|)``. `v2` is indexed with the
    positions of `v1`, so it must be at least as long as `v1`. When either
    vector has zero norm the cosine is undefined and 0.0 is returned instead
    of raising ZeroDivisionError.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    # The denominator is the square root of the product of the squared norms
    # (it used to be multiplied by 0.5 instead of raised to the power 0.5).
    denominator = math.sqrt(sumxx*sumyy)
    if denominator == 0:
        return 0.0
    return sumxy/denominator

In [None]:
def sigmoid(x):
    """Map each value of `x` to a float grey level in 0..255 via the logistic function.

    Each element ``v`` becomes ``256 // (1 + exp(-v))``, clamped to 255 so the
    result always fits in 8 bits. Returns a new list of floats with the same
    length as `x`.
    """
    levels = []
    for value in x:
        try:
            level = 256 // (1 + math.exp(-value))
        except OverflowError:
            # exp(-value) is too large for a float: the logistic value is 0.
            level = 0.0
        # For large inputs exp(-value) vanishes and the floor division gives
        # 256, one past the largest 8-bit level.
        levels.append(min(level, 255.0))
    return levels

In [None]:
def word2Vec(sentence):
    """Turn a list of word ids into a list of quantised embedding vectors.

    For every id the matching row of the global embedding array `cc` is
    looked up and passed through `sigmoid`.
    """
    vectors = []
    for token in sentence:
        vectors.append(sigmoid(cc[token]))
    return vectors

In [None]:
def convertSentence(sentence, length=200):
    """Convert a list of word ids into exactly `length` quantised embedding vectors.

    Parameters
    ----------
    sentence : list of int
        Encoded document (id 0 = unknown word). The list is not modified.
    length : int, optional
        Number of vectors to return. Defaults to 200.

    Returns
    -------
    list of list of float, or None
        * Longer than `length`: unknown tokens are removed from the front
          until the sentence fits; if it is still too long, the two adjacent
          vectors with the highest cosine similarity are merged (element-wise
          maximum) until `length` vectors remain.
        * Shorter than `length`: randomly chosen known tokens are duplicated
          in place until the sentence has `length` tokens. None is returned
          when there are not enough known tokens to do that.
        * Exactly `length`: converted as is.
    """
    # Work on a copy: remove() and insert() below must not alter the caller's list.
    sentence = list(sentence)
    if len(sentence) > length:
        # Unknown tokens carry the least information, so they are dropped first.
        while len(sentence) > length and 0 in sentence:
            sentence.remove(0)
        array = word2Vec(sentence)
        if len(array) > length:
            # cosList[i] is the similarity between array[i] and array[i + 1].
            cosList = [cosine_similarity(array[I], array[I+1])
                       for I in range(len(array)-1)]
            while len(array) > length:
                index = cosList.index(max(cosList))
                # array[index] and array[index + 1] are the closest pair:
                # merge them by taking the element-wise maximum.
                array[index] = [max(a, b) for a, b in zip(array[index], array[index+1])]
                array.pop(index+1)
                cosList.pop(index)
                # Refresh the similarities on both sides of the merged vector.
                if index < len(cosList):
                    cosList[index] = cosine_similarity(array[index], array[index+1])
                if index > 0:
                    cosList[index-1] = cosine_similarity(array[index], array[index-1])
        return array
    elif len(sentence) < length:
        # Positions of the known (non-zero) tokens are the duplication candidates.
        index = [I for I, token in enumerate(sentence) if token != 0]
        missing = length - len(sentence)
        if missing > len(index):
            # Every position may be duplicated at most once, so there are too
            # few known tokens to pad this sentence.
            return None
        overSampled = sorted(random.sample(index, missing))
        # Insert from the back so earlier positions stay valid.
        for position in reversed(overSampled):
            sentence.insert(position, sentence[position])
        return word2Vec(sentence)
    else:
        return word2Vec(sentence)

In [None]:
from PIL import Image
import numpy as np

In [None]:
# Render every document that convertSentence can handle as a greyscale JPEG
# named after the last path segment of its url.
# The progress bar gets its own name so that `q` keeps holding the query rows.
progress = tqdm.tqdm_notebook(total=len(hasil))
# Make sure the output directory exists before any image is written.
if not os.path.isdir('img'):
    os.makedirs('img')
for doc_index, tokens in enumerate(hasil):
    result = convertSentence(tokens)
    if result is not None:
        # The grey levels are floats; they must be converted to uint8 before
        # building the image, otherwise the float buffer is reinterpreted
        # byte by byte and the picture is garbage.
        pixels = np.clip(np.array(result), 0, 255).astype(np.uint8)
        im = Image.fromarray(pixels)
        # os.path.join keeps the path valid on every platform.
        im.save(os.path.join('img', url[doc_index].split("/")[-1] + ".jpeg"), "JPEG")
    progress.update()
progress.close()

In [None]:
# NOTE(review): leftover exploratory call that prints the tqdm_notebook
# documentation; it has no effect on the results.
help(tqdm.tqdm_notebook)