# Creating own DAN 

In [1]:
import os
import tensorflow as tf
# from keras.layers import Layer
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from typing import *
import pickle
import random
from nltk.tokenize import sent_tokenize

## Reading Data

In [2]:
data_dir = "../datasets/cleaned_datasets/filtered_dataset"

In [3]:
def read_data(_dir:str) -> pd.DataFrame:
    data = {}
    #data["url"] = []
    data["text"] = []
    data["class"] = []
    for root, dirs, files in os.walk(_dir):
        for _dir in dirs: 
            for txt_file in [x for x in os.listdir(os.path.join(root, _dir)) if x.endswith((".txt", ".TXT"))]:
                # Class name = dir name
                class_name = _dir
                #Read File
                file_name = os.path.abspath(os.path.join(root, _dir, txt_file))
                file = open(file_name, "r")
                txt = file.read()
                file.close()
                #data["url"].append(file_name)
                data["text"].append(txt)
                data["class"].append(class_name)
    df = pd.DataFrame.from_dict(data)
    del data
    return df

In [4]:
df = read_data(data_dir).sample(frac = 1)

In [5]:
df.head()

Unnamed: 0,text,class
537,a framework of multiclass sentiment classifica...,prediction
3345,the kmedoids clustering is very similar to kme...,clustering
7419,for example positive association rule high fre...,pattern_mining
9687,effective malicious sequential pattern mining ...,pattern_mining
1146,due to the large number of possible label sets...,prediction


In [6]:
# pd.get_dummies(df["class"])

## Creating One-Hot-Encodings

In [7]:
from sklearn import preprocessing

# Transform classes into dummies
le = preprocessing.LabelEncoder()

classes = df.drop(["text"], axis = 1)
classes.apply(le.fit_transform)

# Create One Hot Encodings
enc = preprocessing.OneHotEncoder()
enc.fit(classes)


one_hot_encodings = enc.transform(classes)

In [8]:
one_hot_encodings.shape

(11874, 3)

In [9]:
df.shape

(11874, 2)

In [10]:
one_hot_encodings.toarray()[0]

array([0., 0., 1.])

## Embeddings

In [11]:
class FastText():
    """
    Loads the FastText model and get the Vectors.
    """

    def __init__(self, **kwargs):
        # super().__init__(**kwargs)
        self.path = "/Users/Daniel/PycharmProjects/Recommender-System/notebooks/FastText/ft_model_15000.pkl"
        self.__initialize_model()

    def __initialize_model(self, **kwargs):
        try:
            tf.logging.info("FastText Model is loading")
            self.model = pickle.load(open(self.path, "rb"))
            tf.logging.info("FastText Model loaded!")
        except Exception as e:
            tf.logging.warning("Something went wrong while loading the FastText Model..")
            tf.logging.warning(e)

    def inference(self, words: List[str]) -> np.ndarray:
        embeddings = []
        for word in words:
            if self.model.wv.__contains__(word):
                embeddings.append(self.model.wv.__getitem__(word))
        return np.array(embeddings)


In [12]:
ft_path = "/Users/Daniel/PycharmProjects/Recommender-System/notebooks/FastText/ft_model_15000.pkl"
fast_text_model = FastText()

INFO:tensorflow:FastText Model is loading
INFO:tensorflow:FastText Model loaded!


## Creating Layer
We need: 
    1. Input Layer with Dropout
    2. Average Layer
    3. Dense Layer
    4. Dense Layer (Classifier)
    5. Output Softmax Layer

In [13]:
paragraphs = [["this", "sucks"], ["hello", "my", "name", "is", "daniel"],["clustering", "is", "amazing"]]
paragraphs_y = np.asarray([[0,1,0], [0,0,1], [1,0,0]])

In [14]:
input_layer = fast_text_model.inference(paragraphs[1])

In [15]:
input_layer.shape

(5, 100)

In [16]:
paragraphs_y.shape

(3, 3)

### Placeholder

In [17]:
x_input = tf.placeholder(dtype = tf.float64, shape = (None, 100), name = "placeholder_input")

In [18]:
y_true = tf.placeholder(dtype = tf.float64, shape = (None, 3), name = "placeholder_y_true")

### Costumized Layers

In [19]:
def dropout_layer(input_layer:np.ndarray ,dropout_prob:float = 0.2):
    #dist = tf.contrib.distributions.Binomial(1, self.drop)
    dist = 1 - np.random.binomial(1, dropout_prob, input_layer.shape[0])
    return input_layer[dist.astype(bool)]

In [20]:
dl = dropout_layer(input_layer)

In [21]:
def average_layer(dropout_layer:np.ndarray, placeholder: tf.placeholder,name= "average_layer"):
    with tf.name_scope(name):
        average = tf.reduce_mean(input_tensor=placeholder, axis=0)
        return tf.reshape(average, (1,-1), name = "average_layer")

In [22]:
average = average_layer(dl, x_input)

### Creating the Dense Layers for DAN

In [23]:
dense_layer_1 = tf.layers.dense(inputs=average,
                                      units=100,
                                      activation=tf.nn.tanh,
                                      use_bias=True,
                                      trainable=True,
                                      name="dense_layer_1")

In [24]:
dense_layer_2 = tf.layers.dense(inputs=dense_layer_1,
                                      units=100,
                                      activation=tf.nn.tanh,
                                      use_bias=True,
                                      trainable=True,
                                      name="dense_layer_2")

### Creating the Dense Layers for Clasification

In [25]:
dense_layer_3 = tf.layers.dense(inputs=dense_layer_2,
                                      units=200,
                                      activation=tf.nn.tanh,
                                      use_bias=True,
                                      trainable=True,
                                      name="dense_layer_3")

In [26]:
logits = tf.layers.dense(inputs=dense_layer_3,
                                      units=3,
                                      activation=tf.nn.tanh,
                                      use_bias=True,
                                      trainable=True,
                                      name="logits)

### Softmax Layer & Loss Function
https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2

In [27]:
with tf.name_scope("loss"):
    softmax_layer = tf.nn.softmax_cross_entropy_with_logits_v2(labels = y_true, logits=logits) # WTF Is this function actually? 
    cross_entropy = tf.reduce_mean(softmax_layer)
    tf.summary.scalar("cross_entropy", cross_entropy)

### Optimizer

In [28]:
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
    train = optimizer.minimize(cross_entropy)
    

### Metrics - Accuracy

In [29]:
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y_true))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("accuracy", accuracy)

## Saver

In [30]:
saver = tf.train.Saver()

In [31]:
save_path = "dan_checkpoints/" # Need to exist 

In [32]:
model_name = "first_dan.ckpt"

In [33]:
# Check the Graph
if not os.path.isdir(save_path):
    os.makedirs(save_path)

## Training

In [34]:
init = tf.global_variables_initializer()

In [44]:
with tf.Session() as sess: 
    sess.run(init)
    
    ### Tensorboard Stuff
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(save_path)
    writer.add_graph(sess.graph)

    epoches = 10
    for i in range(epoches):
        for j,p in enumerate(paragraphs): 
            # Converting paragraphs to Embeddings
            embeddings = dropout_layer(fast_text_model.inference(p))
            label = np.array(paragraphs_y[j]).reshape(1,-1)
            [_, train_accuracy, summary] = sess.run([train, accuracy, merged_summary], feed_dict={x_input:embeddings, y_true:label})
            writer.add_summary(summary = summary, global_step = i)
            writer.flush()
            #[train_accurcy, summary] = sess.run([accuracy, merged_summary], feed_dict={x_input:embeddings, y_true:label})
        print("Accuracy after Epoch: {}".format(train_accuracy))
    
    saver.save(sess, os.path.join(save_path, model_name))
    print("model_saved!")
    sess.close()

Accuracy after Epoch: 0.0
Accuracy after Epoch: 0.0
Accuracy after Epoch: 1.0
Accuracy after Epoch: 0.0
Accuracy after Epoch: 0.0
Accuracy after Epoch: 1.0
Accuracy after Epoch: 0.0
Accuracy after Epoch: 1.0
Accuracy after Epoch: 0.0
Accuracy after Epoch: 1.0
model_saved!


In [None]:
w

## Loading the Model

In [48]:
with tf.Session() as sess: 
    saver.restore(sess,os.path.join(save_path, model_name))
    
    # Is it predicting right? 
    embeddings = fast_text_model.inference(["this", "sucks"])
    print(sess.run(logits, feed_dict={x_input:embeddings}))
    
    #Getting new Embeddings
    print(sess.run(dense_layer_2, feed_dict={x_input:embeddings}))

INFO:tensorflow:Restoring parameters from dan_checkpoints/first_dan.ckpt
[[-0.16240382  0.39182154  0.06377319]]
[[ 0.03230377 -0.17810996  0.03260929 -0.14536984  0.29796564 -0.46223411
  -0.41599438  0.11585891  0.28831632  0.06492229  0.46638464  0.18014322
   0.12224294 -0.19163228  0.03007289  0.33980093  0.00281868  0.0332146
  -0.19013504 -0.16582564 -0.59048649 -0.14795333 -0.17244184  0.26569011
  -0.34479357 -0.1828127  -0.03142293 -0.20770107 -0.44474537 -0.00919966
   0.27196745 -0.07764005 -0.32889859  0.28723646  0.18341187  0.10273447
  -0.31061691 -0.02057754  0.10167686 -0.20434808  0.24811438 -0.25169096
  -0.06163547  0.00944988  0.3205502  -0.26943998 -0.41279809 -0.12671689
  -0.0460205   0.04608156  0.18463844  0.33069187 -0.19325399  0.26622016
  -0.37414399 -0.05859147 -0.28456428 -0.1796232  -0.44216661 -0.22569869
  -0.62489675  0.04729576  0.07861766  0.42960647  0.21874051  0.27542228
  -0.08337146  0.09598617 -0.2307338  -0.4421464  -0.20442049  0.25155391
