In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer, Tokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row
from pyspark.streaming.kafka import KafkaUtils
import pyspark.sql.functions as f
import json
import re
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
import sys
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import from_json
from pyspark.sql.functions import lit

import pickle
from tensorflow import keras
import tensorflow as tf
import numpy as np
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from nltk.corpus import brown
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
print(tf.__version__)
print(keras.__version__)

2.4.4
2.4.0


In [3]:
csi_pos_neg = spark.read.csv("hdfs:///user/spark/datafile/csiposneg.csv", header=True)
csi_pos_neg = csi_pos_neg.withColumn("label",col("label").cast("integer"))

In [4]:
t1 = csi_pos_neg.collect()[:24939]
t2 = csi_pos_neg.collect()[24939:49878]
t3 = csi_pos_neg.collect()[49878:74817]
t4 = csi_pos_neg.collect()[74817:99756]

t5 = csi_pos_neg.collect()[99756:124695]
t6 = csi_pos_neg.collect()[124695:149634]
t7 = csi_pos_neg.collect()[149634:174573]
t8 = csi_pos_neg.collect()[174573:199512]

t1 = spark.createDataFrame(t1)
t2 = spark.createDataFrame(t2)
t3 = spark.createDataFrame(t3)
t4 = spark.createDataFrame(t4)

t5 = spark.createDataFrame(t5)
t6 = spark.createDataFrame(t6)
t7 = spark.createDataFrame(t7)
t8 = spark.createDataFrame(t8)

first_df = t1.union(t5).union(t2).union(t6) # 99756 
second_df = t3.union(t7).union(t4).union(t8) # 99756 

In [5]:
# 우리가 해야할 일은 dataframe을 순서통일해서 text와 label 리스트로 나누는것
samples = first_df.select("text").rdd.flatMap(lambda x: x).collect()
labels = first_df.select("label").rdd.flatMap(lambda x: x).collect()

In [6]:
# Shuffle the data
seed = 1337 # 난수 생성시 사용되는 시드number
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
train_samples = samples[:-num_validation_samples]
val_samples = samples[-num_validation_samples:]
train_labels = labels[:-num_validation_samples]
val_labels = labels[-num_validation_samples:]

In [7]:
# only consider the top 20,000 words, and will truncate or pad sequences to be actually 100 tokens long

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

In [8]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [9]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [14]:
voc

['',
 '[UNK]',
 'the',
 'to',
 'in',
 'a',
 'and',
 'of',
 'for',
 'on',
 'is',
 'at',
 'you',
 'security',
 'this',
 'i',
 'with',
 'cybersecurity',
 'our',
 'it',
 'new',
 'from',
 'job',
 'my',
 'your',
 'be',
 'hacker',
 'by',
 'are',
 'we',
 'that',
 'just',
 'like',
 'have',
 'an',
 'day',
 'its',
 'us',
 'im',
 'can',
 'today',
 'all',
 'as',
 'out',
 'link',
 'how',
 'click',
 'not',
 'cve',
 'about',
 'vulnerability',
 'was',
 'see',
 'up',
 'more',
 'data',
 'bio',
 'if',
 'hacking',
 'posted',
 'what',
 'photo',
 'now',
 'time',
 'rt',
 'so',
 'will',
 'securityaffairs',
 'amp',
 'one',
 'but',
 'th',
 'has',
 'latest',
 'bitcoin',
 'great',
 'me',
 'or',
 'malware',
 'work',
 'apply',
 'were',
 'dont',
 'cyber',
 'c',
 'via',
 'am',
 'do',
 'infosec',
 'hiring',
 'get',
 'free',
 'love',
 'want',
 'pm',
 'when',
 'some',
 'good',
 'no',
 'been',
 'open',
 'windows',
 'week',
 'happy',
 'people',
 'back',
 'jobs',
 'looking',
 'remote',
 'news',
 'here',
 'could',
 'm',
 'bl

In [10]:
type(voc)

list

In [9]:
word_index

{'': 0,
 '[UNK]': 1,
 'the': 2,
 'to': 3,
 'in': 4,
 'a': 5,
 'and': 6,
 'of': 7,
 'for': 8,
 'on': 9,
 'is': 10,
 'at': 11,
 'you': 12,
 'security': 13,
 'this': 14,
 'i': 15,
 'with': 16,
 'cybersecurity': 17,
 'our': 18,
 'it': 19,
 'new': 20,
 'from': 21,
 'job': 22,
 'my': 23,
 'your': 24,
 'be': 25,
 'hacker': 26,
 'by': 27,
 'are': 28,
 'we': 29,
 'that': 30,
 'just': 31,
 'like': 32,
 'have': 33,
 'an': 34,
 'day': 35,
 'its': 36,
 'us': 37,
 'im': 38,
 'can': 39,
 'today': 40,
 'all': 41,
 'as': 42,
 'out': 43,
 'link': 44,
 'how': 45,
 'click': 46,
 'not': 47,
 'cve': 48,
 'about': 49,
 'vulnerability': 50,
 'was': 51,
 'see': 52,
 'up': 53,
 'more': 54,
 'data': 55,
 'bio': 56,
 'if': 57,
 'hacking': 58,
 'posted': 59,
 'what': 60,
 'photo': 61,
 'now': 62,
 'time': 63,
 'rt': 64,
 'so': 65,
 'will': 66,
 'securityaffairs': 67,
 'amp': 68,
 'one': 69,
 'but': 70,
 'th': 71,
 'has': 72,
 'latest': 73,
 'bitcoin': 74,
 'great': 75,
 'me': 76,
 'or': 77,
 'malware': 78,
 'work'

In [9]:
import csv

def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ',quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed
glove_path = "/root/spark/glove.6B.100d.txt"
pre_glove = glove2dict(glove_path)

In [9]:
# string_input = keras.Input(shape=(1,), dtype="string")
# x = vectorizer(string_input)

In [9]:
import subprocess

subprocess.check_call(["python3", "newGlove.py"]) # should exit with status 0

0

## 따로 스파크의 스레드 생성해서 parallize해서 몇초간격으로 실행되도록 구현해야할듯..

In [None]:
###################################################################################

In [11]:
#pre_glove와 embeddings_index는 같음
sw = list(stop_words.ENGLISH_STOP_WORDS)
brown_data = brown.words()[:200000]
brown_nonstop = [token.lower() for token in brown_data if (token.lower() not in sw)]
oov = [token for token in brown_nonstop if token not in pre_glove.keys()]

In [12]:
# brown_data를 sample dataset으로 사용함 (continuous fine tunning을 위한)
brown_data

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [13]:
oov #이거를 생성할때 일정 frequency 미만인 단어는 지우는 로직을 구현하자

["atlanta's",
 'term-end',
 'presentments',
 'mayor-nominate',
 "georgia's",
 "georgia's",
 "department's",
 "ordinary's",
 "court's",
 'unmeritorious',
 "atlanta's",
 "commissioner's",
 "mayor's",
 "wife's",
 "mayor's",
 "mayor's",
 "governor's",
 "byrd's",
 "caldwell's",
 '$100',
 '$30',
 "georgia's",
 '$3',
 '$4',
 '$50',
 "body's",
 '$10',
 '87-31',
 "georgia's",
 "saturday's",
 "didn't",
 "didn't",
 "wasn't",
 "daniel's",
 "taxpayers'",
 '$451,500',
 '$157,460',
 "year's",
 '$88,000',
 "berry's",
 "we're",
 "i'm",
 'ex-gambler',
 "department's",
 '$1,000',
 '$12',
 "formby's",
 "couldn't",
 "texas'",
 '$5,000,000',
 '$15,000,000',
 "cotten's",
 "bill's",
 "boy's",
 "year's",
 "master's",
 'co-signers',
 "school's",
 "it's",
 "russia's",
 "karns'",
 'subpenas',
 'altho',
 'subpenaed',
 '$37',
 '$37',
 '$4,800',
 '$5,000',
 '$10',
 'hospital-care',
 '$20',
 "taxpayers'",
 "wouldn't",
 "president's",
 "nation's",
 "can't",
 '$1,500',
 '$2,000',
 '$1,000',
 '$1,500',
 "children's",
 '

In [14]:
def get_rareoov(xdict, val):
    return [k for (k,v) in Counter(xdict).items() if v<=val]

In [15]:
oov_rare = get_rareoov(oov, 1) # ferquency 1미만인것 지움
corp_vocab = list(set(oov) - set(oov_rare))

In [16]:
brown_tokens = [token for token in brown_nonstop if token not in oov_rare]
brown_doc = [' '.join(brown_tokens)]

In [17]:
# corp_vocab = list(set(oov))
# brown_doc = [' '.join(brown_nonstop)]

In [18]:
cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X = cv.fit_transform(brown_doc)
Xc = (X.T * X)
Xc.setdiag(0)
coocc_ar = Xc.toarray()

  self._set_arrayXarray(i, j, x)


In [19]:
from mittens import Mittens

In [20]:
mittens_model = Mittens(n=100, max_iter=1000) # n is embedding dimension
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict= pre_glove)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Iteration 1000: loss: 0.0037114229053258896

In [21]:
newglove = dict(zip(corp_vocab, new_embeddings))
f = open("repo_glove.pkl","wb")
pickle.dump(newglove, f)
f.close()

In [None]:
###################################################################################

In [10]:
f = open("repo_glove.pkl","rb")
newglove = pickle.load(f)

In [6]:
len(newglove)

400192

In [12]:
pre_glove.update(newglove)

In [13]:
len(pre_glove)

400432

In [7]:
pre_glove['the']

NameError: name 'pre_glove' is not defined

In [10]:
len(newglove['the'])

100

In [11]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = newglove.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 16465 words (3535 misses)


In [32]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [33]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(2, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 100)         2000200   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, None, 128)         8204

In [20]:
# 노드 하나에서만 동작함
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)
model.fit(x_train, y_train, batch_size=128, epochs=30, validation_data=(x_val, y_val))
# val_acc: 0.9483

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fcd5cd4f6a0>

In [34]:
# 노드 하나에서만 동작함
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)

In [35]:
from tensorflow.keras import models
model.load_weights("/root/spark/model/keras_modell_weights.h5")

In [36]:
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 100)         2000200   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, None, 128)         8204

In [14]:
# 우리가 해야할 일은 dataframe을 순서통일해서 text와 label 리스트로 나누는것
samples = second_df.select("text").rdd.flatMap(lambda x: x).collect()
labels = second_df.select("label").rdd.flatMap(lambda x: x).collect()

# Shuffle the data
seed = 1337 # 난수 생성시 사용되는 시드number
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

x_test = vectorizer(np.array([[s] for s in samples])).numpy()
y_test = np.array(labels)

In [37]:
results = model.evaluate(x_test, y_test, batch_size=128)



In [26]:
x_train[:3]

array([[ 4648,  1908,  4851, 17907,   961,     2, 10233,     7,   299,
            2, 10233,     7,   299,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [25]:
x_test[:3]

array([[ 4370,  3279,     8,    50,   685,   470,  2828,    77,     3,
          257,     6,   143,    27,  1518,     6,    91,    61,    12,
         2045,    77,     3,    91,     4,  3015,  1642,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [None]:
print("test loss, test acc:", results)

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, Flatten, Dense

In [None]:
# model = Sequential()
# model.add(Embedding(num_tokens, embedding_dim, input_length=200))
# model.add(Flatten())
# model.add(Dense(32, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))
# model.summary()

# model.layers[0].set_weights([embedding_matrix])
# model.layers[0].trainable=False

In [None]:
# model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# history = model.fit(x_train, y_train, epochs=2, batch_size=32, validation_data=(x_val, y_val))

In [23]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)

In [24]:
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["this message is about computer graphics and 3D modeling"]]
)

In [25]:
labels[np.argmax(probabilities[0])]

1