In [1]:
# Stage the GloVe vectors on the driver's local disk so they can be read with
# plain Python file I/O further down.
# Copy instead of move: `mv` removes the DBFS original, so a second run of this
# cell (or a cluster restart, which wipes the driver disk) would leave the
# embeddings file nowhere to be found.
dbutils.fs.cp("dbfs:/FileStore/tables/glove_6B_100d-15091.txt","file:/databricks/driver/glove.6B.100d.txt")

In [2]:
###################################### Prepare data for training Model #####################################

In [3]:
import pyspark
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text as txt
from numpy import asarray,array
from numpy import zeros

# ---------------------------------------------------------------------------
# Data preparation
#   1. load the labelled reviews and keep only the text and the label
#   2. stratified 70/30 train/test split (split per label, then union)
#   3. tokenize + pad the text with ONE tokenizer fitted on the training set
#   4. build the GloVe weight matrix for the training vocabulary
# Produces: x, y, x_test, y_test, vocab_size, max_length, embedding_matrix
# ---------------------------------------------------------------------------
import io  # local import: io.open accepts `encoding` on both Python 2 and 3

split_seed = 42      # fixed seed so the train/test split is reproducible
embedding_dim = 100  # width of the vectors in glove.6B.100d.txt

Lreviewdata_full = spark.read.parquet("/mnt/etl/parquet/labeled/Lreviewdata")
Lreviewdata = Lreviewdata_full.select("Text","label")
# createOrReplaceTempView is the non-deprecated replacement for registerTempTable
Lreviewdata.createOrReplaceTempView("DF")
Lreviewdata.cache()


# Training and testing split, done per label so both sets keep the class ratio
train_0, test_0 = Lreviewdata.filter('label = 0.0').randomSplit([0.7, 0.3], seed=split_seed)
train_1, test_1 = Lreviewdata.filter('label = 1.0').randomSplit([0.7, 0.3], seed=split_seed)
test = test_0.union(test_1)
train = train_0.union(train_1)
# Repartition the DataFrames directly; going through .rdd and createDataFrame
# only adds a schema-inference job on top of the same shuffle.
traindataset = train.repartition(8)
testdataset = test.repartition(8)

sqlContext.clearCache()
traindataset.cache()
testdataset.cache()


# Pull text and label to the driver in ONE action per dataset. Collecting the
# two columns with separate actions gives no guarantee that the rows come back
# in the same order, which would silently misalign features and labels.
train_rows = traindataset.select('Text', 'label').collect()
test_rows = testdataset.select('Text', 'label').collect()
l = [row[0] for row in train_rows]
m = [row[0] for row in test_rows]

# A single tokenizer, fitted on the training texts only. The test texts must be
# encoded with the same word -> id mapping, because that mapping is what the
# embedding matrix below (and therefore every model) is indexed by. Fitting a
# second tokenizer on the test set would assign unrelated ids to the same words.
tk = txt.Tokenizer(split= " ")
tk.fit_on_texts(l)
tk_test = tk   # kept as an alias so existing references to tk_test still resolve

#create train data set, both features and labels
x = tk.texts_to_sequences(l)      # converts the text to numbers
y = array([row[1] for row in train_rows])
#create test data set, both features and labels
x_test = tk.texts_to_sequences(m)      # words unseen during training are dropped
y_test = array([row[1] for row in test_rows])

# assign attributes to variables for consistency
max_features = 20000   # NOTE(review): not referenced by any visible cell
max_length = 100  # every sequence is padded/truncated to this many tokens (arbitrary)

# pad the train sequences so all arrays of same length
x = sequence.pad_sequences(x, maxlen = max_length, padding = 'post')
# pad the test sequences so all arrays of same length
x_test = sequence.pad_sequences(x_test, maxlen = max_length, padding = 'post')

# +1 because Tokenizer ids start at 1; row 0 is the padding id
vocab_size = len(tk.word_index) + 1

# load the whole embedding into memory
embeddings_index = dict()
# GloVe files are UTF-8; state it explicitly instead of relying on the locale,
# and let the context manager close the file even if parsing fails.
with io.open('/databricks/driver/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs;
# words without a GloVe vector keep an all-zero row
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, i in tk.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector



In [4]:
######################################  Training with Dense layers ######################################

In [5]:
############################# Build Dense deep layer model ##############################

from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

# Baseline classifier: frozen GloVe embedding -> flatten -> one sigmoid unit.
# The embedding width and sequence length are taken from the prepared data
# (embedding_matrix / max_length) instead of repeating the literal 100, so the
# model cannot drift out of sync with the preprocessing cell.
densemodel = Sequential()
e = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
              input_length=max_length, trainable=False)
densemodel.add(e)
densemodel.add(Flatten())
densemodel.add(Dense(1, activation='sigmoid'))
# compile the model
densemodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() only adds a stray "None" line
densemodel.summary()
# fit the model (the held-out test set doubles as validation data here)
densemodel.fit(x, y, validation_data=(x_test, y_test),epochs=2, verbose=0)

# Final evaluation of the model: scores = [loss, accuracy]
scores = densemodel.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [6]:
# Make sure the DBFS folder that receives the exported models exists.
trained_models_dir = "/mnt/TrainedModels/"
dbutils.fs.mkdirs(trained_models_dir)

In [7]:
# Export the dense model: Keras writes the HDF5 file to the driver's local
# disk first, then the file is moved onto the DBFS mount.
dense_h5_on_driver = "/databricks/driver/Fakereviews_Deeplearning_dense_model.h5"
dense_h5_driver_uri = "file:///databricks/driver/Fakereviews_Deeplearning_dense_model.h5"
dense_h5_on_dbfs = "dbfs:/mnt/TrainedModels/Fakereviews_Deeplearning_dense_model.h5"

densemodel.save(dense_h5_on_driver)
dbutils.fs.mv(dense_h5_driver_uri, dense_h5_on_dbfs)

In [8]:
###################################### Training with CNN model ######################################

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

# 1-D convolutional classifier on top of the frozen GloVe embedding.
# Keyword names follow the Keras 2 API (filters / kernel_size / padding /
# pool_size / epochs), matching the other model cells in this notebook; the
# Keras 1 spellings (nb_filter, filter_length, border_mode, pool_length,
# nb_epoch) are deprecated aliases.
CNNmodel = Sequential()
e = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
              input_length=max_length, trainable=False)
CNNmodel.add(e)
CNNmodel.add(Convolution1D(filters=32, kernel_size=10, padding='same', activation='relu'))
CNNmodel.add(MaxPooling1D(pool_size=4))         # pooling (max) after convoluting
CNNmodel.add(Flatten())
CNNmodel.add(Dense(500, activation='relu'))     # hidden layer
CNNmodel.add(Dense(1, activation='sigmoid'))    # binary output
CNNmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so no print() around it
CNNmodel.summary()

# Fit the model.
# NOTE: batch_size=20 is kept from the original run (it is not a power of two).
CNNmodel.fit(x, y, validation_data=(x_test, y_test), epochs=3, batch_size=20, verbose=1)
# Final evaluation of the model: scores = [loss, accuracy]
scores = CNNmodel.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [10]:
# Export the CNN model: Keras writes the HDF5 file to the driver's local disk
# first, then the file is moved onto the DBFS mount.
cnn_h5_on_driver = "/databricks/driver/Fakereviews_Deeplearning_CNN_model.h5"
cnn_h5_driver_uri = "file:///databricks/driver/Fakereviews_Deeplearning_CNN_model.h5"
cnn_h5_on_dbfs = "dbfs:/mnt/TrainedModels/Fakereviews_Deeplearning_CNN_model.h5"

CNNmodel.save(cnn_h5_on_driver)
dbutils.fs.mv(cnn_h5_driver_uri, cnn_h5_on_dbfs)

In [11]:
###################################### Training with RNN LSTM Model ######################################

In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,LSTM, Dense, Dropout, Activation
from keras.layers import Flatten
from keras.layers import Embedding

# Recurrent classifier: frozen GloVe embedding -> LSTM -> dense -> sigmoid.
# Embedding width and sequence length come from the prepared data rather than
# a repeated literal 100.
LSTMmodel = Sequential()
e = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
              input_length=max_length, trainable=False)
LSTMmodel.add(e)
LSTMmodel.add(LSTM(160, dropout=0.15, recurrent_dropout=0.15))
LSTMmodel.add(Dense(units=130, kernel_initializer='uniform', activation='relu'))
LSTMmodel.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
LSTMmodel.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so no print() around it
LSTMmodel.summary()

# Fit the model; `epochs` is the Keras 2 name (as used for the dense model),
# `nb_epoch` is the deprecated Keras 1 alias.
# NOTE: batch_size=50 is kept from the original run (it is not a power of two).
LSTMmodel.fit(x, y, validation_data=(x_test, y_test), epochs=3, batch_size=50, verbose=1)
# Final evaluation of the model: scores = [loss, accuracy]
scores = LSTMmodel.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [13]:
# Re-display the accuracy held in `scores` (index 1 of the evaluate() result),
# expressed as a percentage.
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [14]:
# Export the LSTM model: Keras writes the HDF5 file to the driver's local disk
# first, then the file is moved onto the DBFS mount.
lstm_h5_on_driver = "/databricks/driver/Fakereviews_Deeplearning_LSTM_model.h5"
lstm_h5_driver_uri = "file:///databricks/driver/Fakereviews_Deeplearning_LSTM_model.h5"
lstm_h5_on_dbfs = "dbfs:/mnt/TrainedModels/Fakereviews_Deeplearning_LSTM_model.h5"

LSTMmodel.save(lstm_h5_on_driver)
dbutils.fs.mv(lstm_h5_driver_uri, lstm_h5_on_dbfs)

In [15]:
%fs ls /mnt/TrainedModels/

path,name,size
dbfs:/mnt/TrainedModels/Fakereviews_Deeplearning_CNN_model.h5,Fakereviews_Deeplearning_CNN_model.h5,74379536
dbfs:/mnt/TrainedModels/Fakereviews_Deeplearning_LSTM_model.h5,Fakereviews_Deeplearning_LSTM_model.h5,70679552
dbfs:/mnt/TrainedModels/Fakereviews_Deeplearning_dense_model.h5,Fakereviews_Deeplearning_dense_model.h5,69292568
