In [18]:
print("Num GPUs Available: ", len(tensorflow.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [1]:
import tensorflow 
import bz2
import numpy as np
import re
from tensorflow.keras.preprocessing import text, sequence

In [15]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp37-cp37m-win_amd64.whl (6.8 MB)
Note: you may need to restart the kernel to use updated packages.
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-0.23.2 threadpoolctl-2.1.0


In [2]:
from gensim.models import Word2Vec
# importing all necessary modules 
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gmsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('test.ft.txt.bz2')

In [4]:
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [5]:
len(train_texts)

3600000

In [10]:
type(val_texts)

list

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

In [28]:
type(val_texts)

numpy.ndarray

In [11]:
max_features = 20000
maxlen = 100
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)
tokenized_train = tokenizer.texts_to_sequences(train_texts)
tokenized_val = tokenizer.texts_to_sequences(val_texts)
tokenized_test = tokenizer.texts_to_sequences(test_texts)

In [12]:
del train_texts, val_texts, test_texts

In [13]:

X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
X_val = sequence.pad_sequences(tokenized_val, maxlen=maxlen)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [14]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

#GloVes Load
EMBEDDING_DIM = 200
Glove = 'glove.6B.200d.txt'

glove2word2vec(glove_input_file=Glove, word2vec_output_file="gensim_glove_vectors.txt")
embeddings = gensim.models.KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(20000, EMBEDDING_DIM)) # +1 is because the matrix indices start with 0

for word, i in tokenizer.word_index.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector
    if embeddings_matrix.shape[0] == 20000:
        break 
        
del embeddings

In [15]:
embeddings_matrix.shape

(20000, 200)

In [16]:
from tensorflow.keras import models, layers, optimizers

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [20]:
def build_model():
    sequences = layers.Input(shape=(maxlen,))
    embedded = layers.Embedding(input_dim =embeddings_matrix.shape[0],output_dim = embeddings_matrix.shape[1],weights=[embeddings_matrix],trainable=False, input_length = maxlen)(sequences)
    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Flatten()(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    opt = optimizers.Adam(lr=0.0005)
    model.compile(
        optimizer=opt,
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model
    
model = build_model()

In [21]:
history = model.fit(
    X_train, 
    train_labels, 
    batch_size=1024,
    epochs=10,
     validation_data=(X_val, val_labels) )

Train on 2880000 samples, validate on 720000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
model.save('glove_model_200_0005.h5')

In [24]:
import pandas as pd
embeddings_matrix = pd.read_csv('skipgram_embeddings_matrix.csv')

In [25]:
model_skipgram = build_model()

In [26]:
history_skipgram = model_skipgram.fit(
    X_train, 
    train_labels, 
    batch_size=1024,
    epochs=5,
    validation_data=(X_val, val_labels))

Train on 2880000 samples, validate on 720000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
model_skipgram.save('skipgram_model_0005.h5')

In [29]:
results_skipgram = model_skipgram.evaluate(X_test, test_labels, batch_size=128)
print("results for skipgram ")
print("test loss, test acc:", results_skipgram)

results for skipgram 
test loss, test acc: [0.19700973466038704, 0.9223]
