In [2]:
import pandas as pd
import numpy as np
import json
import random
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.layers import Dropout
from tensorflow import keras
from keras import preprocessing
from keras import Model
import tensorflow as tf

# Uploading and preprocessing data

In [3]:
#lowercase and removing stopwords
def get_data(filename):
    """Read a text file, lowercase it and drop English stopwords.

    Args:
        filename: Path of the text file to read. The file is decoded with
            the 'unicode_escape' codec.

    Returns:
        One string holding the remaining lowercase tokens joined by single
        spaces (the original whitespace layout is not preserved).
    """
    with open(filename, 'r', encoding='unicode_escape') as f:
        data = f.read()
    data = data.lower()

    # Fetch the stopword corpus only when it is not installed yet, so that
    # repeated calls do not trigger a download attempt each time.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    # A set gives constant-time membership tests; filtering against the plain
    # list would scan all stopwords for every token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in data.split() if w not in stop_words]

    return " ".join(kept)

In [4]:
it_data = get_data('it_oriented.TXT')
non_it_data = get_data('ordinary.TXT')
# Join with a space: plain concatenation would glue the last token of the IT
# corpus to the first token of the ordinary corpus.
all_data = " ".join([it_data, non_it_data])

all_data[:20]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'reason java still re'

In [5]:
#creating word index
def create_index(texts, filename, num_words=50000):
    """Build a word index from ``texts`` and save it as JSON.

    Args:
        texts: The corpus as one whitespace-separated string.
        filename: Path of the JSON file the index is written to
            (overwritten if it exists).
        num_words: Passed to ``Tokenizer(num_words=...)``. Defaults to the
            50000 used so far.
            NOTE(review): per the Keras Tokenizer docs this cap applies when
            texts are converted to sequences; ``word_index`` itself lists
            every distinct token - confirm if the cap is meant to limit the
            saved file.

    Returns:
        None. The index is printed as a count and written to ``filename``.
    """
    words = texts.split()

    # Fit the tokenizer on every word of the corpus; only the resulting
    # word -> integer mapping is needed here.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(words)
    word_index = tokenizer.word_index

    print(f"Found {len(word_index)} unique words")

    with open(filename, 'w') as f:
        json.dump(word_index, f, indent=4)

In [6]:
# Build the word index from the combined corpus and write it to word_index.json.
create_index(all_data, 'word_index.json')

Found 12121 unique words


In [7]:
#grab and return word index
def get_index(filename):
    """Load the JSON document stored in ``filename`` and return it.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    with open(filename, 'r') as index_file:
        raw_json = index_file.read()
    return json.loads(raw_json)

In [8]:
# Load the saved word index back from word_index.json.
word_index = get_index('word_index.json')

In [9]:
#creating sentences
def create_sents(text):
    """Split ``text`` into sentences with NLTK's sentence tokenizer.

    Args:
        text: The text to split.

    Returns:
        The sentences returned by ``nltk.tokenize.sent_tokenize(text)``.
    """
    # Fetch the Punkt resource only when it is not installed yet, so that
    # repeated calls do not trigger a download attempt each time.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    return nltk.tokenize.sent_tokenize(text)

In [10]:
# Split each corpus into sentences separately, keeping the two classes apart.
it_sents = create_sents(it_data)
non_it_sents = create_sents(non_it_data)

# Show the first IT sentence as a quick sanity check.
it_sents[0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'reason java still relevant shoved throats high schoolers college students.'

In [11]:
#padding
def padding_data(sentences, index, maxlen=25):
    """Encode sentences as integer sequences padded to a common length.

    Each sentence is tokenised with ``text_to_word_sequence`` and every token
    is replaced by its id from ``index``. Tokens missing from ``index`` are
    encoded as 0. Sequences are then padded at the end ('post') by
    ``pad_sequences``; sentences longer than ``maxlen`` are truncated using
    its default truncating mode.

    Args:
        sentences: Iterable of sentence strings.
        index: Mapping from word to integer id.
        maxlen: Target length of every encoded sentence. If None, all
            sentences are padded to the longest one in ``sentences``.

    Returns:
        A list with one 1-D integer array per input sentence, in input order
        (an empty list for empty input).
    """
    encoded = []
    for sentence in sentences:
        tokens = text_to_word_sequence(sentence)
        # Only a missing key means "unknown word"; any other error should
        # surface instead of being silently mapped to 0.
        encoded.append([index.get(token, 0) for token in tokens])

    if not encoded:
        return []

    # Pad the whole batch in one call instead of once per sentence.
    padded = preprocessing.sequence.pad_sequences(encoded, maxlen=maxlen, padding='post')
    return list(padded)

In [12]:
# Encode both corpora with the same word index, 25 ids per sentence.
it_padded = padding_data(it_sents, word_index, maxlen=25)
non_it_padded = padding_data(non_it_sents, word_index, maxlen=25)

# Show the first encoded IT sentence.
it_padded[0]

array([ 359,  125,   79, 1165, 6041, 6042,  131, 6043,  849,  278,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int32)

In [13]:
#reverse word index
def reverse_index(word_index):
    """Return a dict that maps every value of ``word_index`` to its key.

    If several keys share a value, the key seen last wins.
    """
    flipped = {}
    for token, token_id in word_index.items():
        flipped[token_id] = token
    return flipped
    
def reconst_text(text, reverse_word_index):
    """Turn a sequence of ids back into a space-separated string.

    Args:
        text: Iterable of ids.
        reverse_word_index: Mapping from id to word.

    Returns:
        The looked-up words joined by single spaces; ids that are not in
        ``reverse_word_index`` are rendered as "?".
    """
    tokens = []
    for token_id in text:
        tokens.append(reverse_word_index.get(token_id, "?"))
    return " ".join(tokens)

In [14]:
# Invert the index (id -> word) so encoded sentences can be read back.
reverse_word_index = reverse_index(word_index)

# Decode the first encoded IT sentence; ids missing from the reversed index
# (such as the 0 padding) are printed as "?".
reconst_text(it_padded[0], reverse_word_index)

'reason java still relevant shoved throats high schoolers college students ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?'

# Labelling Data

In [15]:
def label_data(sentences, label):
    """Pair every item of ``sentences`` with the same ``label``.

    Args:
        sentences: Iterable of encoded sentences.
        label: The label attached to each of them.

    Returns:
        A list of ``(sentence, label)`` tuples in input order.
    """
    return [(encoded, label) for encoded in sentences]

In [16]:
# Class labels: 0 for the non-IT sentences, 1 for the IT sentences.
non_it_labelled = label_data(non_it_padded, 0)
it_labelled = label_data(it_padded, 1)

# Show the first labelled non-IT sample.
non_it_labelled[0]

(array([2779, 1803, 8427, 8428,  464, 5199, 1641,  649, 5200, 5201,  393,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0], dtype=int32), 0)

# Creating Training Data

In [17]:
def create_training(total_chunks, cutoff, seed=None):
    """Shuffle labelled samples and split them into training and testing sets.

    Args:
        total_chunks: Sequence of ``(features, label)`` entries. It is not
            modified; a shuffled copy is split instead.
        cutoff: Fraction of the samples assigned to the training set. The
            training set receives ``int(len(total_chunks) * cutoff)``
            samples (clamped to the valid range); the rest go to testing.
        seed: Optional seed for a reproducible shuffle. When None, the
            module-level ``random`` generator is used.

    Returns:
        A tuple ``(training_data, training_labels, testing_data,
        testing_labels)`` of numpy arrays.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(total_chunks)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    # Slice at the exact boundary: comparing a running counter with
    # `> len * cutoff` would hand one extra sample to the training set.
    total = len(shuffled)
    split_at = max(0, min(total, int(total * cutoff)))
    train_part = shuffled[:split_at]
    test_part = shuffled[split_at:]

    training_data = np.array([entry[0] for entry in train_part])
    training_labels = np.array([entry[1] for entry in train_part])
    testing_data = np.array([entry[0] for entry in test_part])
    testing_labels = np.array([entry[1] for entry in test_part])

    return training_data, training_labels, testing_data, testing_labels

In [18]:
# NOTE(review): this rebinds `all_data`, which held the combined corpus string
# in the earlier cell, to a list of (sequence, label) tuples. Re-running the
# create_index cell after this one would then fail on `texts.split()`;
# consider a separate name for the labelled list.
all_data = it_labelled + non_it_labelled
# Shuffle the labelled samples and split them into train/test with cutoff 0.8.
tt_data = create_training(all_data, cutoff = 0.8)

In [19]:
# Inspect the split: (training_data, training_labels, testing_data, testing_labels).
tt_data

(array([[ 660,  827, 2036, ...,    0,    0,    0],
        [ 126,   69,  195, ...,    0,    0,    0],
        [ 403, 9365, 9366, ...,    0,    0,    0],
        ...,
        [   7,  153,   37, ...,    0,    0,    0],
        [   8,  238,  142, ..., 3548,   80,    0],
        [ 214,   87, 2474, ...,  251,  127, 2980]], dtype=int32),
 array([0, 0, 0, ..., 0, 1, 1]),
 array([[  426,  4010,  2332, ...,  2794,  1783,  1817],
        [  875,  1087,    52, ...,     0,     0,     0],
        [   21,  1935,   808, ...,     0,     0,     0],
        ...,
        [ 3418,    48,  1140, ...,     0,     0,     0],
        [  419,   512,   149, ...,     0,     0,     0],
        [ 2841,  5703, 11345, ...,     0,     0,     0]], dtype=int32),
 array([0, 0, 1, ..., 1, 0, 0]))

In [20]:
#create model
def create_model(vocab_size=20000, embedding_dim=25):
    """Build, summarise and compile the binary sentence classifier.

    The network is an embedding layer, global average pooling over the
    sequence, three dense layers (32 relu, 32 relu, 16 tanh), each followed
    by 10% dropout, and a single sigmoid output unit.

    Args:
        vocab_size: ``input_dim`` of the embedding layer. It must be larger
            than the highest word id fed to the model. Defaults to 20000.
        embedding_dim: Size of each embedding vector. Defaults to 25.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(keras.layers.GlobalAveragePooling1D())
    # All layers come from the same `keras` namespace as the model itself,
    # so layer and model classes always belong to one Keras implementation.
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dropout(0.1))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dropout(0.1))
    model.add(keras.layers.Dense(16, activation='tanh'))
    model.add(keras.layers.Dropout(0.1))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.summary()
    # Binary cross-entropy is the loss that matches a sigmoid probability
    # output with 0/1 labels; hinge-style losses expect scores in [-1, 1].
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

#build the compiled model (this also prints its summary)
model = create_model()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 25)          500000    
                                                                 
 global_average_pooling1d (G  (None, 25)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                832       
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                        

In [21]:
#train model
def train_model(model, tt_data, val_size=.1, epochs=1, batch_size=32):
    """Fit ``model`` on the training split and evaluate it on the test split.

    The first ``int(len(training_data) * val_size)`` training samples are
    held out as validation data; the remaining ones are used for fitting.

    Args:
        model: Object exposing Keras-style ``fit`` and ``evaluate`` methods.
        tt_data: Indexable holding, in order, training data, training
            labels, testing data and testing labels.
        val_size: Fraction of the training samples held out for validation.
        epochs: Number of training epochs passed to ``fit``.
        batch_size: Batch size passed to ``fit``.

    Returns:
        The same ``model`` object after fitting and evaluation.
    """
    training_data, training_labels = tt_data[0], tt_data[1]
    testing_data, testing_labels = tt_data[2], tt_data[3]

    vals = int(len(training_data) * val_size)
    x_val, x_train = training_data[:vals], training_data[vals:]
    y_val, y_train = training_labels[:vals], training_labels[vals:]

    # With no held-out samples (e.g. val_size=0) skip validation instead of
    # handing an empty dataset to fit().
    validation_data = (x_val, y_val) if len(x_val) > 0 else None

    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=validation_data, verbose=1, shuffle=True)
    model.evaluate(testing_data, testing_labels)
    return model
    
#train for 5 epochs with batches of 32, then save the trained model to disk
model = train_model(model, tt_data=tt_data, epochs=5, batch_size=32)
model.save('models/it_or_not.model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: models/it_or_not.model/assets
