In [1224]:
import pandas as pd
import numpy as np
import json
import random
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.layers import Dropout
from tensorflow import keras
from keras import preprocessing

# Uploading and preprocessing data

In [1225]:
#lowercase and remove stopwords
def get_data(filename):
    """Read a text file and return it lowercased with English stopwords removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A single string: the remaining words joined by single spaces
        (all original whitespace, including newlines, is collapsed).
    """
    # NOTE(review): 'unicode_escape' also interprets backslash escapes found
    # in the file -- confirm this is intended rather than a plain text codec.
    with open(filename, 'r', encoding='unicode_escape') as f:
        text = f.read().lower()

    # Makes sure the stopword corpus is available before it is read.
    nltk.download('stopwords')
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per word of the corpus.
    stop_words = set(stopwords.words('english'))

    return " ".join(w for w in text.split() if w not in stop_words)

In [1226]:
# Load both corpora (lowercased, stopwords removed).
it_data = get_data('it_oriented.TXT')
non_it_data = get_data('ordinary.TXT')
# get_data returns words joined by single spaces with no trailing space, so a
# separator is needed here; otherwise the last IT word and the first non-IT
# word would be fused into one bogus token.
all_data = it_data + " " + non_it_data

all_data[:20]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'reason java still re'

In [1227]:
#creating word index
def create_index(texts, filename):
    """Build a word -> integer index from a corpus and save it as JSON.

    Args:
        texts: The whole corpus as one whitespace-separated string.
        filename: Path of the JSON file to write the index to.

    Side effects:
        Prints the number of unique words found and (over)writes `filename`.
    """
    words = texts.split()

    #the tokenizer is configured for up to 50 000 words
    tokenizer = Tokenizer(num_words=50000)

    #we will be tokenizing all of the words from the text files
    tokenizer.fit_on_texts(words)
    word_index = tokenizer.word_index

    print(f"Found {len(word_index)} unique words")

    with open(filename, 'w') as f:
        json.dump(word_index, f, indent=4)

In [1228]:
# Build the vocabulary from the combined corpus and write it to word_index.json.
create_index(all_data, 'word_index.json')

Found 12121 unique words


In [1229]:
#load a previously saved word index
def get_index(filename):
    """Return the parsed contents of the JSON file at `filename`."""
    with open(filename, 'r') as handle:
        return json.load(handle)

In [1230]:
# Reload the saved vocabulary (word -> integer id) from disk.
word_index = get_index('word_index.json')

In [1231]:
#split text into sentences
def create_sents(text):
    """Return the sentences of `text` as produced by NLTK's sentence tokenizer.

    The 'punkt' resource is requested via nltk.download on every call before
    tokenizing.
    """
    nltk.download('punkt')
    return nltk.tokenize.sent_tokenize(text)

In [1232]:
# Split each preprocessed corpus into a list of sentences.
it_sents = create_sents(it_data)
non_it_sents = create_sents(non_it_data)

# Preview the first IT sentence.
it_sents[0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'reason java still relevant shoved throats high schoolers college students.'

In [1233]:
#padding
def padding_data(sentences, index, maxlen=25):
    """Encode sentences as fixed-length integer sequences.

    Each sentence is split into words with `text_to_word_sequence`, every word
    is replaced by its id from `index`, and the result is padded at the end
    ('post') to `maxlen` entries by `pad_sequences`. Sentences longer than
    `maxlen` are cut down by `pad_sequences` using its default truncation mode.

    Args:
        sentences: Iterable of sentence strings.
        index: Mapping from word to integer id.
        maxlen: Length of every returned sequence.

    Returns:
        A list with one 1-D integer array of length `maxlen` per sentence.
        Words missing from `index` are encoded as 0, so they are
        indistinguishable from the 0 used as padding.
    """
    # Look words up with an explicit default instead of a bare `except:`,
    # which would also hide unrelated errors.
    encoded = [
        [index.get(word, 0) for word in text_to_word_sequence(sentence)]
        for sentence in sentences
    ]
    if not encoded:
        return []

    # Pad the whole batch in one call rather than once per sentence.
    padded = preprocessing.sequence.pad_sequences(encoded, maxlen=maxlen, padding='post')
    return list(padded)

In [1234]:
# Encode every sentence as a length-25 sequence of word ids.
it_padded = padding_data(it_sents, word_index, maxlen=25)
non_it_padded = padding_data(non_it_sents, word_index, maxlen=25)

# Preview the first encoded IT sentence.
it_padded[0]

array([ 359,  125,   79, 1165, 6041, 6042,  131, 6043,  849,  278,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int32)

In [1235]:
#reverse word index
#def reverse_index(word_index):
 #   reverse_word_index = {value: key for (key, value) in word_index.items()}
  #  return(reverse_word_index)
    
#def reconst_text(text, reverse_word_index):
 #   return(" ".join([reverse_word_index.get(i, "?") for i in text]))

In [1236]:
#reverse_word_index = reverse_index(word_index)

#reconst_text(it_padded[0], reverse_word_index)

# Labelling Data

In [1237]:
def label_data(sentences, label):
    """Pair every item of `sentences` with the same `label`.

    Returns a list of `(sentence, label)` tuples in the original order.
    """
    return [(item, label) for item in sentences]

In [1238]:
# Attach class labels: 0 for non-IT sentences, 1 for IT sentences.
non_it_labelled = label_data(non_it_padded, 0)
it_labelled = label_data(it_padded, 1)

# Preview one labelled example: (encoded sentence, label).
non_it_labelled[0]

(array([2779, 1803, 8427, 8428,  464, 5199, 1641,  649, 5200, 5201,  393,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0], dtype=int32), 0)

# Creating Training Data

In [1239]:
def create_training(total_chunks, cutoff):
    """Shuffle labelled examples and split them into training and testing sets.

    Args:
        total_chunks: Sequence of `(data, label)` pairs.
        cutoff: Fraction of the examples (0..1) that goes into the training
            set; the remainder becomes the testing set.

    Returns:
        A tuple `(training_data, training_labels, testing_data,
        testing_labels)` of numpy arrays. Exactly `int(len(total_chunks) *
        cutoff)` examples are used for training.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(total_chunks)
    random.shuffle(shuffled)

    # Slice at the cutoff index; the previous `x > test_num` comparison put
    # one example too many into the training set.
    split = int(len(shuffled) * cutoff)
    train_part = shuffled[:split]
    test_part = shuffled[split:]

    #we are converting all that into numpy arrays
    training_data = np.array([entry[0] for entry in train_part])
    training_labels = np.array([entry[1] for entry in train_part])
    testing_data = np.array([entry[0] for entry in test_part])
    testing_labels = np.array([entry[1] for entry in test_part])

    return(training_data, training_labels, testing_data, testing_labels)

In [1240]:
# Merge both labelled classes and split them into train/test parts.
# NOTE(review): `all_data` previously held the raw text corpus and is rebound
# here to the list of labelled examples.
# NOTE(review): create_training shuffles with `random` and no seed is set in
# the visible cells, so the split differs between runs.
all_data = it_labelled + non_it_labelled
tt_data = create_training(all_data, cutoff = 0.8)

In [1241]:
# Inspect the split: (training_data, training_labels, testing_data, testing_labels).
tt_data

(array([[ 325,  276,  140, ...,    0,    0,    0],
        [  27,    9,  888, ...,    0,    0,    0],
        [ 845,  636,  170, ...,    0,    0,    0],
        ...,
        [   7,   51, 7208, ...,    0,    0,    0],
        [ 159, 5142,   30, ...,    0,    0,    0],
        [ 115,   20,   14, ...,    0,    0,    0]], dtype=int32),
 array([1, 0, 0, ..., 1, 1, 1]),
 array([[ 635,   64,  756, ...,    0,    0,    0],
        [3493, 2064, 2202, ...,    0,    0,    0],
        [  76,   85,   58, ...,    0,    0,    0],
        ...,
        [  18,   43,   70, ...,    0,    0,    0],
        [ 576, 1574,    0, ...,    0,    0,    0],
        [ 131,  404, 5740, ...,  925,  834, 1190]], dtype=int32),
 array([0, 0, 1, ..., 1, 0, 0]))

In [1242]:
#create model
def create_model(vocab_size=20000, embedding_dim=25, dropout_rate=0.0, loss='squared_hinge'):
    """Build and compile the sentence classifier.

    Architecture: Embedding -> GlobalAveragePooling1D -> Dense(32, relu) ->
    Dense(32, relu) -> Dense(16, relu) -> Dense(1, sigmoid). The layer summary
    is printed before the model is compiled with the Adam optimizer and an
    accuracy metric.

    Args:
        vocab_size: Input dimension of the embedding; every word id fed to the
            model must be smaller than this value.
        embedding_dim: Size of each embedding vector.
        dropout_rate: When greater than 0, a Dropout layer with this rate is
            inserted after each hidden Dense layer. The default adds none.
        loss: Loss passed to `model.compile`.

    Returns:
        The compiled Keras model.
    """
    # NOTE(review): 'squared_hinge' on a sigmoid output is an unusual pairing;
    # 'binary_crossentropy' is the conventional loss for a single sigmoid unit.
    # The default is kept so existing results do not change silently.
    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(keras.layers.GlobalAveragePooling1D())
    for units in (32, 32, 16):
        model.add(keras.layers.Dense(units, activation='relu'))
        if dropout_rate > 0:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.summary()
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    return model

#build the (still untrained) classifier; create_model also prints the layer summary
model = create_model()

Model: "sequential_61"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_61 (Embedding)    (None, None, 25)          500000    
                                                                 
 global_average_pooling1d_61  (None, 25)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_238 (Dense)           (None, 32)                832       
                                                                 
 dense_239 (Dense)           (None, 32)                1056      
                                                                 
 dense_240 (Dense)           (None, 16)                528       
                                                                 
 dense_241 (Dense)           (None, 1)                 17        
                                                     

In [1243]:
#train model
def train_model(model, tt_data, val_size=.1, epochs=1, batch_size=32):
    """Fit `model` on the training split and evaluate it on the testing split.

    Args:
        model: Object exposing Keras-style `fit` and `evaluate` methods.
        tt_data: Tuple `(training_data, training_labels, testing_data,
            testing_labels)`.
        val_size: Fraction of the training examples held out (taken from the
            front of the training arrays) as validation data during fitting.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used by `fit`.

    Returns:
        The same `model` object after fitting, so that
        `model = train_model(model, ...)` keeps a usable model instead of
        rebinding the name to None.
    """
    training_data, training_labels, testing_data, testing_labels = tt_data[:4]

    # The first `n_val` training examples become the validation set.
    n_val = int(len(training_data) * val_size)
    x_val, x_train = training_data[:n_val], training_data[n_val:]
    y_val, y_train = training_labels[:n_val], training_labels[n_val:]

    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_val, y_val), verbose=1, shuffle=True)
    # Evaluate on the held-out test split.
    model.evaluate(testing_data, testing_labels)

    return model
    
#train for 7 epochs with mini-batches of 32
#NOTE(review): `model` is rebound to whatever train_model returns -- confirm it is the trained model before using it further
model = train_model(model, tt_data=tt_data, epochs=7, batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
