In [1]:
import pandas as pd
import numpy as np
import json
import random
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.layers import Dropout
from tensorflow import keras
from keras import preprocessing
from keras import Model
import tensorflow as tf

# Functions for preprocessing and turning into arrays

In [2]:
#creating sentences
def create_sents(text):
    """Split raw text into sentences with NLTK's sentence tokenizer.

    Args:
        text: The text to segment.

    Returns:
        The list of sentence strings produced by
        ``nltk.tokenize.sent_tokenize``.
    """
    # Only invoke the downloader when the Punkt resource is missing locally.
    # Calling nltk.download() on every invocation is slow and floods the
    # notebook output with "[nltk_data] ..." lines.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    return nltk.tokenize.sent_tokenize(text)

#padding
def padding_data(sentences, index, maxlen=25):
    """Encode sentences as fixed-length sequences of word ids.

    Each sentence is tokenized with ``text_to_word_sequence``, every token is
    looked up in ``index``, and the resulting id lists are padded/truncated to
    ``maxlen`` with ``pad_sequences(..., padding='post')``.

    Args:
        sentences: Iterable of sentence strings.
        index: Dict mapping a token to its id (e.g. the loaded ``word_index``).
        maxlen: Length of every returned sequence.

    Returns:
        A list with one padded 1-D array of length ``maxlen`` per sentence
        (an empty list when ``sentences`` is empty).
    """
    encoded = [
        # Tokens missing from the index are encoded as 0 (unknown word).
        # An explicit .get() replaces the former bare ``except:``, which
        # silently swallowed every kind of error, not just a missing key.
        [index.get(token, 0) for token in text_to_word_sequence(sentence)]
        for sentence in sentences
    ]
    if not encoded:
        return []
    # Pad the whole batch in one call instead of once per sentence.
    padded = preprocessing.sequence.pad_sequences(encoded, maxlen=maxlen, padding='post')
    # Keep the original return shape: a list of per-sentence arrays.
    return list(padded)

#downloading stopwords
# The corpus must be fetched before it is read, so the order of these two
# statements matters. `stop_words` is the module-level English stop-word
# collection that predict_string filters tokens against.
nltk.download('stopwords')
stop_words = stopwords.words('english')

#returning word index
def get_index(filename): #for opening that json file
    """Load a JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (for ``word_index.json`` this is the
        object passed to ``padding_data`` as the token index).
    """
    # Read as UTF-8 explicitly: without an encoding argument open() falls back
    # to the platform default, which can mis-decode non-ASCII keys.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
# Loaded once at module level; predict_string and predict_file pass this to
# padding_data as the token index.
word_index = get_index('word_index.json')

#opening and processing a file
def get_data(filename):
    """Read a text file, lowercase it and strip English stop words.

    Args:
        filename: Path of the text file to read.

    Returns:
        A single string with the remaining whitespace-separated tokens joined
        by one space each.
    """
    # NOTE(review): 'unicode_escape' is kept from the original; besides
    # decoding bytes it also interprets backslash escape sequences found in
    # the file -- confirm this is intended rather than e.g. 'utf-8'.
    with open(filename, 'r', encoding='unicode_escape') as f:
        text = f.read().lower()

    # Download the corpus only when it is actually missing instead of on every
    # call, and use a set so each membership test is O(1).
    try:
        english_stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        english_stop_words = set(stopwords.words('english'))

    return " ".join(w for w in text.split() if w not in english_stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Test model on a string

In [9]:
#loading the saved model from disk
# `model` is a module-level global: predict_string and predict_file both call
# model.predict on it.
model = keras.models.load_model('models/it_or_not.model')

#function for preparing a sentence or a text
def predict_string(string, maxlen=50):
    """Score a raw string with the loaded model and print the predictions.

    The string is lowercased, stripped of stop words, split into sentences,
    encoded with ``padding_data`` and passed to ``model.predict``.

    Args:
        string: The sentence or text to classify.
        maxlen: Sequence length used when padding each sentence.
            NOTE(review): predict_file pads to 25 while this function has
            always used 50 -- confirm which length the model expects.
    """
    # Lowercase BEFORE filtering, as get_data does; filtering on the original
    # casing lets capitalized stop words such as "The" slip through.
    tokens = [w for w in string.lower().split() if w not in stop_words]
    test_sents = create_sents(' '.join(tokens))
    test_padded = padding_data(test_sents, word_index, maxlen=maxlen)
    test_array = np.array(test_padded)

    prediction = model.predict(test_array)
    print(prediction)

# Try the classifier on an everyday, non-technical sentence.
t_sent1 = 'Peppa Pig has received her excellency in her palace'
predict_string(t_sent1)


[[0.02006012]]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Try the classifier on a clearly IT-related sentence.
t_sent2 = 'Machine learning (ML) is the study of computer algorithms that can improve automatically through experience and by the use of data.'
predict_string(t_sent2)

[[0.9926444]]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Test model on a file

In [5]:
def predict_file(filename, maxlen=25, verbose=True):
    """Classify a text file from the mean of its per-sentence model scores.

    The file is preprocessed with ``get_data``, split into sentences, encoded
    with ``padding_data`` and scored with ``model.predict``. The mean score is
    printed together with a verdict: IT-oriented when the mean is >= 0.5,
    ordinary otherwise.

    Args:
        filename: Path of the text file to classify.
        maxlen: Sequence length used when padding each sentence.
        verbose: When True (default), also print the sentence list and every
            per-sentence prediction.

    Returns:
        The mean score rounded to 7 decimal places, or ``None`` when the file
        yields no sentences.
    """
    text = get_data(filename)
    test_sents = create_sents(text)
    if verbose:
        print(test_sents)
    if not test_sents:
        # Nothing to score: skip the model call and the division below.
        print('AVERAGE: n/a; no sentences found in', filename)
        return None

    test_padded = padding_data(test_sents, word_index, maxlen=maxlen)
    test_array = np.array(test_padded)

    prediction = model.predict(test_array)

    total = 0
    for pred in prediction:
        if verbose:
            print(pred)
        # .item() extracts the single score from the one-element row; calling
        # float() directly on a 1-D array is deprecated in recent NumPy.
        total += float(np.asarray(pred).item())

    average = round(total / len(prediction), 7)
    str_average = str(average)
    if average >= 0.5:
        print('AVERAGE:', str_average+';', 'the text is IT-oriented')
    else:
        print('AVERAGE:', str_average+';', 'the text is ordinary (not IT-oriented)')
    return average

# Classify the sample document; predict_file prints the verdict itself, and
# `prediction` receives whatever it returns.
prediction = predict_file('data/testing.TXT')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['machine learning (ml) study computer algorithms improve automatically experience use data.', '[1] seen part artificial intelligence.', 'machine learning algorithms build model based sample data, known training data, order make predictions decisions without explicitly programmed so.', '[2] machine learning algorithms used wide variety applications, medicine, email filtering, speech recognition, computer vision, difficult unfeasible develop conventional algorithms perform needed tasks.', '[3] subset machine learning closely related computational statistics, focuses making predictions using computers; machine learning statistical learning.', 'study mathematical optimization delivers methods, theory application domains field machine learning.', 'data mining related field study, focusing exploratory data analysis unsupervised learning.', '[5][6] implementations machine learning use data neural networks way mimics working biological brain.', '[7][8] application across business problems, ma

In [None]:
# Interactive check: classify a sentence typed by the user.
# predict_string is the string-scoring helper defined above; the notebook
# defines no function named predict_sent.
t_sent3 = input('Enter a sentence: ')
predict_string(t_sent3)