In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy import sparse
import pickle
import nltk
from gensim import models
from nltk.stem import WordNetLemmatizer, SnowballStemmer

In [9]:
def lemmatize_stemming(text):
    """Return the Snowball stem of the verb lemma of a single token.

    ``text`` is lemmatized as a verb (``pos='v'``) with NLTK's
    ``WordNetLemmatizer`` and the lemma is then stemmed with the English
    ``SnowballStemmer``.
    """
    english_stemmer = SnowballStemmer('english')
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return english_stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens that survive filtering.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than three characters and is not listed in gensim's
    ``STOPWORDS``; every kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if not (len(word) <= 3 or word in stopwords)
    ]

In [10]:
# calculate bow corpus for a text
def bowcorpus(textArray, dictionary=None):
    """Convert raw texts into bag-of-words vectors.

    Parameters
    ----------
    textArray : iterable of str
        Raw documents; each one is tokenized with ``preprocess``.
    dictionary : object with a ``doc2bow`` method, optional
        Token-to-id mapping to encode the documents with. Pass the dictionary
        a model was trained with so that the produced token ids line up with
        that model's vocabulary. When omitted, a new
        ``gensim.corpora.Dictionary`` is built from ``textArray`` and pruned
        with ``filter_extremes`` (the original behaviour).

    Returns
    -------
    list
        One ``dictionary.doc2bow(...)`` result per input document.
    """
    preprocesstext = [preprocess(text) for text in textArray]
    if dictionary is None:
        # NOTE: a dictionary built here assigns ids based only on textArray,
        # and no_below=15 discards every token that occurs in fewer than 15
        # documents -- for small inputs that leaves an empty vocabulary.
        # Supply `dictionary` when encoding data for an already-trained model.
        dictionary = gensim.corpora.Dictionary(preprocesstext)
        dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    return [dictionary.doc2bow(doc) for doc in preprocesstext]

In [11]:
def ldaVecs(docs, corpus, ldaModel, num_topics=20):
    """Build one dense topic-probability vector per document.

    Parameters
    ----------
    docs : sized collection
        The documents; only ``len(docs)`` is used, to decide how many entries
        of ``corpus`` are processed.
    corpus : sequence
        Bag-of-words vectors, indexed in parallel with ``docs``.
    ldaModel : object
        Must provide ``get_document_topics(bow, minimum_probability=...)``
        returning an iterable of ``(topic_id, probability)`` pairs.
    num_topics : int, optional
        Length of every returned vector (default 20, the value that used to be
        hard-coded).

    Returns
    -------
    list of list
        ``result[d][t]`` is the probability of topic ``t`` for document ``d``.
        Topics missing from the model's answer are reported as ``0.0``.
    """
    train_vecs = []
    for doc_index in range(len(docs)):
        top_topics = ldaModel.get_document_topics(
            corpus[doc_index], minimum_probability=0.0
        )
        # Place each probability by its topic id instead of by its position in
        # the returned list: gensim applies a small probability floor even when
        # minimum_probability=0.0, so a topic can be absent, which would shift
        # every later value into the wrong column (or raise IndexError).
        topic_vec = [0.0] * num_topics
        for topic_id, probability in top_topics:
            if 0 <= topic_id < num_topics:
                topic_vec[topic_id] = probability
        train_vecs.append(topic_vec)
    return train_vecs

In [12]:
def getVectors(testData):
    """Build the combined TF-IDF + LDA feature matrix for ``testData``.

    Parameters
    ----------
    testData : list of str
        Raw documents to vectorize.

    Returns
    -------
    scipy sparse matrix
        The TF-IDF matrix with the LDA topic vectors stacked to its right;
        one row per document.

    Notes
    -----
    Reads ``saved_models/tfidf.pickle`` and ``saved_models/lda_train.model``.
    Pickle files can execute arbitrary code when loaded, so these artifacts
    must come from a trusted source.
    """
    # `with` closes the handle; the bare open() used before leaked it.
    with open("saved_models/tfidf.pickle", "rb") as tfidf_file:
        tfidfVectorizer = pickle.load(tfidf_file)
    tfidfVectors = tfidfVectorizer.transform(testData)

    # get lda train model
    lda_train = models.LdaModel.load('saved_models/lda_train.model')

    # Encode the documents with the vocabulary stored in the trained model.
    # Building a new dictionary from testData (what bowcorpus does) yields
    # token ids unrelated to the model's, and its no_below=15 pruning empties
    # the vocabulary for small inputs, so the topic vectors carried no signal.
    # Assumes the model was trained on tokens produced by `preprocess` -- TODO
    # confirm against the training notebook.
    train_dictionary = getattr(lda_train, 'id2word', None)
    if hasattr(train_dictionary, 'doc2bow'):
        corpus = [train_dictionary.doc2bow(preprocess(text)) for text in testData]
    else:
        # No usable dictionary stored with the model: keep the old behaviour.
        corpus = bowcorpus(testData)

    ldaVectors = ldaVecs(testData, corpus, lda_train)
    sparse_lda = sparse.csr_matrix(ldaVectors)
    return sparse.hstack((tfidfVectors, sparse_lda))

In [13]:
def predict(testData):
    """Classify raw documents with the saved model.

    Seeds NumPy's global random generator with 2018, requests the NLTK
    ``wordnet`` package via ``nltk.download``, unpickles the model stored at
    ``saved_models/best_model.pkl``, vectorizes ``testData`` with
    ``getVectors`` and returns whatever the model's ``predict`` returns.
    """
    np.random.seed(2018)
    nltk.download('wordnet')

    # Load the fitted model from disk.
    model_path = "saved_models/best_model.pkl"
    with open(model_path, 'rb') as model_file:
        classifier = pickle.load(model_file)

    features = getVectors(testData)
    return classifier.predict(features)

In [14]:
# Smoke test: classify a single sample incident description and print the result.
sample_reports = [
    "An office theft of an unencrypted laptop on or around December 15 resulted in the exposure of confidential personal information. Â The laptop contained an Excel spreadsheet with workers' compensation information such as names, Social Security numbers, telephone numbers, and other workers' compensation claim or injury information."
]
prediction = predict(sample_reports)
print(prediction)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Priyanka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['PORT']
