In [1]:
import pandas as pd
data = pd.read_csv('data_2008_2019.csv', error_bad_lines=False);
data_text = data[['Description of incident']]
data_text['index'] = data_text.index
documents = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


In [2]:
print(len(documents))
print(documents[:5])

5459
                             Description of incident  index
0  A reporter requested statistics from the State...      0
1  Residents who applied for the Massachusetts Co...      1
2  A class action suit was filed against Netflix,...      2
3  A briefcase full of sensitive personnel record...      3
4  Users of the do-it-yourself trading site colle...      4


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Priyanka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
doc_sample = documents[documents['index'] == 1000].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['A', 'court', 'clerk', 'took', 'court', 'documents', 'home', 'in', 'an', 'attempt', 'to', 'hide', 'the', 'fact', 'that', 'she', 'had', 'not', 'done', 'the', 'work.', '\xa0The', 'employee', 'was', 'fired', 'and', 'could', 'be', 'charged', 'with', 'tampering', 'with', 'public', 'records.', '\xa0No', 'malicious', 'intent', 'is', 'suspected.', '\xa0It', 'is', 'believed', 'that', 'the', 'employee', 'hid', 'years', 'of', 'backlogged', 'records', 'and', 'eventually', 'took', 'them', 'home', 'to', 'continue', 'concealing', 'them.']


 tokenized and lemmatized document: 
['court', 'clerk', 'take', 'court', 'document', 'home', 'attempt', 'hide', 'fact', 'work', 'employe', 'fire', 'charg', 'tamper', 'public', 'record', 'malici', 'intent', 'suspect', 'believ', 'employe', 'year', 'backlog', 'record', 'eventu', 'take', 'home', 'continu', 'conceal']


In [5]:
processed_docs = documents['Description of incident'].map(preprocess)
processed_docs[:10]

0    [report, request, statist, state, depart, publ...
1    [resid, appli, massachusett, commonwealth, sol...
2    [class, action, suit, file, netflix, unit, sta...
3    [briefcas, sensit, personnel, record, steal, v...
4    [user, trade, site, collect, receiv, urgent, m...
5    [patient, notifi, laptop, steal, protect, heal...
6    [lynn, coupl, accus, sell, ident, worker, loga...
7     [bank, account, inform, custom, incorrect, mail]
8    [person, document, palisad, mall, west, nyack,...
9    [file, name, social, secur, number, address, p...
Name: Description of incident, dtype: object

In [6]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 address
1 awar
2 certif
3 depart
4 email
5 identifi
6 inform
7 issu
8 locat
9 marijuana
10 medic


In [7]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [8]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[200]

[(0, 1),
 (9, 1),
 (24, 1),
 (81, 1),
 (86, 1),
 (110, 1),
 (114, 1),
 (166, 1),
 (428, 1),
 (853, 1)]

In [9]:
# LDA Topic models as a classification vector
import warnings
import logging # This allows for seeing if the model converges. A log file is created.
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=bow_corpus,
                           num_topics=20,
                           id2word=dictionary,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
    lda_train.save('lda_train.model')

In [12]:
#Convert topics to feature vector
# Import label encoder 
from sklearn import preprocessing 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(documents['Description of incident']).toarray()
features.shape

(5459, 10139)

In [13]:
train_vecs = []
for i in range(len(documents)):
    top_topics = lda_train.get_document_topics(bow_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(20)]
    topic_vec.extend([len(data.iloc[i]['Description of incident'])]) # length of description
    train_vecs.append(topic_vec)

In [14]:
import numpy as np
from scipy import sparse
sparse_lda =sparse.csr_matrix(train_vecs)

sparse_lda.shape

(5459, 21)

In [15]:
combine_vecs = sparse.hstack((features, sparse_lda))
combine_vecs.shape

(5459, 10160)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
X = combine_vecs
y = data['Type of breach']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(dual=False, max_iter=10000),
    MultinomialNB(),
    LogisticRegression(n_jobs=-1, C=1e5, max_iter=10000)
]
entries = []
targets = data['Type of breach'].unique()
for model in models:
  model_name = model.__class__.__name__
  model.fit(X_train, y_train)
  y_predict = model.predict(X_test)
  accuracy = accuracy_score(y_predict, y_test)
  entries.append((model_name, accuracy))
train_test_split_df = pd.DataFrame(entries, columns=['model_name', 'accuracy'])
print(train_test_split_df.head())

               model_name  accuracy
0  RandomForestClassifier  0.436508
1               LinearSVC  0.793040
2           MultinomialNB  0.514652
3      LogisticRegression  0.788767


In [37]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
entries = []
CV = 10
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=CV, n_jobs=-1 )
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC                 0.797010
LogisticRegression        0.779791
MultinomialNB             0.538004
RandomForestClassifier    0.450818
Name: accuracy, dtype: float64

In [38]:
# Store the best model

import pickle

#
# Create your model here (same as above)
#

# Save to file in the current working directory
pkl_filename = "best_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(models[1], file)

In [39]:
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
pickle_model

LinearSVC(dual=False, max_iter=10000)

In [47]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(documents['Description of incident'])
predict = pickle_model.predict(count_vect.transform(["The bank account information of 2,499 customers was incorrectly mailed."]))
print(predict)

ValueError: X has 15119 features per sample; expecting 10160