As we have seen from the data manipulation and exploration, the non-alphabetical bigram are not very powerful discriminators. The following modelling process should exclude the non-alphabetical bigram.

In [8]:
import os
import pandas as pd
os.chdir('/home/qc/Downloads/blueoptima/blueoptima_data/')
sample_data = pd.read_csv('sample_data.csv')

In [9]:
sample_data.groupby('label')['label'].count()

label
.cs      5000
.java    5000
.py      5000
cpp      5000
Name: label, dtype: int64

In [18]:
def ngram_process(data, ngram = 1, filter_status = 'all'):
    import re
#     from nltk.util import ngrams
    gram = []
    for rows in data:
        if filter_status == 'alpha_only':
            tokens = [token for token in rows.split(" ") if (token != "" and token.isalpha())]
        elif filter_status == 'non_alpha':
            tokens = [token for token in rows.split(" ") if (token != "" and not token.isalpha())]
        else:
            tokens = [token for token in rows.split(" ") if token != ""]
        n_grams = zip(*[tokens[i:] for i in range(ngram)])
#         n_grams = list(ngrams(tokens, ngram))
        
#         n_grams = [re.sub("[(),']", "", str(t)) for t in n_grams]
        n_grams = [re.sub("['',]", "", str(t)[2: -2]) for t in n_grams]
        gram.append(n_grams) 
    return gram

In [16]:
def user_fit(docs):
    import numpy as np
    from scipy.sparse import csr_matrix 
    indptr = [0]
    indices = []
    data = []
    vocabulary = {}

    for d in docs:
        for term in d:
            index = vocabulary.setdefault(term, len(vocabulary))
            indices.append(index)
            data.append(1)
        indptr.append(len(indices))
    return csr_matrix((data, indices, indptr), dtype = np.int64)

In [4]:
from sklearn.model_selection import train_test_split
X = sample_data['text'] 
y = sample_data['.java']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

In [8]:
#non-alphabetics unigram
unigram_vectorizer = user_fit(ngram_process(X, filter_status = 'non_alpha'))

user_count_train = unigram_vectorizer[train_index, :]
user_count_test = unigram_vectorizer[test_index, :]

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(user_count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(user_count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9603030303030303
[[4722  220]
 [  42 1616]]


[0.8801742919389978, 0.9746682750301568]

In [9]:
# non-alphabetics cv
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.8883388338833883, 0.9664871334530222]

In [10]:
# alpha-only unigram
unigram_vectorizer = user_fit(ngram_process(X, filter_status = 'alpha_only'))

user_count_train = unigram_vectorizer[train_index, :]
user_count_test = unigram_vectorizer[test_index, :]

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

nb_classifier.fit(user_count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(user_count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9778787878787879
[[4820  122]
 [  24 1634]]


[0.9305239179954442, 0.985524728588661]

In [11]:
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.9374288964732651, 0.9862357869539198]

In [12]:
# all unigrams (non-alphabetical and alpha-only)
unigram_vectorizer = user_fit(ngram_process(X))

user_count_train = unigram_vectorizer[train_index, :]
user_count_test = unigram_vectorizer[test_index, :]

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(user_count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(user_count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9831818181818182
[[4863   79]
 [  32 1626]]


[0.9536656891495601, 0.9806996381182147]

In [13]:
# cross validation
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.952576112412178, 0.9736684619988031]

In [19]:
# alpha-only unigram and alpha-only bigram
import scipy
unigram_vectorizer = user_fit(ngram_process(X, ngram = 1, filter_status = 'alpha_only'))
bigram_vectorizer = user_fit(ngram_process(X, ngram = 2, filter_status = 'alpha_only'))
user_count = scipy.sparse.hstack((unigram_vectorizer, bigram_vectorizer)).tocsr()

user_count_train = user_count[train_index,:]
user_count_test = user_count[test_index, :]

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score


# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(user_count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(user_count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# # Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9966239710698345
[[72009   247]
 [   20  6811]]


[0.9650042504958912, 0.9970721709852145]

In [20]:
# alpha-only unigram and bigram cv
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.9715539201811492, 0.9966608594657376]

In [14]:
# all unigrams and alpha-only bigram
import scipy
unigram_vectorizer = user_fit(ngram_process(X))
bigram_vectorizer = user_fit(ngram_process(X, ngram = 2, filter_status = 'alpha_only'))
user_count = scipy.sparse.hstack((unigram_vectorizer, bigram_vectorizer)).tocsr()

user_count_train = user_count[train_index,:]
user_count_test = user_count[test_index, :]

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score


# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(user_count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(user_count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# # Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9856060606060606
[[4864   78]
 [  17 1641]]


[0.9546247818499127, 0.9897466827503015]

In [20]:
# alpha-only unigram and bigram cv
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.9715539201811492, 0.9966608594657376]

In [18]:
# alpha-only using sklearn CountVectorizer
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9956060606060606
[[4931   11]
 [  18 1640]]


[0.9933373712901272, 0.9891435464414958]

In [19]:
# alpha-only cv
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, user_count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.9563445867287543, 0.983243566726511]

In [5]:
# alpha-only and alpha-only bigram
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

count_vectorizer = CountVectorizer(analyzer= 'word', ngram_range=[1, 2])
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9972727272727273
[[4940    2]
 [  16 1642]]


[0.9987834549878345, 0.9903498190591074]

In [6]:
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.95276266819353, 0.9958108916816277]

In [1]:
# Using full (truncated) data set now.
import os
import pandas as pd
os.chdir('/home/qc/Downloads/blueoptima/blueoptima_data/')
final_data = pd.read_csv('final_trunc_data.csv')

In [10]:
final_data.groupby('label')['label'].count()

label
.cs       86995
.java    119902
.py       20607
cpp       12153
Name: label, dtype: int64

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['.java']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9975596494999178
[[39545    52]
 [  141 39349]]


[0.9986802365422197, 0.9964294758166624]

In [3]:
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(nb_classifier, count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.998032501089596, 0.9967044719693579]

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a random forest classifier: rf_classifier
rf_classifier = RandomForestClassifier(random_state=53)
rf_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = rf_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]



0.998862012720169
[[39591     6]
 [   84 39406]]


[0.9998477621029128, 0.9978728792099265]

In [5]:
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(rf_classifier, count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

[0.9999377102279806, 0.9981719146396061]

In [None]:
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a support vector machine classifier: svm_classifier
svm_classifier = SVC(random_state=53, gamma='scale')
svm_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = svm_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.993083566199249
[[39518    79]
 [  468 39022]]


[0.9979795913148002, 0.9881488984553052]

In [None]:
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(svm_classifier, count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a logistic classifier: lr_classifier
lr_classifier = LogisticRegression(random_state=53)
lr_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = lr_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]



0.9989505228419335
[[39554    43]
 [   40 39450]]


[0.9989111994530676, 0.9989870853380602]

In [None]:
from sklearn.model_selection import cross_val_predict
cross_pred =cross_val_predict(lr_classifier, count_train, y_train, cv = 10)
[precision_score(y_train, cross_pred), recall_score(y_train, cross_pred)]



[0.999377776671603, 0.9986942247425757]

Out of pure interest, multinomial naive bayesian is also used to predict other coding languages

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['.cs']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9978504684714302
[[50215   134]
 [   36 28702]]


[0.9953530309335553, 0.9987473032222145]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['cpp']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9976608039247917
[[74887   172]
 [   13  4015]]


[0.958920468115596, 0.996772591857001]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['.py']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]

0.9991781202978998
[[72198    58]
 [    7  6824]]


[0.991572217378669, 0.998975259844825]

Multinomial NB seems to make more mistakes in terms of false positives, we therefore try to use random forest classifier to improve.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['.cs']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a random forest classifier: rf_classifier
rf_classifier = RandomForestClassifier(random_state=53)

rf_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = rf_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]



0.9993424962383198
[[50325    24]
 [   28 28710]]


[0.9991647525579453, 0.9990256802839446]

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['cpp']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)


from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a random forest classifier: rf_classifier
rf_classifier = RandomForestClassifier(random_state=53)

rf_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = rf_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]



0.998596482354875
[[75058     1]
 [  110  3918]]


[0.999744832865527, 0.9726911618669315]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

X = final_data['text'] 
y = final_data['.py']
indices = range(X.shape[0])
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, indices, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(analyzer= 'word')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)


from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Instantiate a random forest classifier: rf_classifier
rf_classifier = RandomForestClassifier(random_state=53)

rf_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = rf_classifier.predict(count_test)
 
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(cm)

[precision_score(y_test, pred), recall_score(y_test, pred)]



0.9985459051424381
[[72251     5]
 [  110  6721]]


[0.9992566161165626, 0.9838969404186796]

False Positives are greatly reduced using random forest classifier in the trade of an increase in false negatives however.