In [1]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Doc2Vec, FastText
from collections import defaultdict
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# sklearn
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
try:
    from sklearn.externals import joblib
except ImportError:  # sklearn.externals.joblib was removed in scikit-learn 0.23
    import joblib
from sklearn.feature_extraction.text import CountVectorizer


# keras
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing
from IPython.display import display, clear_output

%matplotlib inline



In [2]:
# Load the training set from a CSV in the current working directory.
train = pd.read_csv('./train.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
# (rows, columns) of the training frame.
train.shape

(1306122, 3)

In [5]:
# Class balance of the binary `target` label: rows with target == 1 are
# reported as insincere, rows with target == 0 as sincere.
no_insincere = train.loc[train['target'] == 1, 'target'].count()
no_sincere = train.loc[train['target'] == 0, 'target'].count()

# Mean of a 0/1 label is the share of positives. It is printed as a fraction
# in [0, 1], even though the label below says '%'.
insincere_share = train.target.mean()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', insincere_share)
# Null score: accuracy obtained by always predicting the majority (sincere) class.
print('Null score:', 1 - insincere_share)

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


In [12]:
# Normalise every question: each run of characters other than ASCII letters
# and apostrophes is collapsed into a single space, then the text is
# lower-cased.
#
# The result is materialised as a list rather than a generator expression.
# A generator can be consumed only once, so re-running the cell that reads
# `clean_questions` would otherwise receive an exhausted iterator and fail
# with a length mismatch when assigning the column.
non_letter_run = re.compile("[^A-Za-z']+")
clean_questions = [non_letter_run.sub(' ', q).lower() for q in train['question_text']]

In [14]:
# Store the normalised text next to the raw question (adds a column to `train` in place).
train['clean_question'] = list(clean_questions)

In [15]:
# Preview again to confirm the clean_question column sits alongside question_text.
train.head()

Unnamed: 0,qid,question_text,target,clean_question
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,how did quebec nationalists see their province...
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,do you have an adopted dog how would you encou...
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,why does velocity affect time does velocity af...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,how did otto von guericke used the magdeburg h...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,can i convert montra helicon d to a mountain b...


### Web Predict Model Test

In [16]:
# Restore a persisted model object from disk; presumably the fitted classifier,
# since it is queried with predict_proba further down.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load artifacts that come from a trusted source.
loaded_model = joblib.load('./TokenNBModel.pkl')

In [19]:
# Restore the persisted vectorizer object; it is used via .transform below, so
# it is presumably the CountVectorizer fitted alongside the model (confirm).
# NOTE(review): joblib.load unpickles the file. Only load trusted artifacts.
loaded_CountVect = joblib.load('./CountVectRaw.pkl')

In [22]:
# Vectorise one cleaned question (the row labelled 1) as a single-document batch.
sample_question = train.loc[1, 'clean_question']
data = loaded_CountVect.transform([sample_question])

In [24]:
# Probability for the single vectorised question: first (only) row, column 1.
# NOTE(review): column 1 is presumably the positive / insincere class (target == 1);
# confirm against loaded_model.classes_.
loaded_model.predict_proba(data)[0][1]

0.007029159454879292