## IMPORTS 

In this kernel, I will try the conventional text-classification methods on the Quora dataset, namely:

- TFIDF,
- CountVectorizer, 
- HashVectorizer,
- Word embeddings. 

To get an understanding of these methods, I have written a blog post at: https://mlwhiz.com/blog/2019/02/08/deeplearning_nlp_conventional_methods/

Do have a look at the blog post too.

In [None]:
import random
import copy
import time
import pandas as pd
import numpy as np
import gc
import re
import torch
from torchtext import data
#import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import os 
import nltk
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from unidecode import unidecode

from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from multiprocessing import  Pool
from functools import partial
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
import lightgbm as lgb

### Basic Parameters

In [None]:
# Random seed passed as `random_state` to the StratifiedKFold splitters in the CV helpers.
SEED = 1029

## LOAD PROCESSED TRAINING DATA FROM DISK

In [None]:
# Some preprocessing that will be common to all the text classification methods you will see.

# Remove punctuations:
# Every entry below is replaced by a single space in clean_text().
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# One alternation over all (escaped) punctuation marks, compiled once.
_puncts_re = re.compile('|'.join(re.escape(mark) for mark in puncts))


def clean_text(x):
    """Return ``str(x)`` with every punctuation mark from ``puncts`` turned into a space."""
    return _puncts_re.sub(' ', str(x))

# We won't clean numbers in the conventional-methods case, since bigrams like
# "5 mins" or "30 mins" might carry extra information.
def clean_numbers(x):
    """Mask runs of ASCII digits in ``x`` with ``#`` characters.

    Runs of five or more digits become ``#####``; remaining runs of four, three
    and two digits become the same number of ``#``. A lone digit is left as is.
    """
    if re.search(r'\d', x) is None:
        return x
    # Order matters: the longest runs have to be masked first.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

# Fix common misspellings:
# Maps a misspelled, run-together or British-spelled string to its replacement.
# Keys are case-sensitive; _get_mispell() joins them into one regex alternation,
# so they match anywhere in the text rather than only as whole words.
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Lookup table and compiled pattern, built once and shared by every call.
mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Return ``text`` with each match of ``mispellings_re`` swapped for its ``mispellings`` entry."""
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

# remove stopwords:
stopword_list = nltk.corpus.stopwords.words('english')
# Set copy for constant-time membership tests; the list above is kept for any other user.
_stopword_set = frozenset(stopword_list)
# Built once here instead of once per sentence inside remove_stopwords().
_stopword_tokenizer = ToktokTokenizer()


def remove_stopwords(text, is_lower_case=True):
    """Return ``text`` with English stopwords removed.

    Args:
        text: the sentence to filter.
        is_lower_case: when True, tokens are compared to the stopword list as
            they are; when False, each token is lowercased for the comparison
            only (kept tokens retain their original case).

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    tokens = [token.strip() for token in _stopword_tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [token for token in tokens if token not in _stopword_set]
    else:
        kept = [token for token in tokens if token.lower() not in _stopword_set]
    return ' '.join(kept)

# Expand contractions:
# Maps a contraction to its expanded form. Keys are case-sensitive (both "I..." and
# "i..." spellings are listed) and use the straight apostrophe; _get_contractions()
# joins them into one regex alternation.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

# Lookup table and compiled pattern, built once and shared by every call.
contractions, contractions_re = _get_contractions(contraction_dict)


def replace_contractions(text):
    """Return ``text`` with each match of ``contractions_re`` swapped for its ``contractions`` entry."""
    return contractions_re.sub(lambda found: contractions[found.group(0)], text)

# Using lemmatizer to keep dictionary form of words. Might be helpful if later we want to use word embeddings.
wordnet_lemmatizer = WordNetLemmatizer()
# Built once here instead of once per sentence inside lemma_text().
_lemma_tokenizer = ToktokTokenizer()


def lemma_text(text):
    """Tokenize ``text``, lemmatize every token and join the results with single spaces.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default is used for every token.
    """
    tokens = _lemma_tokenizer.tokenize(text)
    return ' '.join(wordnet_lemmatizer.lemmatize(token.strip()) for token in tokens)


def clean_sentence(x):
    """Normalise one question for the bag-of-words and embedding features.

    Steps, in order: fix misspellings on the raw text, lowercase, expand
    contractions, fix misspellings again, strip punctuation, drop stopwords,
    lemmatize.

    Args:
        x: the raw question; coerced with ``str`` so non-string cells do not crash.

    Returns:
        str: the cleaned, lowercased, space-separated text.
    """
    x = str(x)
    # Mixed-case dictionary keys (e.g. 'Qoura', 'doI') can only match before lowercasing.
    x = replace_typical_misspell(x)
    x = x.lower()
    # Contractions have to be expanded while the apostrophes are still there:
    # clean_text() replaces "'" with a space.
    x = replace_contractions(x)
    # Second pass so lowercase keys also catch words that were capitalised in the input.
    x = replace_typical_misspell(x)
    x = clean_text(x)
    x = remove_stopwords(x)
    x = lemma_text(x)
    x = x.replace("'", "")
    return x

In [None]:

# Read the competition files (append a slice such as [:400000] / [:20000] to subsample).
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
for caption, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(caption, frame.shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Run the full cleaning pipeline over every question and store it in a new column.
train_df['cleaned_text'] = train_df['question_text'].apply(clean_sentence)
test_df['cleaned_text'] = test_df['question_text'].apply(clean_sentence)

In [None]:
# Small function to scan decision thresholds and report the best F1 score (the competition metric).
def bestThresshold(y_train,train_preds):
    """Return the highest F1 score obtained by thresholding ``train_preds``.

    Thresholds from 0.10 to about 0.50 in steps of 0.01 are tried; a prediction
    counts as positive when it is strictly greater than the threshold. Only the
    best score is returned, not the threshold that produced it.
    """
    scores = np.array(train_preds)
    best_f1 = 0
    for threshold in tqdm(np.arange(0.1, 0.501, 0.01)):
        current_f1 = f1_score(y_train, scores > threshold)
        if current_f1 > best_f1:
            best_f1 = current_f1
    return best_f1

## Let's start with modelling

In [None]:
# HELPER FUNCTIONS

def model_train_cv(x_train,y_train,nfold,model_obj):
    """Return out-of-fold positive-class probabilities from stratified K-fold CV.

    Args:
        x_train: feature matrix; must expose ``.shape`` and accept row indexing
            with an integer array.
        y_train: labels; converted with ``np.array`` for indexing.
        nfold: number of folds.
        model_obj: estimator with ``fit`` and ``predict_proba``. It is deep-copied
            for every fold, so the object passed in is never fitted.

    Returns:
        1-D array of length ``x_train.shape[0]``; each entry is the column-1
        ``predict_proba`` value from the fold model that did not train on that row.
    """
    labels = np.array(y_train)
    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(x_train.shape[0])
    for fit_rows, held_out_rows in folds.split(x_train, y_train):
        fit_rows = fit_rows.astype(int)
        fold_model = copy.deepcopy(model_obj)
        fold_model.fit(x_train[fit_rows], labels[fit_rows])
        held_out_features = x_train[held_out_rows.astype(int)]
        # Store the predictions at the positions of the rows that were held out.
        oof_preds[held_out_rows] = fold_model.predict_proba(held_out_features)[:, 1]
    return oof_preds

def lgb_model_train_cv(x_train,y_train,nfold,lgb):
    """Return out-of-fold LightGBM predictions from stratified K-fold CV.

    Args:
        x_train: feature matrix; must expose ``.shape`` and accept row indexing
            with an integer array.
        y_train: labels; converted with ``np.array`` for indexing.
        nfold: number of folds.
        lgb: object providing ``Dataset`` and ``train`` (callers pass the
            imported ``lightgbm`` module).

    Returns:
        1-D array of length ``x_train.shape[0]`` with each row's prediction from
        the booster that did not train on it.
    """
    labels = np.array(y_train)
    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(x_train.shape[0])
    for fit_rows, held_out_rows in folds.split(x_train, y_train):
        fit_rows = fit_rows.astype(int)
        eval_rows = held_out_rows.astype(int)
        x_held_out = x_train[eval_rows]
        d_train = lgb.Dataset(x_train[fit_rows], label=labels[fit_rows])
        d_val = lgb.Dataset(x_held_out, label=labels[eval_rows])
        # Fixed, untuned booster settings; rebuilt for every fold.
        params = {
            'learning_rate': 0.01,
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'binary_logloss',
            'sub_feature': 0.5,
            'num_leaves': 10,
            'min_data': 50,
            'max_depth': 10,
        }
        # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
        # were removed from lgb.train in LightGBM 4.x (callbacks replace them) —
        # confirm the installed version before upgrading.
        booster = lgb.train(params, d_train, num_boost_round=100, valid_sets=d_val,
                            early_stopping_rounds=10, verbose_eval=10)
        # Store the predictions at the positions of the rows that were held out.
        oof_preds[held_out_rows] = booster.predict(x_held_out)
    return oof_preds

### 1. Bag of words model using Count Vectorizer:

![count vectorizer](https://mlwhiz.com/images/countvectorizer.png)

In [None]:
# Word-level 1- to 3-gram counts; min_df=3 drops n-grams seen in fewer than 3 documents.
cnt_vectorizer = CountVectorizer(
    dtype=np.float32,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    min_df=3,
)

# The vocabulary is built from train and test text together; only the training
# rows are transformed here.
cnt_vectorizer.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain = cnt_vectorizer.transform(train_df.cleaned_text.values)
#xtest_cntv = cnt_vectorizer.transform(test_df.cleaned_text.values)
y_train = train_df.target.values

In [None]:
# 5-fold out-of-fold predictions from a plain Logistic Regression on the count features.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=LogisticRegression(C=1.0))

print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

We are able to get an F1 local CV score of ___ with our fairly simple model, which just counts the number of times some n-grams appear in a sentence. That is pretty good. Let us try Multinomial NB next.

In [None]:
# Same count features, with Multinomial Naive Bayes in place of logistic regression.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=MultinomialNB())
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))


We are able to get a good F1 local CV score with our fairly simple model, which just counts the number of times some n-grams appear in a sentence.
You can also try running SVMs, which were used extensively for text models. But they are pretty slow, so I am not using them here.

Let's try LightGBM as well.

In [None]:
# Same count features, with a LightGBM model in place of logistic regression.
train_oof_preds = lgb_model_train_cv(x_train=xtrain, y_train=y_train, nfold=5, lgb=lgb)



In [None]:
# Best F1 over the scanned thresholds for the LightGBM out-of-fold predictions.
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Drop the count features and their vectorizer, then force a collection.
del xtrain, cnt_vectorizer
gc.collect()

### 2. Bag of words model using TFIDF:

![tfidf](https://mlwhiz.com/images/tfidf.png)

In [None]:
# Always start with these features. They work (almost) everytime!
# The idf / sublinear flags are passed as real booleans: recent scikit-learn
# releases validate parameter types and reject the integer 1 here.
tfv = TfidfVectorizer(dtype=np.float32, min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words = 'english')

# The vocabulary and idf weights are fitted on train and test text together;
# only the training rows are transformed here.
tfv.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain =  tfv.transform(train_df.cleaned_text.values) 
#xtest_tfv = tfv.transform(test_df.cleaned_text.values)
y_train = train_df.target.values

In [None]:
# 5-fold out-of-fold predictions from a plain Logistic Regression on the TF-IDF features.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=LogisticRegression(C=1.0))

print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Same TF-IDF features, with Multinomial Naive Bayes in place of logistic regression.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=MultinomialNB())
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Same TF-IDF features, with a LightGBM model in place of logistic regression.
train_oof_preds = lgb_model_train_cv(x_train=xtrain, y_train=y_train, nfold=5, lgb=lgb)


In [None]:
# Best F1 over the scanned thresholds for the LightGBM out-of-fold predictions.
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Drop the TF-IDF features and their vectorizer, then force a collection.
del xtrain, tfv
gc.collect()

### 3. Bag of words model using Hashing Vectorizer

![hashing features](https://mlwhiz.com/images/hashfeats.png)

In [None]:
# Hash word 1- to 3-grams into 2**10 buckets. HashingVectorizer is stateless
# (its fit() does nothing), so the texts are transformed directly.
hv = HashingVectorizer(dtype=np.float32,
            strip_accents='unicode', analyzer='word',
            ngram_range=(1, 3), n_features=2**10,
            # `non_negative=True` was deprecated in scikit-learn 0.19 and removed
            # in 0.21; `alternate_sign=False` is the supported way to keep every
            # feature >= 0, which MultinomialNB needs.
            alternate_sign=False)
xtrain =  hv.transform(train_df.cleaned_text.values) 
#xtest_hv = hv.transform(test_df.cleaned_text.values)
y_train = train_df.target.values

In [None]:
# 5-fold out-of-fold predictions from a plain Logistic Regression on the hashed features.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=LogisticRegression(C=1.0))

print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Same hashed features, with Multinomial Naive Bayes in place of logistic regression.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=MultinomialNB())
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Same hashed features, with a LightGBM model in place of logistic regression.
train_oof_preds = lgb_model_train_cv(x_train=xtrain, y_train=y_train, nfold=5, lgb=lgb)

In [None]:
# Best F1 over the scanned thresholds for the LightGBM out-of-fold predictions.
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Drop the hashed features and their vectorizer, then force a collection.
del xtrain, hv
gc.collect()

### 4. Word Embeddings (GloVe)

![word2vec](https://mlwhiz.com/images/word2vec_feats.png)

In [None]:
# load the GloVe vectors in a dictionary:
def load_glove_index(embedding_file='../input/embeddings/glove.840B.300d/glove.840B.300d.txt', dim=300):
    """Read a whitespace-separated embedding text file into ``{token: vector}``.

    Each line is split from the right into at most ``dim`` values, so a token
    that itself contains spaces is kept whole rather than breaking the float
    conversion. The file is read as UTF-8 and closed when loading finishes.

    Args:
        embedding_file: path of the embedding text file.
        dim: number of vector components per line.

    Returns:
        dict: token -> 1-D ``float32`` array.
    """
    embeddings_index = {}
    with open(embedding_file, encoding='utf-8') as handle:
        for line in handle:
            parts = line.rstrip().rsplit(' ', dim)
            if len(parts) < 2:
                # Blank line or a token without any values: nothing to store.
                continue
            embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings_index

# Load every vector into memory and report how many tokens were read.
embeddings_index = load_glove_index()
vector_count = len(embeddings_index)
print('Found %s word vectors.' % vector_count)

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Set copy for constant-time membership tests; the list above is kept for any other user.
_stop_word_set = frozenset(stop_words)


def sent2vec(s):
    """Turn a sentence into one unit-length vector built from its word embeddings.

    The text is lowercased and tokenized. Stopwords, non-alphabetic tokens and
    tokens missing from ``embeddings_index`` are skipped. The remaining vectors
    are summed and the sum is scaled to unit L2 norm.

    Returns:
        1-D array; ``np.zeros(300)`` when no token has an embedding or when the
        summed vector has zero norm (which would otherwise produce NaNs).
    """
    tokens = word_tokenize(str(s).lower())
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w.isalpha() and w not in _stop_word_set and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(300)
    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return np.zeros(300)
    return v / norm

In [None]:
# Build one sentence vector per training question with sent2vec().
xtrain = list(map(sent2vec, tqdm(train_df.cleaned_text.values)))
#xtest_glove = [sent2vec(x) for x in tqdm(test_df.cleaned_text.values)]

In [None]:
# The embedding dictionary is no longer needed once the sentence vectors exist.
del embeddings_index
gc.collect()

In [None]:
# Labels, and the list of sentence vectors stacked into one 2-D array.
y_train = train_df.target.values
xtrain = np.array(xtrain)

In [None]:
# 5-fold out-of-fold predictions from a plain Logistic Regression on the GloVe sentence vectors.
train_oof_preds = model_train_cv(x_train=xtrain, y_train=y_train, nfold=5,
                                 model_obj=LogisticRegression(C=1.0))
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

In [None]:
# Same GloVe sentence vectors, with a LightGBM model in place of logistic regression.
train_oof_preds = lgb_model_train_cv(x_train=xtrain, y_train=y_train, nfold=5, lgb=lgb)

In [None]:
# Best F1 over the scanned thresholds for the LightGBM out-of-fold predictions.
print("F1 Score: %0.3f " % bestThresshold(y_train=y_train, train_preds=train_oof_preds))

So this is it. None of these models are tuned yet, and they could be tuned further to improve performance. But it is good to have these baselines, and to appreciate the sort of performance we can get out of neural nets.

References:
    https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle