## Quora Insincere Questions Classification via Logistic Regression

In [None]:
# Import required library
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
# import spacy
import re
from tqdm import tqdm
import nltk
from tqdm import tqdm_notebook
tqdm_notebook().pandas()
# nltk.download('punkt')
# nltk.download('wordnet')

## Load and print data

In [None]:
# Load the labelled training file; test.csv is loaded as validation_data.
DATA_DIR = "../input/quora-insincere-questions-classification"
train_raw = pd.read_csv(f"{DATA_DIR}/train.csv")
validation_data = pd.read_csv(f"{DATA_DIR}/test.csv")
train_raw

### Data
- 1,306,122 rows × 3 columns
- Columns are the question ID, the question text, and the target label of the question
- No percentage between different types of target

In [None]:
# Bar chart of the number of rows per target value in the raw training data.
plot_data = train_raw['target']
plot_data.value_counts().plot.bar(rot=0)

## Preprocess raw data: add features to inspect and use
- Number of words
- Number of unique words
- Number of special characters
- Number of upper-case words
- Number of lower-case words
- Number of title-case words

In [None]:
def create_features(df_):
    """Add simple text-statistics columns computed from ``question_text``.

    The columns are added to ``df_`` in place and the same DataFrame object is
    returned. Every value is passed through ``str()`` first, so a missing or
    non-string question is measured by its string form instead of raising.

    Columns added:
        nb_words         number of whitespace-separated words
        nb_unique_words  number of distinct words
        nb_chars         number of characters
        spe_chars        number of characters other than ASCII letters,
                         digits and the space character
        nb_uppercase     number of words for which ``str.isupper()`` is true
        nb_lowercase     number of words for which ``str.islower()`` is true
        nb_title         number of words for which ``str.istitle()`` is true
    """
    # Convert once so every feature below sees the same string values.
    text = df_["question_text"].apply(str)
    words = text.apply(str.split)

    df_["nb_words"] = words.apply(len)
    df_["nb_unique_words"] = words.apply(lambda ws: len(set(ws)))
    df_["nb_chars"] = text.apply(len)
    df_["spe_chars"] = text.str.count(r"[^a-zA-Z0-9 ]")
    df_["nb_uppercase"] = words.apply(lambda ws: sum(w.isupper() for w in ws))
    df_["nb_lowercase"] = words.apply(lambda ws: sum(w.islower() for w in ws))
    df_["nb_title"] = words.apply(lambda ws: sum(w.istitle() for w in ws))

    return df_

# create_features adds its columns to the frame it is given and returns that same
# frame, so train_features and train_raw refer to one DataFrame from here on.
train_features = create_features(train_raw)
train_features

## Sincere questions data

In [None]:
# Summary statistics of the engineered features for rows with target == 0.
train_features.loc[train_features['target'] == 0].describe().round(1)

## Insincere questions data

In [None]:
# Summary statistics of the engineered features for rows with target == 1.
train_features.loc[train_features['target'] == 1].describe().round(1)

### Observation:
- Insincere questions tend to have a larger mean number of words than sincere questions
- Word vectors can be used (word vectors can capture features such as the number of words, the number of unique words, ...)

In [None]:
# Print ten randomly sampled insincere questions (target == 1)
print(train_raw.loc[train_raw['target'] == 1, 'question_text'].sample(10).values)

### Pre-processing evaluation:
- Insincere questions often contain words with a negative meaning; this feature does not depend on grammar
#### => Word-count vectors can be used, since the grammar of the question can be discarded
- Special characters, numbers, links, and upper- or lower-case letters often don't affect the classification of the questions
#### => They can be discarded

### Cleaning data from abnormal cases
- From the statistics tables, observe that some questions have a number of characters or special characters much larger than the mean of the whole dataset
#### => Consider remove these cases

In [None]:
# Insincere questions (target == 1) that are unusually long or contain many
# special characters. Both selections are printed explicitly: a notebook cell
# only echoes its last bare expression, so the first one was never shown.
print(train_features['question_text'][(train_raw['target']==1) & (train_features['nb_chars']>600.0)].values)
print(train_features['question_text'][(train_raw['target']==1) & (train_features['spe_chars']>30.0)].values)

In [None]:
# Sincere questions (target == 0) that are unusually long or contain many
# special characters. Both selections are printed explicitly: a notebook cell
# only echoes its last bare expression, so the first one was never shown.
print(train_features['question_text'][(train_raw['target']==0) & (train_features['nb_chars']>600.0)].values)
print(train_features['question_text'][(train_raw['target']==0) & (train_features['spe_chars']>30.0)].values)

### Observation:
- Most questions that have a lot of special characters are questions containing mathematical formulas or figurative characters
- Questions with mathematical formulas that are classified as insincere can be considered anomalies and removed from the dataset

### Remove questions whose number of characters, words, or special characters exceeds a threshold

In [None]:
# Keep only the rows that are below all three thresholds
# (characters, words, special characters).
within_limits = (
    (train_features['nb_chars'] < 600.0)
    & (train_features['nb_words'] < 70.0)
    & (train_features['spe_chars'] < 12.0)
)
train_features_filtered = train_features[within_limits]
train_features_filtered.describe().round(1)

In [None]:
# Number of rows dropped by the threshold filter.
print(train_features.shape[0] - train_features_filtered.shape[0])

#### 3494 rows removed

## Resample
- The number of sincere questions is far greater than the number of insincere questions, the ratio is not balanced
#### => The dataset needs to be resampled to the ratio of 4:1

In [None]:
# Resampling
from sklearn.utils import resample

sincere = train_features_filtered[train_features_filtered.target == 0]
insincere = train_features_filtered[train_features_filtered.target == 1]

# Down-sample the majority (sincere) class to at most 4x the minority class.
# Sampling WITHOUT replacement keeps every sampled sincere row unique, so the
# same row cannot end up on both sides of a later train/test split. The cap at
# len(sincere) keeps resample() from raising when fewer rows are available,
# and the fixed seed makes the sample reproducible.
n_sincere = min(len(sincere), len(insincere) * 4)
resampled_dataset = pd.concat([
    resample(sincere, replace=False, n_samples=n_sincere, random_state=0),
    insincere,
])
# Resulting sincere : insincere ratio
print(len(resampled_dataset[resampled_dataset['target'] == 0]) / len(resampled_dataset[resampled_dataset['target'] == 1]))

In [None]:
# Bar chart of the number of rows per target value after resampling.
plot_data = resampled_dataset['target']
plot_data.value_counts().plot.bar(rot=0)

### Training dataset is now resampled to a 4:1 ratio (sincere : insincere)

## Process data
### Remove unnecessary data and convert the text back to its base form:
- Remove links
- Remove special characters
- Transform variants of one word into one consistent word
- Transform shortened word into original word
- Remove numbers
- Remove latex tags

In [None]:
# Replace links with a placeholder token
# Raw string: the original non-raw literal relied on invalid escape sequences
# (\/, \w, \-) being passed through unchanged. The pattern text is unchanged.
_URL_PATTERN = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+')


def clean_tag(question):
    """Replace URL-like substrings in ``question`` with the token ``[url]``.

    The substitution only runs when the text contains ``http`` or ``www``;
    otherwise the question is returned unchanged.
    """
    if 'http' in question or 'www' in question:
        question = _URL_PATTERN.sub('[url]', question)
    return question

In [None]:
# Expand contractions into their full forms

# Lookup table: contraction -> expansion. Keys are case-sensitive.
contraction_mapping = {
    "We'd": "We had", "That'd": "That had", "AREN'T": "Are not", "HADN'T": "Had not",
    "Could've": "Could have", "LeT's": "Let us", "How'll": "How will", "They'll": "They will",
    "DOESN'T": "Does not", "HE'S": "He has", "O'Clock": "Of the clock", "Who'll": "Who will",
    "What'S": "What is", "Ain't": "Am not", "WEREN'T": "Were not", "Y'all": "You all",
    "Y'ALL": "You all", "Here's": "Here is", "It'd": "It had", "Should've": "Should have",
    "I'M": "I am", "ISN'T": "Is not", "Would've": "Would have", "He'll": "He will",
    "DON'T": "Do not", "She'd": "She had", "WOULDN'T": "Would not", "She'll": "She will",
    "IT's": "It is", "There'd": "There had", "It'll": "It will", "You'll": "You will",
    "He'd": "He had", "What'll": "What will", "Ma'am": "Madam", "CAN'T": "Can not",
    "THAT'S": "That is", "You've": "You have", "She's": "She is", "Weren't": "Were not",
    "They've": "They have", "Couldn't": "Could not", "When's": "When is", "Haven't": "Have not",
    "We'll": "We will", "That's": "That is", "We're": "We are",
    "They're": "They are",  # was "They' are" (stray apostrophe)
    "You'd": "You would", "How'd": "How did", "What're": "What are", "Hasn't": "Has not",
    "Wasn't": "Was not", "Won't": "Will not", "There's": "There is", "Didn't": "Did not",
    "Doesn't": "Does not", "You're": "You are", "He's": "He is", "SO's": "So is",
    "We've": "We have", "Who's": "Who is", "Wouldn't": "Would not", "Why's": "Why is",
    "WHO's": "Who is", "Let's": "Let us", "How's": "How is", "Can't": "Can not",
    "Where's": "Where is", "They'd": "They had", "Don't": "Do not", "Shouldn't": "Should not",
    "Aren't": "Are not", "ain't": "is not", "What's": "What is", "It's": "It is",
    "Isn't": "Is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",  # this literal was split across two source lines
    "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would",
    "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have",
    "there's": "there is", "here's": "here is", "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
}


def clean_contractions(question):
    """Expand the contractions in ``question`` using ``contraction_mapping``.

    Typographic apostrophe variants are first normalised to ``'``. The text is
    then split on single spaces and every token that is an exact key of
    ``contraction_mapping`` is replaced by its expansion; other tokens
    (including ones with punctuation attached) are kept unchanged.
    """
    for quote in ("’", "‘", "´", "`"):
        question = question.replace(quote, "'")

    return ' '.join(contraction_mapping.get(token, token) for token in question.split(" "))

In [None]:
# Transform word variants into one consistent form (lemmatization)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
l = WordNetLemmatizer()

def lemmatize_text(question):
    """Tokenize ``question`` with ``word_tokenize``, lemmatize each token with
    the module-level ``WordNetLemmatizer`` and re-join the tokens with single spaces."""
    lemmas = []
    for token in word_tokenize(question):
        lemmas.append(l.lemmatize(token))
    return ' '.join(lemmas)

In [None]:
# Isolate special characters (each one is padded with spaces so it becomes a separate token)
spec_chars = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', 
        '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 
        '█', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '¬', '░', '¡', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
        '—', '‹', '─', '▒', '：', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '⋅', '‘', '∞', 
        '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '・', '╦', '╣', '╔', '╗', '▬', '❤', '≤', '‡', '√', '◄', '━', 
        '⇒', '▶', '≥', '╝', '♡', '◊', '。', '✈', '≡', '☺', '✔', '↵', '≈', '✓', '♣', '☎', '℃', '◦', '└', '‟', '～', '！', '○', 
        '◆', '№', '♠', '▌', '✿', '▸', '⁄', '□', '❖', '✦', '．', '÷', '｜', '┃', '／', '￥', '╠', '↩', '✭', '▐', '☼', '☻', '┐', 
        '├', '«', '∼', '┌', '℉', '☮', '฿', '≦', '♬', '✧', '〉', '－', '⌂', '✖', '･', '◕', '※', '‖', '◀', '‰', '\x97', '↺', 
        '∆', '┘', '┬', '╬', '،', '⌘', '⊂', '＞', '〈', '⎙', '？', '☠', '⇐', '▫', '∗', '∈', '≠', '♀', '♔', '˚', '℗', '┗', '＊', 
        '┼', '❀', '＆', '∩', '♂', '‿', '∑', '‣', '➜', '┛', '⇓', '☯', '⊖', '☀', '┳', '；', '∇', '⇑', '✰', '◇', '♯', '☞', '´', 
        '↔', '┏', '｡', '◘', '∂', '✌', '♭', '┣', '┴', '┓', '✨', '\xa0', '˜', '❥', '┫', '℠', '✒', '［', '∫', '\x93', '≧', '］', 
        '\x94', '∀', '♛', '\x96', '∨', '◎', '↻', '⇩', '＜', '≫', '✩', '✪', '♕', '؟', '₤', '☛', '╮', '␊', '＋', '┈', '％', 
        '╋', '▽', '⇨', '┻', '⊗', '￡', '।', '▂', '✯', '▇', '＿', '➤', '✞', '＝', '▷', '△', '◙', '▅', '✝', '∧', '␉', '☭', 
        '┊', '╯', '☾', '➔', '∴', '\x92', '▃', '↳', '＾', '׳', '➢', '╭', '➡', '＠', '⊙', '☢', '˝', '∏', '„', '∥', '❝', '☐', 
        '▆', '╱', '⋙', '๏', '☁', '⇔', '▔', '\x91', '➚', '◡', '╰', '\x85', '♢', '˙', '۞', '✘', '✮', '☑', '⋆', 'ⓘ', '❒', 
        '☣', '✉', '⌊', '➠', '∣', '❑', '◢', 'ⓒ', '\x80', '〒', '∕', '▮', '⦿', '✫', '✚', '⋯', '♩', '☂', '❞', '‗', '܂', '☜', 
        '‾', '✜', '╲', '∘', '⟩', '＼', '⟨', '·', '✗', '♚', '∅', 'ⓔ', '◣', '͡', '‛', '❦', '◠', '✄', '❄', '∃', '␣', '≪', '｢', 
        '≅', '◯', '☽', '∎', '｣', '❧', '̅', 'ⓐ', '↘', '⚓', '▣', '˘', '∪', '⇢', '✍', '⊥', '＃', '⎯', '↠', '۩', '☰', '◥', 
        '⊆', '✽', '⚡', '↪', '❁', '☹', '◼', '☃', '◤', '❏', 'ⓢ', '⊱', '➝', '̣', '✡', '∠', '｀', '▴', '┤', '∝', '♏', 'ⓐ', 
        '✎', ';', '␤', '＇', '❣', '✂', '✤', 'ⓞ', '☪', '✴', '⌒', '˛', '♒', '＄', '✶', '▻', 'ⓔ', '◌', '◈', '❚', '❂', '￦', 
        '◉', '╜', '̃', '✱', '╖', '❉', 'ⓡ', '↗', 'ⓣ', '♻', '➽', '׀', '✲', '✬', '☉', '▉', '≒', '☥', '⌐', '♨', '✕', 'ⓝ', 
        '⊰', '❘', '＂', '⇧', '̵', '➪', '▁', '▏', '⊃', 'ⓛ', '‚', '♰', '́', '✏', '⏑', '̶', 'ⓢ', '⩾', '￠', '❍', '≃', '⋰', '♋', 
        '､', '̂', '❋', '✳', 'ⓤ', '╤', '▕', '⌣', '✸', '℮', '⁺', '▨', '╨', 'ⓥ', '♈', '❃', '☝', '✻', '⊇', '≻', '♘', '♞', 
        '◂', '✟', '⌠', '✠', '☚', '✥', '❊', 'ⓒ', '⌈', '❅', 'ⓡ', '♧', 'ⓞ', '▭', '❱', 'ⓣ', '∟', '☕', '♺', '∵', '⍝', 'ⓑ', 
        '✵', '✣', '٭', '♆', 'ⓘ', '∶', '⚜', '◞', '்', '✹', '➥', '↕', '̳', '∷', '✋', '➧', '∋', '̿', 'ͧ', '┅', '⥤', '⬆', '⋱', 
        '☄', '↖', '⋮', '۔', '♌', 'ⓛ', '╕', '♓', '❯', '♍', '▋', '✺', '⭐', '✾', '♊', '➣', '▿', 'ⓑ', '♉', '⏠', '◾', '▹', 
        '⩽', '↦', '╥', '⍵', '⌋', '։', '➨', '∮', '⇥', 'ⓗ', 'ⓓ', '⁻', '⎝', '⌥', '⌉', '◔', '◑', '✼', '♎', '♐', '╪', '⊚', 
        '☒', '⇤', 'ⓜ', '⎠', '◐', '⚠', '╞', '◗', '⎕', 'ⓨ', '☟', 'ⓟ', '♟', '❈', '↬', 'ⓓ', '◻', '♮', '❙', '♤', '∉', '؛', 
        '⁂', 'ⓝ', '־', '♑', '╫', '╓', '╳', '⬅', '☔', '☸', '┄', '╧', '׃', '⎢', '❆', '⋄', '⚫', '̏', '☏', '➞', '͂', '␙', 
        'ⓤ', '◟', '̊', '⚐', '✙', '↙', '̾', '℘', '✷', '⍺', '❌', '⊢', '▵', '✅', 'ⓖ', '☨', '▰', '╡', 'ⓜ', '☤', '∽', '╘', 
        '˹', '↨', '♙', '⬇', '♱', '⌡', '⠀', '╛', '❕', '┉', 'ⓟ', '̀', '♖', 'ⓚ', '┆', '⎜', '◜', '⚾', '⤴', '✇', '╟', '⎛', 
        '☩', '➲', '➟', 'ⓥ', 'ⓗ', '⏝', '◃', '╢', '↯', '✆', '˃', '⍴', '❇', '⚽', '╒', '̸', '♜', '☓', '➳', '⇄', '☬', '⚑', 
        '✐', '⌃', '◅', '▢', '❐', '∊', '☈', '॥', '⎮', '▩', 'ு', '⊹', '‵', '␔', '☊', '➸', '̌', '☿', '⇉', '⊳', '╙', 'ⓦ', 
        '⇣', '｛', '̄', '↝', '⎟', '▍', '❗', '״', '΄', '▞', '◁', '⛄', '⇝', '⎪', '♁', '⇠', '☇', '✊', 'ி', '｝', '⭕', '➘', 
        '⁀', '☙', '❛', '❓', '⟲', '⇀', '≲', 'ⓕ', '⎥', '\u06dd', 'ͤ', '₋', '̱', '̎', '♝', '≳', '▙', '➭', '܀', 'ⓖ', '⇛', '▊', 
        '⇗', '̷', '⇱', '℅', 'ⓧ', '⚛', '̐', '̕', '⇌', '␀', '≌', 'ⓦ', '⊤', '̓', '☦', 'ⓕ', '▜', '➙', 'ⓨ', '⌨', '◮', '☷', 
        '◍', 'ⓚ', '≔', '⏩', '⍳', '℞', '┋', '˻', '▚', '≺', 'ْ', '▟', '➻', '̪', '⏪', '̉', '⎞', '┇', '⍟', '⇪', '▎', '⇦', '␝', 
        '⤷', '≖', '⟶', '♗', '̴', '♄', 'ͨ', '̈', '❜', '̡', '▛', '✁', '➩', 'ா', '˂', '↥', '⏎', '⎷', '̲', '➖', '↲', '⩵', '̗', '❢', 
        '≎', '⚔', '⇇', '̑', '⊿', '̖', '☍', '➹', '⥊', '⁁', '✢']

def clean_spec_chars(question):
    """Surround every occurrence of each character in ``spec_chars`` with a
    space on both sides and return the resulting string."""
    for symbol in spec_chars:
        # str.replace leaves the text untouched when the symbol is absent,
        # so no membership test is needed beforehand.
        question = question.replace(symbol, f' {symbol} ')
    return question

In [None]:
# Normalize how numbers are written (digits are kept, not removed)
def clean_numbers(question):
    """Normalize number formatting inside ``question``.

    Three rewrites are applied in order:
      1. a space is inserted between digits and a directly following letter
         ("5kg" -> "5 kg");
      2. ordinal suffixes are glued back onto their number
         ("1 st" -> "1st"), also at the very end of the string;
      3. commas between digits are dropped, including repeated groups
         ("1,000,000" -> "1000000").
    """
    question = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', question)
    # \b instead of a required trailing space so an ordinal that ends the
    # string is re-joined too, while "5 stars" is still left alone.
    question = re.sub(r'(\d+) (th|st|nd|rd)\b', r'\g<1>\g<2>', question)
    # Lookarounds do not consume the digits, so every separator in a number
    # with several groups is removed, not only the first one.
    question = re.sub(r'(?<=\d),(?=\d)', '', question)
    return question

In [None]:
# Replace LaTeX formulas with a placeholder
def clean_latex_formula(question):
    """Collapse extra spaces and replace ``[ math ] ... [ / math ]`` spans.

    The question is split on single spaces, each token is stripped and empty
    tokens are dropped, then the tokens are re-joined with one space. Every
    span from ``[ math ]`` to the nearest following ``[ / math ]`` is then
    replaced by the text ``math formula``.
    """
    # Strip each token itself (the previous code stripped the whole question
    # here, which appended the full question once per token).
    tokens = [t.strip() for t in question.split(" ")]
    question = ' '.join(t for t in tokens if t != '')
    # Non-greedy match so two formulas in one question are replaced separately
    # instead of swallowing the text between them.
    question = re.sub(r'(\[ math \]).+?(\[ / math \])', 'math formula', question)
    return question

In [None]:
# Combined data-cleaning function
def data_cleaning(question):
    """Run ``question`` through every cleaning step and return the result.

    Each step receives the output of the previous one, in this order: links,
    contractions, special characters, lemmatization, LaTeX formulas, numbers.
    """
    steps = (
        clean_tag,
        clean_contractions,
        clean_spec_chars,
        lemmatize_text,
        clean_latex_formula,
        clean_numbers,
    )
    for step in steps:
        question = step(question)
    return question

In [None]:
# Clean the question text of both the training and the validation dataset
resampled_dataset['question_text'] = resampled_dataset['question_text'].progress_map(data_cleaning)
validation_data['question_text'] = validation_data['question_text'].progress_map(data_cleaning)

## Splitting test & train data

In [None]:
# Hold out 20% of the resampled questions as a test split (fixed random_state).
x_train, x_test, y_train, y_test = train_test_split(
    resampled_dataset['question_text'],
    resampled_dataset['target'],
    test_size=0.2,
    random_state=0,
)
for split_name, split_x, split_y in (('x_train: ', x_train, y_train), ('x_test: ', x_test, y_test)):
    print(split_name, split_x.shape, split_y.shape)

## Create word counting vector train, test
- Word-count vector: split each sentence into words and count the number of occurrences of each word in the sentence
- Classify questions independently of their grammar
- Learn the vocabulary from both the training and the validation dataset, since the count vectors may have to encode words from the validation dataset that do not appear in the training dataset.

In [None]:
vectorizer = CountVectorizer()
# Learn the vocabulary from both the training and the validation questions.
# (The second operand used to be resampled_dataset again, so words that only
# occur in the validation questions were never added to the vocabulary.)
vectorizer.fit(
    list(resampled_dataset['question_text'].values)
    + list(validation_data['question_text'].values)
)
# Create count vectors for the train, test and validation sets from the learned vocabulary
x_tr = vectorizer.transform(x_train)
x_te = vectorizer.transform(x_test)
x_val = vectorizer.transform(validation_data['question_text'])
print(x_tr.shape)
print(x_te.shape)
print(x_val.shape)

### Logistic Regression
- After obtaining counting vectors, put them through the model to train
- Logistic Regression is used because of its simplicity and superior performance compared to other models such as Random Forest Classifier, Naive Bayes, and SVM

In [None]:
# import models
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression

### Optimize parameters by GridSearchCV
- Parameter C is part of model's input, GridSearchCV search for the best C value of the model

In [None]:
# Search for the best value of the C parameter by cross-validated F1
from sklearn.model_selection import GridSearchCV
params = {'C': [0.1, 1, 2, 3, 4, 5, 10]}

gridsearch = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
gridsearch.fit(x_tr, y_train)
print(gridsearch.best_params_)

In [None]:
%%time
# Fit the final model with the C value selected by the grid search
model = LogisticRegression(C=gridsearch.best_params_['C'])
print("Running Logistic Regression")
model.fit(x_tr, y_train)

# Scores on the data the model was fitted on
train_predictions = model.predict(x_tr)
train_acc, train_f1 = accuracy_score(y_train, train_predictions), f1_score(y_train, train_predictions)
print(f"Training accuracy: {train_acc:.2%}, F1: {train_f1:.4f}")

# Scores on the held-out test split
test_predictions = model.predict(x_te)
test_acc, test_f1 = accuracy_score(y_test, test_predictions), f1_score(y_test, test_predictions)
print(f"Testing accuracy:  {test_acc:.2%}, F1: {test_f1:.4f}")

In [None]:
# Plot the confusion matrix as a heatmap and print the classification report
import seaborn as sns
from sklearn.metrics import confusion_matrix
sns.set(font_scale=1.4)
cm_frame = pd.DataFrame(
    confusion_matrix(y_test, test_predictions),
    index=range(2),
    columns=range(2),
)
sns.heatmap(cm_frame, annot=True, fmt='g')
print(classification_report(y_test, test_predictions))

### Observation:
- F1 and accuracy are quite high
- F1 and accuracy are close to each other once the dataset is resampled
- The model's F1 when predicting insincere questions increases substantially once the dataset is resampled
- The model's accuracy when predicting sincere questions decreases (only slightly) once the dataset is resampled

## Submission
- Training complete; run the model on the validation dataset and save the predictions for submission

In [None]:
# Predict on the validation questions and write the submission file
validation_predictions = model.predict(x_val)
submission = pd.DataFrame({
    'qid': validation_data['qid'],
    'prediction': validation_predictions,
})
submission.to_csv('submission.csv', index=False)
submission

### F1 score on the validation dataset: ~0.6
### The F1 score on the validation dataset is much lower than on the test & train datasets, but quite good for a linear model