In [1]:
import pandas as pd
from pandas import DataFrame
import nltk
from tqdm import tqdm
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
import time
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

In [2]:
def tokenize(raw):
    return [w.lower() for w in word_tokenize(raw) if w.isalpha()]

class StemmedTfidfVectorizer(TfidfVectorizer):
    en_stemmer = SnowballStemmer('english')
    
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (StemmedTfidfVectorizer.en_stemmer.stem(w) for w in analyzer(doc))

In [3]:
@contextmanager
def timer(task_name="timer"):
    # a timer cm from https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    print("----{} started".format(task_name))
    t0 = time.time()
    yield
    print("----{} done in {:.0f} seconds".format(task_name, time.time() - t0))

In [4]:
with timer("reading_data"):
    train = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv('../input/test.csv')

----reading_data started
----reading_data done in 4 seconds


In [5]:
## split to train and val
train_df, val_df = train_test_split(train, test_size=0.1, random_state=2018)

In [6]:
tfidf = StemmedTfidfVectorizer(
    tokenizer=tokenize, 
    analyzer="word", 
    stop_words='english', 
    ngram_range=(1,1), 
    min_df=3    # limit of minimum number of counts: 3
)

with timer('tfidf train'):
    txt_all = pd.concat([train.question_text, test_df.question_text])
    tfidf.fit(txt_all)
    
with timer('construct training and validation dataset'):
    train_X = tfidf.transform(train_df.question_text)
    val_X = tfidf.transform(val_df.question_text)
    
with timer('transforming the test set'):
    test_X = tfidf.transform(test_df['question_text'])
    
## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

----tfidf train started
----tfidf train done in 438 seconds
----construct training and validation dataset started
----construct training and validation dataset done in 341 seconds
----transforming the test set started
----transforming the test set done in 97 seconds


In [8]:
clf = MultinomialNB().fit(train_X, train_y)
y_val_pred = clf.predict_proba(val_X)

In [9]:
# threshold search
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_score(val_y, (y_val_pred[:,1] > thresh).astype(int))))

F1 score at threshold 0.1 is 0.44418621029649974
F1 score at threshold 0.11 is 0.45708724781974025
F1 score at threshold 0.12 is 0.47062549485352334
F1 score at threshold 0.13 is 0.48076682642217133
F1 score at threshold 0.14 is 0.4880437158469945
F1 score at threshold 0.15 is 0.4961531416010258
F1 score at threshold 0.16 is 0.5007895114598785
F1 score at threshold 0.17 is 0.5039777247414479
F1 score at threshold 0.18 is 0.506275720164609
F1 score at threshold 0.19 is 0.5082684305985277
F1 score at threshold 0.2 is 0.5086259163313674
F1 score at threshold 0.21 is 0.5097727272727272
F1 score at threshold 0.22 is 0.5101861093172078
F1 score at threshold 0.23 is 0.5090337784760408
F1 score at threshold 0.24 is 0.5083023543990087
F1 score at threshold 0.25 is 0.5039099752050353
F1 score at threshold 0.26 is 0.49882598486824936
F1 score at threshold 0.27 is 0.49295115921694393
F1 score at threshold 0.28 is 0.48753159800505574
F1 score at threshold 0.29 is 0.4817150063051703
F1 score at thre

In [20]:
y_pred = clf.predict_proba(test_X)
pred_test_y = (y_pred[:, 1] > 0.22).astype(int)
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)