In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from gensim.models import KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer
# Borrow the tokenizer of a default TfidfVectorizer; the vectorizer itself is
# not fitted in this cell.
vect = TfidfVectorizer()
sklearn_tokenizer = vect.build_tokenizer()

# Load the training set and split every question into a list of tokens.
df = pd.read_csv(
    '/kaggle/input/quora-insincere-questions-classification/train.csv'
)
tokenized = list(map(sklearn_tokenizer, df["question_text"]))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the pre-trained word vectors from the binary word2vec-format file.
w2v = KeyedVectors.load_word2vec_format(
    '/kaggle/input/googlenewsvectors/GoogleNews-vectors-negative300-SLIM.bin',
    binary=True,
)

In [None]:
def get_w2v_vect(token, w2v):
    """Return the embedding of ``token``, or a zero vector if it is unknown.

    The token is lower-cased before the lookup.

    Args:
        token: Word to look up.
        w2v: Embedding model exposing ``key_to_index`` (mapping of known
            words), ``vector_size`` and item access by word, e.g. a gensim
            ``KeyedVectors``.

    Returns:
        ``w2v[token.lower()]`` when the lower-cased token is in the
        vocabulary, otherwise ``np.zeros(w2v.vector_size)``.
    """
    token = token.lower()

    # Membership on the key_to_index dict is a hash lookup; testing against
    # the index_to_key list would scan the whole vocabulary for every token.
    if token in w2v.key_to_index:
        return w2v[token]
    return np.zeros(w2v.vector_size)
    
def get_features_from_text(tokenized_sent, w2v):
    """Embed a tokenized sentence as the L2-normalised mean of its word vectors.

    Args:
        tokenized_sent: Iterable of string tokens; may be empty.
        w2v: Embedding model passed through to ``get_w2v_vect``; must expose
            ``vector_size``.

    Returns:
        1-D array of length ``w2v.vector_size``: the mean token vector divided
        by its Euclidean norm. The result is all zeros when there are no
        tokens, and the un-normalised mean is returned when its norm is zero.
    """
    token_vectors = [get_w2v_vect(token, w2v) for token in tokenized_sent]

    # Averaging an empty array yields a NaN scalar instead of a vector, which
    # cannot be stacked with the other rows into a 2-D feature matrix.
    if not token_vectors:
        return np.zeros(w2v.vector_size)

    vect = np.array(token_vectors).mean(axis=0)

    vect_norm = np.linalg.norm(vect)
    if vect_norm != 0:
        return vect / vect_norm
    # Zero norm: leave the vector as is to avoid dividing by zero.
    return vect
    
# Feature matrix built from every fifth tokenized question, one row each.
w2v_data = np.array(
    [get_features_from_text(tokens, w2v) for tokens in tokenized[::5]]
)

In [None]:
# Labels for every fifth row, matching the stride used to build w2v_data.
y = df["target"].iloc[::5]

In [None]:
from sklearn.linear_model import SGDClassifier
# Class 0 is weighted by mean(y) and class 1 by 1 - mean(y); with 0/1 labels
# this gives the rarer class the larger weight.
clf = SGDClassifier(
    loss='modified_huber',
    penalty='elasticnet',
    l1_ratio=0.1,
    alpha=1e-6,
    shuffle=True,
    class_weight={0: y.mean(), 1: 1 - y.mean()},
    random_state=42,
)


In [None]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold, RandomizedSearchCV
# Out-of-fold class probabilities from 5-fold stratified cross-validation.
preds = cross_val_predict(
    estimator=clf,
    X=w2v_data,
    y=y,
    cv=StratifiedKFold(n_splits=5),
    n_jobs=1,
    method='predict_proba',
)

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, f1_score

In [None]:
# F1 of the out-of-fold predictions when a row is labelled positive at a
# class-1 probability of at least 0.8. Despite the name, this is a single
# score, not a vector of predictions.
pred_target = f1_score(y_true=y, y_pred=preds[:, 1] >= 0.8)

In [None]:
# A submission needs one 0/1 label per test question. Writing `pred_target`
# here would broadcast a single F1 score over every training qid, so the test
# set is scored explicitly instead.
# NOTE(review): assumes test.csv sits next to train.csv and has `qid` and
# `question_text` columns -- confirm against the competition data.
test_df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')
test_tokenized = [sklearn_tokenizer(sent) for sent in test_df.question_text]

# Questions that tokenize to nothing get an all-zero row so that every feature
# vector has the same shape.
test_features = np.array([
    get_features_from_text(tokens, w2v) if tokens else np.zeros(w2v.vector_size)
    for tokens in test_tokenized
])

# cross_val_predict only fits clones of the estimator, so `clf` itself still
# has to be trained before it can predict.
clf.fit(w2v_data, y)

# Same 0.8 probability cut-off as the one used for the cross-validated F1.
test_labels = (clf.predict_proba(test_features)[:, 1] >= 0.8).astype(int)

pd.DataFrame({"qid": test_df["qid"], "prediction": test_labels}).to_csv("submission.csv", index=False)