## Imports

In [None]:
import pandas as pd
import numpy as np

# Read the labelled training questions and the unlabelled test questions.
df_test = pd.read_csv('/kaggle/input/yahoo-troll-question-detection/test_df.csv')
df_train = pd.read_csv('/kaggle/input/yahoo-troll-question-detection/train_df.csv')

# Quick sanity check: (rows, columns) of each frame.
for label, frame in (('Train shape: ', df_train), ('Test shape: ', df_test)):
    print(label, frame.shape)

## Preprocessing

In [None]:
# import re
# from tqdm import tqdm
# tqdm.pandas()

# def clean_text(text):
#     words = word_tokenize(text) # Tokenization
#     tagged_words = pos_tag(words) # POS tagging

#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Removing every character that is not alphanumeric
#     text = re.sub(r'[0-9]+', '#', text) # Replacing every number with '#'
    
#     text = ' '.join([f'{stemmer.stem(tagged_word[0])}_{tagged_word[1]}' for tagged_word in tagged_words]) # Stemming and detokenization
#     return text

# def clean_df(df):
#     df['question_text_cleaned'] = '' # Create a new empty column
#     df['question_text_cleaned'] = df['question_text'].progress_apply(lambda x: clean_text(x))


# clean_df(df_train)

## Word2Vec: Average of word vectors

In [None]:
# from gensim.models import Word2Vec
# from nltk.tokenize import word_tokenize

# train_x = [word_tokenize(question) for question in train_x]
# val_x = [word_tokenize(question) for question in val_x]
# model = Word2Vec(train_x, min_count=4)

# train_x_vectors = [np.zeros(model.wv.vector_size)] * len(train_x)
# val_x_vectors = [np.zeros(model.wv.vector_size)] * len(val_x)

# for i in range(len(train_x)):
#     train_x[i] = [word for word in train_x[i] if word in model.wv.index_to_key]
#     if(train_x[i] != []):
#         train_x_vectors[i] = np.mean(model.wv[train_x[i]], axis=0) 
    
# for i in range(len(val_x)):
#     val_x[i] = [word for word in val_x[i] if word in model.wv.index_to_key]
#     if(val_x[i] != []):
#         val_x_vectors[i] = np.mean(model.wv[val_x[i]], axis=0) 

## TF-IDF

In [None]:
import re, string
# One capture group matching any single ASCII punctuation mark or one of the extra
# typographic symbols listed below. string.punctuation must go through re.escape:
# when interpolated raw, its `\]` is parsed as an escaped `]`, so the backslash
# character itself silently drops out of the class and is never split off.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, treating every punctuation mark as its own token.

    Each character matched by ``re_tok`` is padded with a space on both sides and
    the result is split on whitespace, so ``"hello-world?"`` becomes
    ``['hello', '-', 'world', '?']``.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over 1- to 4-grams of the tokens produced by the custom `tokenize` function.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 4),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned from the train and test questions
# together; no labels are involved at this step.
all_questions = pd.concat([df_train['question_text'], df_test['question_text']])
vectorizer.fit(all_questions)

# Document-term matrices for both splits, plus the training labels as an array.
X = vectorizer.transform(df_train['question_text'])
X_test = vectorizer.transform(df_test['question_text'])
y = df_train['target'].values

## NB Features

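These features follow the Naive Bayes log-count ratio described in Wang & Manning (2012), "Baselines and Bigrams: Simple, Good Sentiment and Topic Classification": https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf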

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix

class NBTransformer(BaseEstimator, TransformerMixin):
    """Re-weight feature columns by a smoothed Naive Bayes log-count ratio.

    For every feature, ``fit`` sums its values over the rows labelled 1 (``p``)
    and over the rows labelled 0 (``q``), adds ``alpha`` to each sum, divides each
    by ``alpha + <number of rows in that class>`` and stores the log of the
    resulting ratio in ``r``. ``transform`` multiplies every row of ``X``
    element-wise by ``r``.

    Parameters
    ----------
    alpha : float, default=1
        Additive smoothing applied to both the per-class feature sums and the
        per-class row counts.

    Attributes
    ----------
    r : scipy.sparse.csr_matrix or None
        Row vector of log-count ratios; ``None`` until ``fit`` has been called.
    """

    def __init__(self, alpha=1):
        self.r = None
        self.alpha = alpha

    def fit(self, X, y):
        """Compute and store the smoothed log-count ratio of every feature.

        Parameters
        ----------
        X : sparse matrix
            Feature matrix, one row per sample.
        y : array-like
            Binary labels (0 / 1), one per row of ``X``.

        Returns
        -------
        NBTransformer
            ``self``, to allow call chaining.
        """
        # Coerce so that lists / pandas objects yield element-wise boolean masks;
        # with a plain list, `y == 1` would collapse to a single False.
        y = np.asarray(y)
        pos_mask = y == 1
        neg_mask = y == 0

        # Smoothed per-feature sums for each class.
        p = self.alpha + X[pos_mask].sum(0)
        q = self.alpha + X[neg_mask].sum(0)

        # Store the smoothed log-count ratio.
        self.r = csr_matrix(np.log(
            (p / (self.alpha + pos_mask.sum())) /
            (q / (self.alpha + neg_mask.sum()))
        ))
        return self

    def transform(self, X, y=None):
        """Scale each column of ``X`` by its fitted log-count ratio.

        Parameters
        ----------
        X : sparse matrix
            Feature matrix with the same columns as the one passed to ``fit``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        sparse matrix
            ``X`` multiplied element-wise (row by row) by ``r``.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.r is None:
            raise ValueError('NBTransformer must be fitted before calling transform().')
        return X.multiply(self.r)

In [None]:
# Learn the per-feature log-count ratios from the full labelled training matrix.
# NOTE(review): because the ratios are fitted on every training label, any
# cross-validation run on X_nb uses features derived from its own validation labels.
nb_transformer = NBTransformer(alpha=1)
nb_transformer.fit(X, y)

# Apply the same ratios to the training and the test matrix.
X_nb, X_test_nb = (nb_transformer.transform(matrix) for matrix in (X, X_test))

## Cross Validation & Testing

### 1. Logistic Regression

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# 20-fold stratified cross-validation of logistic regression on NB-weighted TF-IDF.
DATA_SPLIT_SEED = 42
models = []
train_meta = np.zeros(X.shape[0])      # out-of-fold P(target == 1) for every training row
test_meta = np.zeros(X_test.shape[0])  # fold-averaged P(target == 1) for every test row
splits = list(StratifiedKFold(n_splits=20, shuffle=True, random_state=DATA_SPLIT_SEED).split(df_train, y))
for train_idx, valid_idx in splits:
    y_train = y[train_idx]

    # Fit the NB log-count ratios on the training fold only. Using ratios fitted on
    # all labels would let the validation labels leak into the validation features
    # and inflate the out-of-fold AUC / F1 (and bias the threshold chosen from them).
    fold_nb = NBTransformer(alpha=1).fit(X[train_idx], y_train)
    X_train = fold_nb.transform(X[train_idx])
    X_val = fold_nb.transform(X[valid_idx])

    model = LogisticRegression(solver='lbfgs', dual=False, class_weight='balanced', C=0.5, max_iter=100)
    model.fit(X_train, y_train)
    models.append(model)

    # Column 1 of predict_proba is the probability of the positive class.
    train_meta[valid_idx] = model.predict_proba(X_val)[:, 1]
    # Each fold contributes 1/len(splits) of the final test probability.
    test_meta += model.predict_proba(fold_nb.transform(X_test))[:, 1] / len(splits)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

def threshold_search(y_true, y_proba):
    """Grid-search the decision threshold that gives the highest F1 score.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in increasing order; a sample is
    labelled positive when its score is strictly greater than the threshold.
    On ties the lowest threshold wins.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : numpy.ndarray
        Predicted scores, one per entry of ``y_true``.

    Returns
    -------
    dict
        ``{'threshold': best threshold, 'f1': F1 at that threshold}``; both are
        0 when no threshold reaches an F1 above 0.
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': f1}
    return best

In [None]:
# Out-of-fold ROC AUC, then the F1-optimal threshold on the same predictions.
oof_auc = roc_auc_score(y, train_meta)
search_result = threshold_search(y, train_meta)
print(oof_auc)
print(search_result)

In [None]:
# Build the submission from a new frame instead of dropping the column from df_test
# in place: the in-place drop made every later submission cell fail with a KeyError
# and prevented re-running cells that still need 'question_text'.
# errors='ignore' tolerates the column having already been removed.
submission_lr = df_test.drop(columns=['question_text'], errors='ignore')
# Positive when the fold-averaged probability exceeds the F1-optimal threshold.
submission_lr['target'] = (test_meta > search_result['threshold']).astype(int)
submission_lr.to_csv('/kaggle/working/submission_lr.csv', index=False)

### 2. Multinomial Naive Bayes

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the NB-weighted features with a MinMaxScaler fitted on the training matrix
# (presumably because MultinomialNB needs non-negative inputs — confirm).
# NOTE(review): .toarray() materialises both matrices densely, which can be very
# memory-hungry for a 1- to 4-gram vocabulary.
scaler = MinMaxScaler()
X_nb_scaled, X_test_nb_scaled = (
    scaler.fit_transform(X_nb.toarray()),
    scaler.transform(X_test_nb.toarray()),
)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

# 20-fold stratified cross-validation of Multinomial Naive Bayes on the scaled features.
# NOTE(review): X_nb_scaled derives from NB ratios fitted on all training labels,
# so the out-of-fold scores collected here are likely optimistic.
DATA_SPLIT_SEED = 42
splits = list(
    StratifiedKFold(n_splits=20, shuffle=True, random_state=DATA_SPLIT_SEED).split(df_train, y)
)
models = []
train_meta = np.zeros(X_nb.shape[0])       # out-of-fold positive-class probabilities
test_meta = np.zeros(X_test_nb.shape[0])   # fold-averaged positive-class probabilities
for idx, (train_idx, valid_idx) in enumerate(splits):
    X_train, y_train = X_nb_scaled[train_idx], y[train_idx]
    X_val, y_val = X_nb_scaled[valid_idx], y[valid_idx]

    model = MultinomialNB()
    model.fit(X_train, y_train)
    models.append(model)

    # Column 1 of predict_proba is the probability of the positive class.
    valid_pred = model.predict_proba(X_val)
    train_meta[valid_idx] = valid_pred[:, 1]
    test_meta += model.predict_proba(X_test_nb_scaled)[:, 1] / len(splits)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

# Same definition as the threshold_search in the Logistic Regression section.
def threshold_search(y_true, y_proba):
    """Return the F1-maximising decision threshold and the F1 it achieves.

    Scans thresholds 0.00, 0.01, ..., 0.99 in increasing order, predicting
    positive when a score is strictly above the threshold. The first threshold
    that reaches the best F1 is kept.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : numpy.ndarray
        Predicted scores, one per entry of ``y_true``.

    Returns
    -------
    dict
        ``{'threshold': ..., 'f1': ...}``; both values are 0 if no threshold
        gives an F1 above 0.
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': f1}
    return best

In [None]:
# Out-of-fold ROC AUC, then the F1-optimal threshold on the same predictions.
oof_auc = roc_auc_score(y, train_meta)
search_result = threshold_search(y, train_meta)
print(oof_auc)
print(search_result)

In [None]:
# Build the submission from a new frame instead of dropping the column from df_test
# in place: a second in-place drop of 'question_text' raises a KeyError.
# errors='ignore' tolerates the column having already been removed.
submission_mnb = df_test.drop(columns=['question_text'], errors='ignore')
# Positive when the fold-averaged probability exceeds the F1-optimal threshold.
submission_mnb['target'] = (test_meta > search_result['threshold']).astype(int)
submission_mnb.to_csv('/kaggle/working/submission_mnb.csv', index=False)

### 3. Linear SVM

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

# 20-fold stratified cross-validation of a linear SVM on NB-weighted TF-IDF.
DATA_SPLIT_SEED = 42
models = []
train_meta = np.zeros(X.shape[0])      # out-of-fold hard 0/1 predictions
test_meta = np.zeros(X_test.shape[0])  # fraction of fold models voting 1 for each test row
splits = list(StratifiedKFold(n_splits=20, shuffle=True, random_state=DATA_SPLIT_SEED).split(df_train, y))
for train_idx, valid_idx in splits:
    y_train = y[train_idx]

    # Fit the NB log-count ratios on the training fold only, so the validation
    # labels cannot leak into the validation features and inflate the CV score.
    fold_nb = NBTransformer(alpha=1).fit(X[train_idx], y_train)
    X_train = fold_nb.transform(X[train_idx])
    X_val = fold_nb.transform(X[valid_idx])

    # random_state pins the solver's internal shuffling so reruns are reproducible.
    model = LinearSVC(class_weight='balanced', random_state=DATA_SPLIT_SEED)
    model.fit(X_train, y_train)
    models.append(model)

    # LinearSVC has no predict_proba; hard labels are stored instead.
    train_meta[valid_idx] = model.predict(X_val)
    # Each fold adds 1/len(splits) for every test row it labels 1.
    test_meta += model.predict(fold_nb.transform(X_test)) / len(splits)

In [None]:
# F1 of the out-of-fold SVM predictions against the training labels.
f1_score(y, train_meta)

In [None]:
# Build the submission from a new frame instead of dropping the column from df_test
# in place: a second in-place drop of 'question_text' raises a KeyError.
# errors='ignore' tolerates the column having already been removed.
submission_svm = df_test.drop(columns=['question_text'], errors='ignore')
# test_meta holds the fraction of fold models that voted 1. Truncating it with
# astype(int) would mark a row positive only if the float sum reached 1.0, so
# take a strict majority vote instead.
submission_svm['target'] = (test_meta > 0.5).astype(int)
submission_svm.to_csv('/kaggle/working/submission_svm.csv', index=False)

### 4. Random Forest

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# 20-fold stratified cross-validation of a random forest on NB-weighted TF-IDF.
DATA_SPLIT_SEED = 42
models = []
train_meta = np.zeros(X.shape[0])      # out-of-fold P(target == 1) for every training row
test_meta = np.zeros(X_test.shape[0])  # fold-averaged P(target == 1) for every test row
splits = list(StratifiedKFold(n_splits=20, shuffle=True, random_state=DATA_SPLIT_SEED).split(df_train, y))
for train_idx, valid_idx in splits:
    y_train = y[train_idx]

    # Fit the NB log-count ratios on the training fold only, so the validation
    # labels cannot leak into the validation features and inflate the CV score.
    fold_nb = NBTransformer(alpha=1).fit(X[train_idx], y_train)
    X_train = fold_nb.transform(X[train_idx])
    X_val = fold_nb.transform(X[valid_idx])

    # Without random_state the forest differs on every run, making the reported
    # scores and the submission irreproducible.
    model = RandomForestClassifier(
        n_estimators=20,
        min_samples_leaf=20,
        class_weight='balanced',
        random_state=DATA_SPLIT_SEED,
    )
    model.fit(X_train, y_train)
    models.append(model)

    # Column 1 of predict_proba is the probability of the positive class.
    train_meta[valid_idx] = model.predict_proba(X_val)[:, 1]
    # Each fold contributes 1/len(splits) of the final test probability.
    test_meta += model.predict_proba(fold_nb.transform(X_test))[:, 1] / len(splits)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

# Same definition as the threshold_search in the Logistic Regression section.
def threshold_search(y_true, y_proba):
    """Find the decision threshold with the highest F1 score.

    Candidate thresholds are 0.00, 0.01, ..., 0.99, visited in increasing order.
    A sample counts as positive when its score is strictly greater than the
    candidate; a later candidate replaces the current best only if its F1 is
    strictly higher.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : numpy.ndarray
        Predicted scores, one per entry of ``y_true``.

    Returns
    -------
    dict
        ``{'threshold': ..., 'f1': ...}``; both values are 0 if no candidate
        gives an F1 above 0.
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': f1}
    return best

In [None]:
# Out-of-fold ROC AUC, then the F1-optimal threshold on the same predictions.
oof_auc = roc_auc_score(y, train_meta)
search_result = threshold_search(y, train_meta)
print(oof_auc)
print(search_result)

In [None]:
# Build the submission from a new frame instead of dropping the column from df_test
# in place: a second in-place drop of 'question_text' raises a KeyError.
# errors='ignore' tolerates the column having already been removed.
submission_rf = df_test.drop(columns=['question_text'], errors='ignore')
# Positive when the fold-averaged probability exceeds the F1-optimal threshold.
submission_rf['target'] = (test_meta > search_result['threshold']).astype(int)
submission_rf.to_csv('/kaggle/working/submission_rf.csv', index=False)