In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/8f/e9/c2b4c823b3959d475a570c1bd2df4125478e2e37b96fb967a87933ae7134/transformers-4.18.0-py3-none-any.whl (4.0MB)
[K     |████████████████████████████████| 4.0MB 573kB/s 
[?25hCollecting numpy>=1.17
[?25l  Downloading https://files.pythonhosted.org/packages/14/32/d3fa649ad7ec0b82737b92fefd3c4dd376b0bb23730715124569f38f3a08/numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl (14.8MB)
[K     |████████████████████████████████| 14.8MB 39.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/28/78/fef8d089db5b97546fd6d1ff2e813b8544e85670bf3a8c378c9d0250b98d/sacremoses-0.0.53.tar.gz (880kB)
[K     |████████████████████████████████| 880kB 40.7MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
[?25l  Downloading https://files.pythonhosted.org/packages/36/22/26b08c841c0493908b4be6960ec2be14a21d1ec0f42ae0cedbca5599ad3d/tokenizers-0.12.1-cp36-cp36m-manylinux_2_12_x86_6

In [2]:
import transformers

In [3]:
import sys
import glob
import torch


import os
import re
import gc
import pickle  
import random
import string

import numpy as np
import pandas as pd
from scipy import stats

# import transformers
from transformers import DistilBertTokenizer,DistilBertModel
import math


from scipy.stats import spearmanr, rankdata
from os.path import join as path_join
from numpy.random import seed
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

seed(42)
random.seed(42)

import nltk
from nltk.corpus import stopwords

from sklearn.base import clone
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder, RobustScaler, KBinsDiscretizer, QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold, GroupKFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, RANSACRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import ExtraTreesRegressor

eng_stopwords = set(stopwords.words("english"))

import tensorflow as tf
import tensorflow_hub as hub

In [4]:
# settings
# Competition input lives under data_dir; both output prefixes are empty
# strings, so result files are written to the current working directory.
data_dir = '../input/google-quest-challenge/'
metas_dir, sub_dir = '', ''

RANDOM_STATE = 42

# Month+day stamp (four digits) that is appended to the prediction file names.
import datetime
todate = format(datetime.date.today(), "%m%d")


# Functions

In [5]:
# count words
def word_count(xstring):
    """Count whitespace-separated words.

    Parameters
    ----------
    xstring : str or pandas.Series of str
        A single text, or a Series of texts.

    Returns
    -------
    int or pandas.Series
        The word count of the text, or one count per element for a Series.
    """
    # Anything exposing the pandas ``.str`` accessor is handled element-wise;
    # a plain Python string has no such attribute and is counted directly.
    if hasattr(xstring, 'str'):
        return xstring.str.split().str.len()
    return len(xstring.split())


def spearman_corr(y_true, y_pred):
    """Spearman rank correlation between labels and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Either both 1-D, or both 2-D with one column per target. DataFrames
        and nested lists are accepted because both are converted to arrays.

    Returns
    -------
    float
        The correlation for 1-D input; for 2-D predictions, the mean of the
        per-column correlations.
    """
    # Convert up front so positional column slicing below also works when the
    # caller passes pandas objects.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_pred.ndim == 2:
        per_column = [
            stats.spearmanr(y_true[:, i], y_pred[:, i])[0]
            for i in range(y_true.shape[1])
        ]
        return np.mean(per_column)
    return stats.spearmanr(y_true, y_pred)[0]
    
# scikit-learn scorer built from spearman_corr; flagged as higher-is-better.
custom_scorer = make_scorer(spearman_corr, greater_is_better=True)

In [6]:
def chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    total = len(l)
    yield from (l[start:start + n] for start in range(0, total, n))

In [7]:
def fetch_vectors(string_list, batch_size=64):
    """Embed texts with DistilBERT and return the first-token hidden states.

    Each text is cut to its first 300 whitespace-separated words, tokenised
    with special tokens, truncated to 512 token ids and right-padded with id 0
    to exactly 512. The model's last hidden state at token position 0 is kept
    as the feature vector for the text.

    Parameters
    ----------
    string_list : sequence of str
        Texts to embed; must support ``len`` and slicing.
    batch_size : int, default 64
        Number of texts sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        One row per input text. For empty input an array with zero rows and
        ``model.config.dim`` columns is returned.
    """
    # inspired by https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
    max_words = 300  # words kept per text before tokenising
    max_len = 512    # token ids kept per text, also the padded width

    # Use the GPU when one is present, otherwise fall back to the CPU instead
    # of failing on machines without CUDA.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = transformers.DistilBertTokenizer.from_pretrained("../input/distilbertbaseuncased/")
    model = transformers.DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")
    model.to(DEVICE)
    model.eval()  # inference only: make sure dropout is disabled

    fin_features = []
    for data in chunks(string_list, batch_size):
        tokenized = []
        for x in data:
            x = " ".join(x.strip().split()[:max_words])
            tok = tokenizer.encode(x, add_special_tokens=True)
            tokenized.append(tok[:max_len])

        # Right-pad with id 0; the attention mask marks every non-zero id as a
        # real token so the padding is ignored by the model.
        padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized])
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded).to(DEVICE)
        attention_mask = torch.tensor(attention_mask).to(DEVICE)

        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)

        # [0] is the last hidden state; keep only token position 0 of each text.
        features = last_hidden_states[0][:, 0, :].cpu().numpy()
        fin_features.append(features)

    if not fin_features:
        # np.vstack raises on an empty list, so return an explicit empty matrix.
        return np.empty((0, model.config.dim), dtype=np.float32)

    fin_features = np.vstack(fin_features)
    return fin_features

# Data

In [8]:
# load the data

# Read the competition train and test tables from data_dir.
xtrain, xtest = (
    pd.read_csv(data_dir + csv_name) for csv_name in ('train.csv', 'test.csv')
)


In [9]:
xtrain.head(4)

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0


In [10]:
# Names of the 30 label columns: the 21 question-level targets first,
# followed by the 9 answer-level targets.
target_cols = ['question_' + suffix for suffix in (
    'asker_intent_understanding',
    'body_critical',
    'conversational',
    'expect_short_answer',
    'fact_seeking',
    'has_commonly_accepted_answer',
    'interestingness_others',
    'interestingness_self',
    'multi_intent',
    'not_really_a_question',
    'opinion_seeking',
    'type_choice',
    'type_compare',
    'type_consequence',
    'type_definition',
    'type_entity',
    'type_instructions',
    'type_procedure',
    'type_reason_explanation',
    'type_spelling',
    'well_written',
)] + ['answer_' + suffix for suffix in (
    'helpful',
    'level_of_information',
    'plausible',
    'relevance',
    'satisfaction',
    'type_instructions',
    'type_procedure',
    'type_reason_explanation',
    'well_written',
)]

# EDA / FE

## Basic FE

In [11]:
# Whitespace-token counts for the title, body and answer, stored as
# <column>_word_len in both the train and the test frame.
for colname in ('question_title', 'question_body', 'answer'):
    newname = f'{colname}_word_len'
    for frame in (xtrain, xtest):
        frame[newname] = frame[colname].str.split().str.len()

# keep the notebook namespace free of loop leftovers
del newname, colname, frame

In [12]:
for colname in ['question', 'answer']:

    # check for nonames, i.e. users with logins like user12389
    # The pattern is a raw string so that \d is not an invalid string escape.
    # na=False makes a missing user name count as "not a no-name user" (0)
    # instead of propagating NaN into the flag column.
    xtrain['is_'+colname+'_no_name_user'] = xtrain[colname +'_user_name'].str.contains(r'^user\d+$', na=False).astype(int)
    xtest['is_'+colname+'_no_name_user'] = xtest[colname +'_user_name'].str.contains(r'^user\d+$', na=False).astype(int)


def _lexical_diversity(text):
    """Share of distinct whitespace tokens in ``text``; 0.0 when it has no tokens."""
    words = text.split()
    if not words:
        # an empty or whitespace-only text would otherwise divide by zero
        return 0.0
    return len(set(words)) / len(words)


colname = 'answer'
# check lexical diversity (unique words count vs total )
xtrain[colname+'_div'] = xtrain[colname].apply(_lexical_diversity)
xtest[colname+'_div'] = xtest[colname].apply(_lexical_diversity)


In [13]:
## domain components
def _host_parts(url):
    """Return the dot-separated pieces of the host name in ``url``."""
    return url.split('://')[1].split('/')[0].split('.')


for df in [xtrain, xtest]:

    # host name of the asker's profile page, split on dots
    host_parts = df['question_user_page'].apply(_host_parts)

    # count components
    df['dom_cnt'] = host_parts.apply(len)

    # two 'none' entries are appended so that positions 0..3 can be read
    # from hosts that have fewer name components
    padded_parts = host_parts.apply(lambda parts: parts + ['none', 'none'])

    # components
    for ii in range(4):
        df['dom_' + str(ii)] = padded_parts.apply(lambda parts, pos=ii: parts[pos])

In [14]:
# shared elements
def _content_words(text):
    """Whitespace tokens of ``text`` that are not in ``eng_stopwords`` (no lower-casing)."""
    return [tok for tok in text.split() if tok not in eng_stopwords]


for df in [xtrain, xtest]:
    q_tokens = df['question_body'].apply(_content_words)
    a_tokens = df['answer'].apply(_content_words)

    # number of distinct tokens that occur in both the question body and the answer
    overlap = [len(np.intersect1d(q, a)) for q, a in zip(q_tokens, a_tokens)]

    df['qa_word_overlap'] = overlap
    # overlap relative to the answer length, then to the question length;
    # the +1 keeps the denominator non-zero
    df['qa_word_overlap_norm1'] = [n / (1 + len(a)) for n, a in zip(overlap, a_tokens)]
    df['qa_word_overlap_norm2'] = [n / (1 + len(q)) for n, q in zip(overlap, q_tokens)]



In [15]:
def _num_chars(x):
    """Number of characters in the text."""
    return len(str(x))


def _num_stopwords(x):
    """Number of lower-cased whitespace tokens found in ``eng_stopwords``."""
    return len([w for w in str(x).lower().split() if w in eng_stopwords])


def _num_punctuations(x):
    """Number of characters that belong to ``string.punctuation``."""
    return len([c for c in str(x) if c in string.punctuation])


def _num_words_upper(x):
    """Number of whitespace tokens for which ``str.isupper`` is true."""
    return len([w for w in str(x).split() if w.isupper()])


# (column suffix, statistic) pairs, in the order the columns are created
text_stats = [
    ('num_chars', _num_chars),
    ('num_stopwords', _num_stopwords),
    ('num_punctuations', _num_punctuations),
    ('num_words_upper', _num_words_upper),
]

for df in [xtrain, xtest]:
    for suffix, stat_func in text_stats:
        for text_col in ('question_title', 'question_body', 'answer'):
            df[text_col + '_' + suffix] = df[text_col].apply(stat_func)


## FE - distance-based 

In [16]:
# Local directory of the TF-Hub module to load; judging by the directory name
# this is presumably the Universal Sentence Encoder (large, v4) - TODO confirm.
module_url = "../input/universalsentenceencoderlarge4/"
# Loaded once here and called on batches of raw strings in the next cell.
embed = hub.load(module_url)

In [17]:
def _as_sentences(column):
    """Return the texts of ``column`` as a list with '?' and '!' turned into '.'.

    ``regex=False`` makes both replacements literal, so the result does not
    depend on the pandas version's default for the ``regex`` argument
    ('?' is a regular-expression metacharacter).
    """
    return (column
            .str.replace('?', '.', regex=False)
            .str.replace('!', '.', regex=False)
            .tolist())


def _embed_in_batches(embed_fn, texts, batch_size=4):
    """Run ``embed_fn`` over ``texts`` in slices of ``batch_size`` and stack the "outputs" arrays."""
    batches = [
        embed_fn(texts[start:start + batch_size])["outputs"].numpy()
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(batches)


# One embedding matrix per text column, keyed '<column>_embedding'.
embeddings_train = {}
embeddings_test = {}
for text in ['question_title', 'question_body', 'answer']:
    embeddings_train[text + '_embedding'] = _embed_in_batches(embed, _as_sentences(xtrain[text]))
    embeddings_test[text + '_embedding'] = _embed_in_batches(embed, _as_sentences(xtest[text]))

    # progress marker: one line per finished column
    print(text)

# the encoder is not needed after this cell; drop the reference
del embed

question_title
question_body
answer


In [18]:
def l2_dist(x, y):
    """Row-wise sum of squared differences (squared L2 distance, no square root)."""
    return np.power(x - y, 2).sum(axis=1)


def cos_dist(x, y):
    """Row-wise dot product of ``x`` and ``y``; the rows are not normalised here."""
    return (x*y).sum(axis=1)


def _distance_features(emb):
    """Build the six distance columns from a dict of embedding matrices.

    Columns 0-2 hold ``l2_dist`` and columns 3-5 hold ``cos_dist``, each for
    the pairs (title, answer), (body, answer) and (body, title).
    """
    pairs = [
        ('question_title_embedding', 'answer_embedding'),
        ('question_body_embedding', 'answer_embedding'),
        ('question_body_embedding', 'question_title_embedding'),
    ]
    columns = [
        metric(emb[left], emb[right])
        for metric in (l2_dist, cos_dist)
        for left, right in pairs
    ]
    # stack as rows, then transpose so that each text is one row
    return np.array(columns).T


dist_features_train = _distance_features(embeddings_train)
dist_features_test = _distance_features(embeddings_test)

# the raw embeddings are not used after this point
del embeddings_train, embeddings_test

In [19]:
# Copy the six distance columns into the feature frames as dist0 .. dist5.
for frame, dist_matrix in ((xtrain, dist_features_train), (xtest, dist_features_test)):
    for ii in range(6):
        frame['dist' + str(ii)] = dist_matrix[:, ii]
    

# Model

## Pipeline buildup

In [20]:
# max_features cap for the character n-gram TF-IDF vectorizers
limit_char = 5000
# max_features cap for the word n-gram TF-IDF vectorizers
limit_word = 25000

In [21]:
def _word_tfidf():
    """Fresh word-level TF-IDF pipeline: case-preserving uni/bi-grams, capped at ``limit_word`` features."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(lowercase = False, max_df = 0.3, min_df = 1,
                                  binary = False, use_idf = True, smooth_idf = False,
                                  ngram_range = (1,2), stop_words = 'english',
                                  token_pattern = '(?u)\\b\\w+\\b' , max_features = limit_word ))
    ])


def _char_tfidf():
    """Fresh character-level TF-IDF pipeline: 1- to 4-grams, capped at ``limit_char`` features.

    No ``stop_words`` argument is passed: scikit-learn only applies stop words
    when ``analyzer='word'``, so it has no effect on a character analyzer.
    """
    return Pipeline([
        ('tfidf2', TfidfVectorizer(sublinear_tf=True,
                                   strip_accents='unicode', analyzer='char',
                                   ngram_range=(1, 4), max_features= limit_char))
    ])


# Every text column gets its own word-level and character-level vectorizer.
title_col = 'question_title'
title_transformer = _word_tfidf()
title_transformer2 = _char_tfidf()

body_col = 'question_body'
body_transformer = _word_tfidf()
body_transformer2 = _char_tfidf()

answer_col = 'answer'
answer_transformer = _word_tfidf()
answer_transformer2 = _char_tfidf()

# Hand-crafted numeric features: length/count statistics and embedding distances.
num_cols = [
    'question_title_word_len', 'question_body_word_len', 'answer_word_len', 'answer_div',
    'question_title_num_chars','question_body_num_chars','answer_num_chars',
    'question_title_num_stopwords','question_body_num_stopwords','answer_num_stopwords',
    'question_title_num_punctuations','question_body_num_punctuations','answer_num_punctuations',
    'question_title_num_words_upper','question_body_num_words_upper','answer_num_words_upper',
    'dist0', 'dist1', 'dist2', 'dist3', 'dist4',       'dist5'
]

# Missing numeric values become 0 before the Yeo-Johnson power transform.
num_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('scale', PowerTransformer(method='yeo-johnson'))
])


cat_cols = [
    'dom_0',
    'dom_1',
    'dom_2',
    'dom_3',
    'category',
    'is_question_no_name_user',
    'is_answer_no_name_user',
    'dom_cnt'
]

# Missing categories become '' and categories unseen during fit are ignored
# at transform time instead of raising.
cat_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers = [
        ('title', title_transformer, title_col),
        ('title2', title_transformer2, title_col),
        ('body', body_transformer, body_col),
        ('body2', body_transformer2, body_col),
        ('answer', answer_transformer, answer_col),
        ('answer2', answer_transformer2, answer_col),
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)

# Full model: feature extraction followed by a ridge regression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator',Ridge(random_state=RANDOM_STATE))
])

## Prepare features and targets

In [22]:
# prep
# Set the labels aside, then take the id column out of each frame so that
# only feature columns remain in xtrain / xtest.
ytrain = xtrain[target_cols]
id_train = xtrain.pop('qa_id')
xtrain.drop(columns=target_cols, inplace=True)

id_test = xtest.pop('qa_id')

In [23]:
# Raw user / URL columns that are removed from both feature frames.
dropcols = [
    'question_user_name',
    'question_user_page',
    'answer_user_name',
    'answer_user_page',
    'url',
    'host',
]

for frame in (xtrain, xtest):
    frame.drop(columns=dropcols, inplace=True)


## Folds

In [24]:
nfolds = 5
# Out-of-fold predictions for the training rows (one column per target) ...
mvalid = np.zeros((xtrain.shape[0], len(target_cols)))
# ... and fold-averaged predictions for the test rows.
mfull = np.zeros((xtest.shape[0], len(target_cols)))

# Rows sharing the same question body are kept in the same fold.
# split() returns a one-shot generator; materialise it so the fold indices
# survive a re-run of the training cell instead of silently yielding nothing.
kf = list(GroupKFold(n_splits= nfolds).split(X=xtrain.question_body, groups=xtrain.question_body))


In [25]:
 
for ind, (train_index, test_index) in enumerate(kf):

    # split
    # The fold indices are row positions, so select with .iloc; .loc would
    # only be correct while the frames carry a default 0..n-1 index.
    x0, x1 = xtrain.iloc[train_index], xtrain.iloc[test_index]
    y0, y1 = ytrain.iloc[train_index], ytrain.iloc[test_index]

    for ii in range(0, ytrain.shape[1]):

        # fit model: a fresh, unfitted copy of the pipeline per fold and target
        be = clone(pipeline)
        be.fit(x0, np.array(y0)[:,ii])

        # persist the fitted pipeline; the context manager closes the file
        filename = 'ridge_f' + str(ind) + '_c' + str(ii) + '.pkl'
        with open(filename, 'wb') as model_file:
            pickle.dump(be, model_file)

        # park forecast
        mvalid[test_index, ii] = be.predict(x1)
        # accumulates into mfull, which therefore has to be reset to zeros
        # before this loop is run again
        mfull[:,ii] += be.predict(xtest)/nfolds

    print('---')

---
---
---
---
---


## Performance

In [26]:
# Replace each prediction column by its ranks divided by the number of rows,
# then score the out-of-fold predictions against the labels per target.
n_valid_rows = mvalid.shape[0]
n_test_rows = mfull.shape[0]

corvec = np.zeros((ytrain.shape[1], 1))
for ii, target_name in enumerate(ytrain.columns):
    mvalid[:, ii] = rankdata(mvalid[:, ii]) / n_valid_rows
    mfull[:, ii] = rankdata(mfull[:, ii]) / n_test_rows

    corvec[ii] = spearmanr(ytrain[target_name], mvalid[:, ii])[0]

# mean Spearman correlation over all targets
print(corvec.mean())

0.3041240343355675


# Submission

In [27]:
def _prediction_frame(values, ids):
    """Wrap a prediction matrix in a frame with 'qa_id' first, then the target columns."""
    frame = pd.DataFrame(values)
    frame.columns = ytrain.columns
    frame['qa_id'] = ids
    ordered = ['qa_id'] + list(frame.columns[:-1])
    return frame[ordered]


# out-of-fold predictions for the training rows
prval = _prediction_frame(mvalid, id_train)
prval.to_csv(metas_dir + 'prval_ridge_'+todate+ '.csv', index = False)

# test predictions: a dated copy plus the submission file
prfull = _prediction_frame(mfull, id_test)
prfull.to_csv(metas_dir + 'prfull_ridge_'+todate+ '.csv', index = False)

prfull.to_csv(sub_dir + 'submission.csv', index = False)