In [35]:
import os
import sys
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import numpy as np
from scipy.sparse import hstack

import string
import spacy
import contractions

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import hamming_loss, f1_score, jaccard_score

from lightgbm import LGBMClassifier

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=16)

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utils import utils

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# config variables and constants
dataset_path = Path("../datasets")
dataset_path.mkdir(exist_ok=True)

In [42]:
def clean_text(text):
    """ CLean question body and title by lemmanizing, removing stopword and puctuations """
    text = nlp(contractions.fix(text))
    new_tokens = []
    for tok in text:
        lemmanized_tok = tok.lemma_.lower().strip()
        if lemmanized_tok in  binarizer.classes_:
            new_tokens.append(lemmanized_tok)
        elif tok.lemma_ != "-PRON-" and tok.is_stop != True and tok.is_punct != True and tok.is_digit != True:
            new_tokens.append(lemmanized_tok)
            
    return ' '.join(new_tokens)

def transform_tf_idf(x):
    """ Tranasform a document to tf-idf representation """
    vectorizer = TfidfVectorizer(analyzer = 'word',
                                 encoding = 'utf-8',
                                 ngram_range = (1, 2),
                                 max_features=1000)
    return vectorizer.fit_transform(x)

def output_score(clf, y_pred, y_test):
    """ Return evaluation metrics as dataframe """
    performance = { 
                   "Jaccard":jaccard_score(y_test, y_pred, average="weighted"), 
                   "Humming": hamming_loss(y_test, y_pred)*100,
                   "F1": f1_score(y_test, y_pred, average="weighted"),
                  }
    return pd.DataFrame(performance, index=[clf.__class__.__name__])
    
def train_test_model(clf, X_train, X_test, y_train, y_test):
    """ Train and test a classifier """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return output_score(clf, y_pred, y_test)

In [4]:
df_full = pd.read_parquet(dataset_path/"cleaned_df.parquet")

In [5]:
df_full

Unnamed: 0,Tag,BodyCleaned,TitleCleaned
0,"sql,asp.net",Has anyone got experience creating SQL-based A...,ASP.NET Site Maps
1,algorithm,This is something I've pseudo-solved many time...,Function for creating color wheels
2,"c#,.net",I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...
3,c++,I am working on a collection of classes used f...,Should I use nested classes in this case?
4,".net,web-services",I've been writing a few web services for a .ne...,Homegrown consumption of web services
...,...,...,...
965707,javascript,"I'm trying to detect the ""flash out of date"" e...","YouTube iFrame API: no ready call, no error call"
965708,"python,bash",I need to extend a shell script (bash). As I a...,How to execute multiline python code from a ba...
965709,"php,.htaccess",I am building a custom MVC project and I have ...,URL routing in PHP (MVC)
965710,android,Under minifyEnabled I changed from false to tr...,Obfuscating code in android studio


## Text/target pre-processing

First, we need to encode our target variable for classification task. We can use binary label generator which creates a binary representation for each label.

In [6]:
tags = df_full["Tag"].apply(lambda x: (x.split(',')))
binarizer =  MultiLabelBinarizer()
labels = binarizer.fit_transform(tags)
df_full["labels"] = list(labels)
df_full

Unnamed: 0,Tag,BodyCleaned,TitleCleaned,labels
0,"sql,asp.net",Has anyone got experience creating SQL-based A...,ASP.NET Site Maps,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,algorithm,This is something I've pseudo-solved many time...,Function for creating color wheels,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"c#,.net",I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,c++,I am working on a collection of classes used f...,Should I use nested classes in this case?,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,".net,web-services",I've been writing a few web services for a .ne...,Homegrown consumption of web services,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
965707,javascript,"I'm trying to detect the ""flash out of date"" e...","YouTube iFrame API: no ready call, no error call","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
965708,"python,bash",I need to extend a shell script (bash). As I a...,How to execute multiline python code from a ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
965709,"php,.htaccess",I am building a custom MVC project and I have ...,URL routing in PHP (MVC),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
965710,android,Under minifyEnabled I changed from false to tr...,Obfuscating code in android studio,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


For the baseline model, which will use a basic encoding schema like TF-IDF vectorization for text body and title, further cleaning needed like:
- Text case lowering
- Text processing like stopword removal, lemmanization, punctuation removal while keeping target label words intact

Those steps are fundamental for baseline model because model quality heavily depending on the text pre-processing quality

In [7]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.4.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [8]:
nlp = spacy.load("en_core_web_lg")

# configure pipeline
nlp.select_pipes(disable=['ner', 'parser', 'tok2vec'])

['ner', 'parser', 'tok2vec']

In [9]:
df_full['BodyCleanedPost'] = df_full['BodyCleaned'].parallel_apply(lambda x: clean_text(x))
df_full['TitleCleanedPost'] = df_full['TitleCleaned'].parallel_apply(lambda x: clean_text(x)) 

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=60357), Label(value='0 / 60357')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=60357), Label(value='0 / 60357')))…

In [13]:
utils.compare_body(df_full, before_col="BodyCleaned", after_col="BodyCleanedPost")

*******Raw text: BodyCleaned*******
I'm writing a custom reactor for twisted and it needs some cleaning to do when it has to stop.
I tried overriding the stop method like this:
def stop(self):
    posixbase.PosixReactorBase.stop(self)
    #cleanup code here

It does, however, seem to not always be called. When I run trial like this python -m cProfile /usr/bin/trial -r custom tests/ | grep "stop" the only stop methods that are called are:
    2    0.000    0.000    0.000    0.000 abstract.py:397(stopReading)
    1    0.000    0.000    0.000    0.000 abstract.py:405(stopWriting)
    1    0.000    0.000    0.000    0.000 log.py:691(stop)
    1    0.000    0.000    0.000    0.000 protocol.py:678(stopProtocol)
    3    0.000    0.000    0.000    0.000 reporter.py:97(stopTest)
    3    0.000    0.000    0.000    0.000 result.py:79(stopTest)
    1    0.000    0.000    0.000    0.000 udp.py:218(stopListening)
*******Cleaned text:BodyCleanedPost *******
writing custom reactor twisted needs clea

In [14]:
utils.compare_body(df_full, before_col="TitleCleaned", after_col="TitleCleanedPost")

*******Raw text: TitleCleaned*******
AngularJS nested ng-repeat onclick
*******Cleaned text:TitleCleanedPost *******
angularjs nested ng repeat onclick
##########

*******Raw text: TitleCleaned*******
Is there a better way of doing this in mysql? - update entire column with another select and group by
*******Cleaned text:TitleCleanedPost *******
better way mysql update entire column select group
##########

*******Raw text: TitleCleaned*******
Android recieving classcastexception despite being in correct class
*******Cleaned text:TitleCleanedPost *******
android recieving classcastexception despite correct class
##########



## Baseline Model

For the baseline model, we will using TF-IDF vectorizer to embed text with different classifier. Lets first split the data to train/test before TF-IDF trasnformation to avoid leakage.

In [15]:
x_train_title, x_test_title, y_train, y_test = train_test_split(df_full["TitleCleanedPost"], df_full["labels"], test_size=0.1, random_state = 0)
x_train_body, x_test_body, y_train, y_test = train_test_split(df_full["BodyCleanedPost"], df_full["labels"], test_size=0.1, random_state = 0)

In [18]:
vectorized_title_train = transform_tf_idf(x_train_title)
vectorized_title_test = transform_tf_idf(x_test_title)

vectorized_body_train = transform_tf_idf(x_train_body)
vectorized_body_test = transform_tf_idf(x_test_body)

we can stack body and title tf-idf vectors to get a combined representation of both part.

In [29]:
vectorized_stacked_train = hstack([vectorized_title_train, vectorized_body_train])
vectorized_stacked_test = hstack([vectorized_title_test, vectorized_body_test])
y_train = np.stack(y_train.values)
y_test = np.stack(y_test.values)

In [30]:
vectorized_stacked_train.shape

(869140, 2000)

We can test different classifiers on top of our TF-IDF embeddings. For multi-label classifications, there are different paradigms to make a classifier to support multi-label, some of them:
- OnevsRest (for multilabel problem, same with Binary Relevance): One classifier is fitted for each class to model the problem as binary classification problem. Disadvantage is that it assumes that the labels are mutually exclusive and could not get the interconnections between labels.
- Classifier Chains: A stack of C binary classifiers (C is number of classes)trained sequentually with the output of previous classifier, disadvantage: quality depending on the order of labels, not suitable for large number of labels.
- Label Powerset: Assing a class for each of label combination and use multi-class approach, disadvantage: with large label combinatorial space, model is underfitting and performace is poor.

We can use the simplest and most popular approach: OnevsRest

In [None]:
scores = []
models = [#OneVsRestClassifier(LogisticRegression(class_weight='balanced'), n_jobs=-1),
          OneVsRestClassifier(SGDClassifier(class_weight='balanced', loss='modified_huber'), n_jobs=-1),
          KNeighborsClassifier(n_jobs=-1),
          #RandomForestClassifier(class_weight='balanced', n_jobs=-1),
          OneVsRestClassifier(LGBMClassifier(is_unbalance=True), n_jobs=-1)
         ]

for model in tqdm(models):
    score = train_test_model(model, vectorized_stacked_train, vectorized_stacked_test,  y_train, y_test)
    scores.append(score)

scores = pd.concat(scores)
scores

 50%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 2/4 [10:47<12:23, 371.83s/it]