# Predict tags on StackOverflow with linear models
Starter code: https://github.com/hse-aml/natural-language-processing/blob/master/week1/week1-MultilabelClassification.ipynb

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
from ast import literal_eval
import pandas as pd
import numpy as np
import re

# Data preparation

In [8]:
def read_data(filename):
    """Load a tab-separated file and parse its ``tags`` column.

    Every cell of ``tags`` is run through ``literal_eval``, so the stored
    literal text becomes the Python object it spells out.

    filename: source handed straight to ``pd.read_csv``.
    return: the loaded DataFrame with ``tags`` replaced by the parsed values.
    """
    frame = pd.read_csv(filename, sep='\t')
    parsed_tags = frame['tags'].apply(literal_eval)
    frame['tags'] = parsed_tags
    return frame

In [9]:
# Train and validation go through read_data, which also parses the `tags` column.
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
# The test split is read with plain read_csv, i.e. without any tag parsing
# (presumably because it carries no `tags` column — confirm against the file).
test = pd.read_csv('data/test.tsv', sep='\t')

In [10]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [78]:
# Titles are the model inputs; tag lists are the targets.
X_train = train['title'].values
y_train = train['tags'].values

X_val = validation['title'].values
y_val = validation['tags'].values

# Only titles are taken from the test frame.
X_test = test['title'].values

# Text preparation

In [22]:
# Raw string literals are used for the patterns so that regex escapes such as
# \[ \] \| reach the regex engine untouched instead of being parsed as
# (invalid) Python string escapes, which newer interpreters warn about.

# Separator-like symbols; text_prepare swaps each match for a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_';
# text_prepare deletes each match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopwords from NLTK, stored as a set for fast membership checks.
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        return: the normalized string: lowercased, REPLACE_BY_SPACE_RE matches
                turned into spaces, BAD_SYMBOLS_RE matches removed, and
                stopwords dropped; the surviving words are joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)  # separator symbols -> space
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)  # strip every disallowed symbol
    kept_words = [token for token in cleaned.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

In [24]:
def test_text_prepare():
    """Check text_prepare against two hand-written examples.

    return: an error message naming the first example whose prepared form
            differs from the expected one, otherwise a success message.
    """
    expected_by_title = {
        "SQL Server - any equivalent of Excel's CHOOSE function?":
            "sql server equivalent excels choose function",
        "How to free c++ memory vector<int> * arr?":
            "free c++ memory vectorint arr",
    }
    for title, expected in expected_by_title.items():
        if text_prepare(title) != expected:
            return "Wrong answer for the case: '%s'" % title
    return 'Basic tests are passed.'

In [26]:
# Report the outcome of the basic text_prepare checks.
print(test_text_prepare())

Basic tests are passed.


In [27]:
# Normalize every line of the reference file with text_prepare.
# The `with` block guarantees the file handle is closed as soon as reading
# finishes; iterating over a bare `open(...)` left that to the garbage collector.
with open('data/text_prepare_tests.tsv', encoding='utf-8') as tests_file:
    prepared_questions = [text_prepare(line.strip()) for line in tests_file]
# One prepared title per output line.
text_prepare_results = '\n'.join(prepared_questions)

In [29]:
# Show the prepared reference titles, one per line.
print(text_prepare_results)

sqlite php readonly
creating multiple textboxes dynamically
self one prefer javascript
save php date string mysql database timestamp
fill dropdownlist data xml file aspnet application
programmatically trigger jqueryui draggables drag event
get value method argument via reflection java
knockout mapingfromjs observablearray json object data gets lost
facebook connect localhost weird stuff
fullcalendar prev next click
syntaxerror unexpected token
effective way float double comparison
gem install rails fails dns error
listshuttle component richfaces getting updated
laravel responsedownload show images laravel
wrong rspec test
calendar display using java swing
python selenium import regular firefox profile addons
random number 2 variables values
altering http responses firefox extension
start session python web application
align radio buttons horizontally django forms
count number rows sqlite database
wordpress wp_rewrite rules
removing sheet excel 2005 using php
php fatal error function na

In [77]:
# Run every title of each split through text_prepare; the results are plain lists.
X_train = list(map(text_prepare, X_train))
X_val = list(map(text_prepare, X_val))
X_test = list(map(text_prepare, X_test))

In [33]:
# Peek at the first five training titles.
print(X_train[:5])

['draw stacked dotplot r', 'mysql select records datetime field less specified value', 'terminate windows phone 81 app', 'get current time specific country via jquery', 'configuring tomcat use ssl']


## Word and tag counts

In [79]:
from collections import defaultdict

# How many training examples carry each tag.
tags_counts = defaultdict(int)
for tag in (tag for tags in y_train for tag in tags):
    tags_counts[tag] += 1

# How many times each whitespace-separated word occurs across training titles.
words_counts = defaultdict(int)
for word in (word for text in X_train for word in text.split()):
    words_counts[word] += 1

In [37]:
# Rank (item, count) pairs from most to least frequent. Sorting ascending on
# the negated count keeps ties in their original relative order.
sorted_tags_counts = sorted(tags_counts.items(), key=lambda pair: -pair[1])
sorted_words_counts = sorted(words_counts.items(), key=lambda pair: -pair[1])

In [38]:
# Three most frequent tags, then three most frequent words.
print(sorted_tags_counts[:3])
print(sorted_words_counts[:3])

[('javascript', 19078), ('c#', 19077), ('java', 18661)]
[('using', 8278), ('php', 5614), ('java', 5501)]


## Bag of words

In [45]:
from scipy import sparse as sp_sparse

In [61]:
DICT_SIZE = 5000
# Vocabulary: the DICT_SIZE most frequent words, most frequent first.
sorted_words = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
# A word's rank in that list doubles as its feature index.
WORD2IDX = dict(zip(sorted_words, range(len(sorted_words))))
print(sorted_words[:3])
print(WORD2IDX['using'], WORD2IDX['php'], WORD2IDX['java'])

['using', 'php', 'java']
0 1 2


In [62]:
def my_bag_of_words(text, word2idx=WORD2IDX, dict_size=DICT_SIZE):
    """Count occurrences of known words in ``text``.

    text: a string; it is split on whitespace.
    word2idx: mapping from word to its position in the output vector.
    dict_size: length of the output vector.
    return: numpy array of length ``dict_size`` where position ``word2idx[w]``
            holds how many times ``w`` occurs in ``text``; words missing from
            ``word2idx`` are ignored.
    """
    counts = np.zeros(dict_size)
    known_positions = (word2idx[token] for token in text.split() if token in word2idx)
    for position in known_positions:
        counts[position] += 1
    return counts

In [65]:
def _texts_to_bow_matrix(texts):
    """Turn each text into a one-row CSR bag-of-words matrix and stack the rows."""
    rows = [sp_sparse.csr_matrix(my_bag_of_words(text)) for text in texts]
    return sp_sparse.vstack(rows)


# The features are kept as sparse matrices, one row per title.
X_train_mybag = _texts_to_bow_matrix(X_train)
X_val_mybag = _texts_to_bow_matrix(X_val)
X_test_mybag = _texts_to_bow_matrix(X_test)
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


## TF-IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [109]:
def tfidf_features(X_train, X_val, X_test):
    """Build TF-IDF features for the three splits.

    The vectorizer is fitted on ``X_train`` only; ``X_val`` and ``X_test``
    are transformed with that fitted vocabulary.

    X_train, X_val, X_test: collections of text documents.
    return: tuple ``(X_train_tf, X_val_tf, X_test_tf, vocabulary)`` where
            ``vocabulary`` is the fitted ``vectorizer.vocabulary_`` mapping.
    """
    # The token pattern is a raw string so `\S` reaches the regex engine
    # untouched instead of being parsed as an (invalid) Python string escape.
    # Treating every run of non-whitespace as a token keeps terms such as
    # "c++" and "c#" intact.
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    X_train_tf = vectorizer.fit_transform(X_train)
    X_val_tf = vectorizer.transform(X_val)
    X_test_tf = vectorizer.transform(X_test)
    return X_train_tf, X_val_tf, X_test_tf, vectorizer.vocabulary_

In [110]:
# Build TF-IDF features for all three splits and keep the fitted vocabulary.
X_train_tf, X_val_tf, X_test_tf, tf_voc = tfidf_features(X_train, X_val, X_test)

In [42]:
# Reverse lookup for the TF-IDF vocabulary: feature index -> term.
tf_idx = {index: term for term, index in tf_voc.items()}

In [44]:
# Round-trip check: look up the index of 'c++' and map index 451 back to its term.
print(tf_voc['c++'], tf_idx[451])

451 c++


In [112]:
# Display the summary of the training TF-IDF matrix.
X_train_tf

<100000x26875 sparse matrix of type '<class 'numpy.float64'>'
	with 1143626 stored elements in Compressed Sparse Row format>

In [111]:
# Display the summary of the validation TF-IDF matrix.
X_val_tf

<30000x26875 sparse matrix of type '<class 'numpy.float64'>'
	with 335689 stored elements in Compressed Sparse Row format>

# Multilabel classifier

In [68]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer

In [80]:
# Pin the label columns to the tags counted on the training set, so every
# split is encoded against the same classes in the same order.
mlb = MultiLabelBinarizer(classes=list(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
# The binarizer is already fitted above; validation labels only need to be
# transformed, never used to fit again.
y_val = mlb.transform(y_val)

In [83]:
def train_classifier(X_train, y_train):
    """Fit two one-vs-rest linear models on the same training data.

    X_train: feature matrix handed unchanged to each ``fit`` call.
    y_train: label matrix handed unchanged to each ``fit`` call.
    return: tuple ``(logistic_model, ridge_model)`` of fitted
            OneVsRestClassifier instances, in that order.
    """
    # NOTE(review): `normalize` has been deprecated and later removed from
    # RidgeClassifier in newer scikit-learn releases — confirm the version in use.
    base_estimators = (LogisticRegression(), RidgeClassifier(normalize=True))
    # An SVC(max_iter=1000) variant is not trained here.
    logistic_model, ridge_model = (
        OneVsRestClassifier(estimator).fit(X_train, y_train)
        for estimator in base_estimators
    )
    return logistic_model, ridge_model

In [None]:
# Train the logistic and ridge one-vs-rest models on the bag-of-words features.
clf_lr_bag, clf_rid_bag = train_classifier(X_train_mybag, y_train)

In [85]:
# Validation outputs on the bag-of-words features: hard label predictions
# (`predict`) and raw decision scores (`decision_function`) for the logistic
# model, then the same pair for the ridge model.
y_val_pred_lr_bag = clf_lr_bag.predict(X_val_mybag)
y_val_scores_lr_bag = clf_lr_bag.decision_function(X_val_mybag)
y_val_pred_rig_bag = clf_rid_bag.predict(X_val_mybag)
y_val_scores_rig_bag = clf_rid_bag.decision_function(X_val_mybag)

In [115]:
# Train the logistic and ridge one-vs-rest models on the TF-IDF features.
clf_lr_tf, clf_rid_tf = train_classifier(X_train_tf, y_train)

In [116]:
# Validation outputs on the TF-IDF features: hard label predictions
# (`predict`) and raw decision scores (`decision_function`) for the logistic
# model, then the same pair for the ridge model.
y_val_pred_lr_tf = clf_lr_tf.predict(X_val_tf)
y_val_scores_lr_tf = clf_lr_tf.decision_function(X_val_tf)
y_val_pred_rig_tf = clf_rid_tf.predict(X_val_tf)
y_val_scores_rig_tf = clf_rid_tf.decision_function(X_val_tf)

# Evaluation

In [93]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [97]:
def print_evaluation_scores(y_val, predicted, scores=None):
    """Print accuracy, weighted F1 and average precision for one model.

    y_val: true binary label-indicator matrix.
    predicted: hard 0/1 predictions with the same shape as ``y_val``.
    scores: optional continuous outputs (e.g. from ``decision_function``).
            Average precision is a ranking metric, so it is computed from
            ``scores`` when they are supplied. With the default ``None`` it
            falls back to ``predicted``, which reproduces the previous output.
    """
    print('accuracy: ', accuracy_score(y_val, predicted))
    print('f1: ', f1_score(y_val, predicted, average='weighted'))
    # Thresholded predictions collapse the ranking to two levels; prefer the
    # continuous scores whenever the caller provides them.
    ranking_input = predicted if scores is None else scores
    print('average precision: ', average_precision_score(y_val, ranking_input))

In [117]:
# Row labels (feature type) and column labels (model) of the results grid.
words = ['bag of words', 'tfidf']
models = ['Logistic regression', 'Ridge regression']
# results[feature][model] holds the validation predictions for that pair.
results = [
    [y_val_pred_lr_bag, y_val_pred_rig_bag],
    [y_val_pred_lr_tf, y_val_pred_rig_tf],
]

for word, per_model_predictions in zip(words, results):
    for model, predicted in zip(models, per_model_predictions):
        print(word, model)
        print_evaluation_scores(y_val, predicted)
        print()
    

bag of words Logistic regression
accuracy:  0.358
f1:  0.6486950381244107
average precision:  0.34458812912520126

bag of words Ridge regression
accuracy:  0.0251
f1:  0.4915224575665367
average precision:  0.1872312471750414

tfidf Logistic regression
accuracy:  0.2859333333333333


  'precision', 'predicted', average, warn_for)


f1:  0.556296777033464
average precision:  0.24803337193846897

tfidf Ridge regression
accuracy:  0.2722333333333333
f1:  0.6562375844646402
average precision:  0.36123654179448983



In [118]:
# Map the indicator matrices back to tag tuples for readable output.
y_val_pred_inversed = mlb.inverse_transform(y_val_pred_lr_bag)
y_val_inversed = mlb.inverse_transform(y_val)

report_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
# Spot-check three validation examples.
for sample_idx in (10, 20, 30):
    true_tags = ','.join(y_val_inversed[sample_idx])
    predicted_tags = ','.join(y_val_pred_inversed[sample_idx])
    print(report_template.format(X_val[sample_idx], true_tags, predicted_tags))

Title:	C# properties and how to access their values from another function/class?
True labels:	c#,class,.net
Predicted labels:	c#


Title:	PHP equivalent of JavaScript getTime()
True labels:	php,javascript,datetime
Predicted labels:	php,javascript


Title:	Ajax data - Uncaught ReferenceError: date is not defined
True labels:	javascript,jquery,ajax
Predicted labels:	javascript,jquery,ajax


