# ALTEGRAD - Detection of questions with same meaning

This notebook loads the data, extract features, classify the test set and save it to 'submissions.csv', a file to be submitted on Kaggle.

## Definition of the problem

The problem is a set of pairs of questions, where the output to predict is whether both questions in a pair have the same meaning or not.

Let's first import some libraries and define functions for score computations, cross-validation, etc.

In [78]:
import numpy as np
import unidecode

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

from sklearn.model_selection import GridSearchCV

import string
import nltk
from nltk.stem.snowball import SnowballStemmer

# for displaying score
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'


In [113]:
def loss(y, p):
    N = y.shape[0]
    l = 0
    ignored = 0
    for i in range(N):
        if (y[i] == 0 and p[i][1] == 1.) or (y[i] == 1 and p[i][1] == 0.):
            ignored += 1
        else:
            if y[i] == 0:
                l -= np.log(1 - p[i][1])
            else:
                l -= np.log(p[i][1])
    return l / (N - ignored)


def compute_score(clf, X, y):
    scorer = make_scorer(loss, greater_is_better = True, needs_proba = True)
    return scorer(clf, X, y)


def compute_cv_score(clf, X, y, cv = 5):
    loss_scorer = make_scorer(loss, greater_is_better = True, needs_proba = True)
    scoring = {'acc': 'accuracy',
               'loss': loss_scorer}
    scores = cross_validate(clf, X, y, scoring=scoring,
                             cv=cv, return_train_score=True)  
    
    return scores


def html_color(text, color, bold=True):
    b = ''
    if bold:
        b = 'font-weight: bold;'
    return '<span style="color = ' + color + ';' + b + '">' + str(text) + '</span>'

def print_line(text, acc, loss, number = True):
    if number:
        acc = '%0.2f' % acc
        loss = '%0.2f' % loss
    print(color.BOLD +
              color.YELLOW + text + "\t\t" + 
              color.BLUE + acc + "\t" + 
              color.GREEN + loss + color.END)
    
def print_score(clf, X, y, cv = 5):
    cv_scores = compute_cv_score(clf, X, y, cv = cv)
    acc_color = "blue"
    loss_color = "green"
    layout_color = "yellow"
    
    for i in range(cv):
        print(color.BOLD + color.YELLOW + "CV Fold %i" % i + color.END)
        print_line("", "acc", "loss", False)
        print_line("train", cv_scores['train_acc'][i], cv_scores['train_loss'][i])
        print_line("test", cv_scores['test_acc'][i], cv_scores['test_loss'][i])
        print()
        
        
    print(color.BOLD + color.YELLOW + "Bagged scores" + color.END)
    print("Bagged scores") 
    print("\t\tacc\tloss")
    print_line("train", np.mean(cv_scores['train_acc'], np.mean(cv_scores['train_loss'][i])))
    print_line("test", np.mean(cv_scores['test_acc']), np.mean(cv_scores['test_loss']))
    
def save_submission(sub_path, y):
    with open(sub_path, 'w') as f:
        f.write("Id,Score\n")
        for i in range(y_pred.shape[0]):
            f.write(str(i)+','+str(y_pred[i][1])+'\n')

## Load the data from CSV files

The data is first loaded from CSV files, with all texts saved in a dictionary with IDs as keys, and pairs storing pairs of text IDs.

In [23]:
def read_csv(path, texts, nb_lines = None, labelled = True):
    pairs = []
    y = []
    read_lines = 0
    
    with open(path,'r') as f:
        for line in f:
            if nb_lines != None and read_lines >= nb_lines:
                break
            read_lines += 1
            
            l = line.split(',')
            if l[1] not in texts:
                texts[l[1]] = l[3]
            if l[2] not in texts:
                if labelled:
                    texts[l[2]] = l[4]
                else:
                    texts[l[2]] = l[4][:-1]                    

            pairs.append([l[1],l[2]])

            if labelled:
                y.append(int(l[5][:-1])) # [:-1] is just to remove formatting at the end

    if labelled:
        return pairs, np.array(y)
    else:
        return pairs

In [24]:
texts = {}
pairs_train = []
pairs_test = []
y_train = []

train_path = 'train.csv'
test_path = 'test.csv'

texts = {}
nb_lines = 100
pairs_train, y_train = read_csv(train_path, texts, nb_lines = nb_lines)
pairs_test = read_csv(test_path, texts, labelled = False, nb_lines = nb_lines)

## Feature extraction

In [22]:
stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')
stopwords = map(str, stopwords)

def tfidf(texts):
    ids2ind = {} # will contain the row idx of each unique text in the TFIDF matrix 
    for qid in texts:
        ids2ind[qid] = len(ids2ind)

    vec = TfidfVectorizer(ngram_range=(1, 3))
    A = vec.fit_transform(texts.values())

    return ids2ind, A


def compute_features(pairs, A, ids2ind):
    N = len(pairs)
    X = np.zeros((N, 3))
    for i in range(len(pairs)):
        q1 = pairs[i][0]
        q2 = pairs[i][1]
        X[i,0] = cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:])
        X[i,1] = len(texts[q1].split()) + len(texts[q2].split())
        X[i,2] = abs(len(texts[q1].split()) - len(texts[q2].split()))

    return N, X

    
def preprocess_line(line, stemmer = stemmer, stopwords = stopwords):
    strip_punct = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    line = line.lower().translate(strip_punct)

    l = line.split(" ")
    l = [w for w in l if w not in stopwords]
    l = " ".join(map(stemmer.stem, line.split(" ")))
    
    return l


def preprocess_texts(texts):
    for i in texts.keys():
        texts[i] = preprocess_line(texts[i])        

In [9]:
preprocess_texts(texts)
 
ids2ind, A = tfidf(texts)

N_train, X_train = compute_features(pairs_train, A, ids2ind)
N_test, X_test = compute_features(pairs_test, A, ids2ind)

## Classification

In [114]:
clf = RandomForestClassifier(n_estimators=50, max_depth = 4, n_jobs=-1)
clf.fit(X_train, y_train)

print_score(clf, X_train, y_train, cv = 2)

[1m[93mCV Fold 0[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.86	[92m0.38[0m
[1m[93mtest		[94m0.60	[92m0.73[0m

[1m[93mCV Fold 1[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.90	[92m0.40[0m
[1m[93mtest		[94m0.66	[92m0.71[0m

[1m[93mBagged scores[0m
Bagged scores
		acc	loss


TypeError: tuple indices must be integers or slices, not numpy.float64

In [91]:
y_pred = clf.predict_proba(X_test)

sub_path = 'submissions.csv'
save_submission(sub_path, y_pred)

References

In [92]:
from termcolor import colored

colored("hello", "red")

'\x1b[31mhello\x1b[0m'

In [93]:

html_print("<span style='font-weight:bold; color: red;'>test</span>")

In [94]:
html_print('<span style="color = yellow;font-weight: bold;">train		</span><span style="color = blue;font-weight: bold;">0.2f</span><span style="color = green;font-weight: bold;">0.2f</span>')

In [95]:
def color_in_tokens(tokens, color_token_contains="_"):
  """
  Highlights the tokens which contain 'color_token_contains'

  :param tokens: list of strings
  :param color_token_contains: str (the string for marking a token red)
  :return: str
  """
  return " ".join(["\x1b[31m%s\x1b[0m" % i if color_token_contains in i else i for i in tokens])

print(color_in_tokens(['a','b_','c']))

a [31mb_[0m c


In [88]:
a = 0.222
s = '%0.2f' % a
s

'0.22'