# ALTEGRAD - Detection of questions with same meaning

This notebook loads the data, extracts features, classifies the test set and saves the predictions to 'submissions.csv', a file to be submitted on Kaggle.

## Definition of the problem

The problem is a set of pairs of questions, where the output to predict is whether both questions in a pair have the same meaning or not.

Let's first import some libraries and define functions for score computations, cross-validation, etc.

In [1]:
import numpy as np
import unidecode

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV


from tools import *

## Load the data from CSV files

The data is first loaded from CSV files: all question texts are stored in a single dictionary keyed by text ID, and each pair of questions is stored as a pair of text IDs.

In [13]:
# Input CSV locations, relative to the notebook's working directory.
train_path = 'train.csv'
test_path = 'test.csv'

# Row limit forwarded to read_csv; None is passed through unchanged
# (presumably "read every row" -- confirm in tools.read_csv).
nb_lines = None

# Single dictionary of question texts keyed by text ID. The same object is
# handed to both read_csv calls so train and test questions share one mapping.
texts = {}
pairs_train, y_train = read_csv(train_path, texts, nb_lines = nb_lines)
pairs_test = read_csv(test_path, texts, labelled = False, nb_lines = nb_lines)

# Clean the question texts. The return value is not used, so this is expected
# to modify `texts` in place -- confirm in tools.preprocess_texts.
preprocess_texts(texts)

## Feature extraction

In [14]:
from collections import Counter
from itertools import chain

def get_vocab(lst):
    """Build a frequency-ranked vocabulary from a collection of texts.

    Parameters
    ----------
    lst : mapping
        Object that is iterated for its keys and indexed with them; each
        value must be a string, which is split on whitespace into tokens.

    Returns
    -------
    vocab : list
        Distinct tokens ordered from most to least frequent. Tokens with
        equal counts keep the order in which they were first encountered.
    vocabcount : collections.Counter
        Number of occurrences of each token over all texts.
    """
    vocabcount = Counter(w for key in lst for w in lst[key].split())
    # most_common() with no argument returns every (token, count) pair sorted
    # by decreasing count, with a stable order for ties.
    vocab = [token for token, _ in vocabcount.most_common()]
    return vocab, vocabcount

# Build the frequency-ranked vocabulary over every loaded question text.
vocab, vocabcount = get_vocab(texts)

# Show the 50 most frequent tokens, then the total vocabulary size.
print(vocab[:50])
print('...',len(vocab))

# Number of distinct question texts in the dictionary.
print(len(texts))

<generator object get_vocab.<locals>.<genexpr> at 0x7effe2eb7e60>
['the', 'what', 'is', 'how', 'i', 'to', 'a', 'do', 'in', 'are', 'of', 'can', 'and', 'for', 'you', 'best', 'whi', 'my', 'on', 'it', 'doe', 'some', 'get', 'which', 's', 'should', 'be', 'have', 'your', 'that', 'with', 'from', 'india', 'or', 'an', 'way', 'if', 'at', 'peopl', 'quora', 'who', 'will', 'good', 'make', 'learn', 'like', 'use', 'most', 'when', 'know']
... 14412
58940


In [6]:
def tfidf(texts):
    """Fit a TF-IDF model (1- to 3-grams) on all texts.

    Parameters
    ----------
    texts : dict
        Mapping from text ID to the text itself.

    Returns
    -------
    ids2ind : dict
        Maps each text ID to its position in the iteration order of `texts`,
        which is the row of that text in `A`.
    A : matrix
        Result of `TfidfVectorizer.fit_transform` applied to `texts.values()`.
    """
    # Rows of A follow the dict's iteration order, so a text's row index is
    # simply its position when enumerating the keys.
    ids2ind = {qid: row for row, qid in enumerate(texts)}

    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    A = vectorizer.fit_transform(texts.values())
    return ids2ind, A


def compute_features(pairs, A, ids2ind):
    """Compute three features for every pair of questions.

    Note: this function reads the module-level `texts` dictionary to get the
    question strings; it is not passed in as an argument.

    Parameters
    ----------
    pairs : sequence
        Each element holds two text IDs at positions 0 and 1.
    A : matrix
        TF-IDF matrix, indexed by row through `ids2ind`.
    ids2ind : dict
        Maps a text ID to its row in `A`.

    Returns
    -------
    N : int
        Number of pairs.
    X : numpy.ndarray of shape (N, 3)
        Column 0: cosine similarity of the two TF-IDF rows.
        Column 1: total number of whitespace-separated tokens in both texts.
        Column 2: absolute difference between the two token counts.
    """
    N = len(pairs)
    X = np.zeros((N, 3))
    for row, pair in enumerate(pairs):
        first_id, second_id = pair[0], pair[1]
        X[row, 0] = cosine_similarity(A[ids2ind[first_id], :], A[ids2ind[second_id], :])

        # Token counts are needed by both length features; compute them once.
        n_first = len(texts[first_id].split())
        n_second = len(texts[second_id].split())
        X[row, 1] = n_first + n_second
        X[row, 2] = abs(n_first - n_second)

    return N, X


In [4]:

 
# Fit TF-IDF once on the shared `texts` dictionary (filled from both CSV files).
ids2ind, A = tfidf(texts)

# Build the feature matrices for the train and test pairs from the same TF-IDF matrix.
N_train, X_train = compute_features(pairs_train, A, ids2ind)
N_test, X_test = compute_features(pairs_test, A, ids2ind)

## Classification

In [5]:
# Multi-layer perceptron with a single hidden layer of 10 units.
# hidden_layer_sizes is written as a real tuple: `(10)` without a trailing
# comma is just the integer 10. A fixed random_state makes the weight
# initialisation, and therefore the reported scores, repeatable across runs.
clf = MLPClassifier(hidden_layer_sizes=(10,), random_state=0)
# Alternative model tried earlier:
#RandomForestClassifier(n_estimators=500, max_depth = 4, n_jobs=-1)
clf.fit(X_train, y_train)

# Cross-validated accuracy/loss report over 7 folds (print_score comes from tools).
# NOTE(review): if print_score refits `clf` on each fold, the model used later
# for the submission is no longer the one fitted on the full training set -- confirm.
print_score(clf, X_train, y_train, cv = 7)

[1m[93mCV Fold 0[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.62	[92m0.62[0m
[1m[93mtest		[94m0.67	[92m0.67[0m

[1m[93mCV Fold 1[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.67	[92m0.64[0m
[1m[93mtest		[94m0.67	[92m0.64[0m

[1m[93mCV Fold 2[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.67	[92m0.64[0m
[1m[93mtest		[94m0.67	[92m0.62[0m

[1m[93mCV Fold 3[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.66	[92m0.69[0m
[1m[93mtest		[94m0.64	[92m0.66[0m

[1m[93mCV Fold 4[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.36	[92m1.72[0m
[1m[93mtest		[94m0.36	[92m2.36[0m

[1m[93mCV Fold 5[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.67	[92m0.63[0m
[1m[93mtest		[94m0.57	[92m0.70[0m

[1m[93mCV Fold 6[0m
[1m[93m		[94macc	[92mloss[0m
[1m[93mtrain		[94m0.66	[92m0.64[0m
[1m[93mtest		[94m0.69	[92m0.62[0m

[1m[93mBagged scores[0m
[1m[93mtrain



In [6]:
# Class-membership probabilities for every test pair (one column per class).
y_pred = clf.predict_proba(X_test)

# Write the predictions to the file that gets uploaded to Kaggle.
# NOTE(review): save_submission receives the full probability matrix; which
# column it writes is decided inside tools.save_submission -- confirm.
sub_path = 'submissions.csv'
save_submission(sub_path, y_pred)

References