### tools

In [5]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.linear_model import LogisticRegression as lr
from sklearn.feature_extraction.text import CountVectorizer
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem import *
import csv

import nltk
from nltk import pos_tag
from nltk import word_tokenize
import pandas as pd
from gensim.models import Word2Vec
import math
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer

#### globals

In [9]:
end_symbols = {'?','.'}
pos_idx = dict(ADJ=0, ADP=1, ADV=2, CONJ=3, DET=4, NOUN=5, NUM=6, PRT=7, PRON=8, VERB=9, X=10)

In [21]:
# load word2vec matrix
w2v = Word2Vec.load("emb_tr")
# w2v = KeyedVectors.load_word2vec_format("../../../../../../../GoogleNews-vectors-negative300.bin.gz", binary=True)
vec_size = w2v.vector_size

#### functions

In [10]:
# pre-processing
def extend_abbre(question):
    question = question.lower().replace("won't","will not").replace("n't","not").replace("it's","it is") \
    .replace("'ve"," have").replace("i'm","i am").replace("'re"," are").replace("he's","he is").replace("she's","she is") \
    .replace("'s"," own").replace("what's","what is").replace("+","plus").replace("'ll"," will").replace("'d"," would") \
    .replace("#","sharp").replace("="," equal").replace(",000","000").replace("&","and").replace("|","or")
    return question

def stemmer(train_data):
    stem_tool = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    train_data['question1'] = train_data.question1.map(lambda x: ' '.join([stem_tool.stem(word) for word in word_tokenize(extend_abbre(str(x))) if not word in stop_words]))
    train_data['question2'] = train_data.question2.map(lambda x: ' '.join([stem_tool.stem(word) for word in word_tokenize(extend_abbre(str(x))) if not word in stop_words]))
    #now it is bytes in the train_data
    #if want to convert to string using encode('UTF-8')
    return train_data

In [11]:
def dif_length(s1, s2):
    return abs(len(s1) - len(s2))

def dif_word_count(s1, s2):
    return abs(len(s1.split()) - len(s2.split()))

def is_ending_symbol_identical(s1, s2):
    if s1 and s2 and s1[-1] in end_symbols and s2[-1] in end_symbols:
        return int(s1[-1] == s2[-1])
    return 1

def is_first_word_identical(s1, s2):
    if s1 and s2:
        return int(s1.split()[0] ==  s2.split()[0] )
    return 0

def count_shared_words(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    ret = 0
    for word in s1.split():
        if word in s2:
            ret += 1
    return ret

#### sentence similarity 

In [24]:
def parts_of_speech(s):
        if type(s) == str:                                                 
                tokens = word_tokenize(s)
        else:
                tokens = s 
#         n = len(tokens)                                                 
        tokens_and_tags = pos_tag(tokens, tagset = "universal")
#         tags = [t for (_,t) in tokens_and_tags]
        return(tokens_and_tags)

pos_idx = dict(ADJ=0, ADP=1, ADV=2, CONJ=3, DET=4, NOUN=5, NUM=6, PRT=7, PRON=8, VERB=9, X=10)

def sent_embedding(pos_li, vec_size):
    pos_groups = [ np.zeros((vec_size)) for i in range(len(pos_idx)) ]
    for word, pos in pos_li:
        if pos != '.':
            if word in w2v.wv:
                pos_groups[pos_idx[pos]] += w2v.wv[word]
    return pos_groups

def sent_cos_similarity(pos_group1, pos_group2):
    ret = []
    for i, _ in enumerate(pos_group1):
#         ret += np.exp(-(np.abs(pos_group1[i] - pos_group2[i])))
#           ret.append(np.dot(pos_group1[i], pos_group2[i])/(np.linalg.norm(pos_group1[i])* np.linalg.norm(pos_group2[i])))
        ret.append((cosine_similarity(pos_group1[i].reshape(1, -1), pos_group2[i].reshape(1, -1)))[0][0])
    return ret
                
# return a vector represents the similarity of two sentence
def cos_similarity_vector(s1, s2, w2v_vec_size):
    pos_li_1 = parts_of_speech(s1)
    pos_li_2 = parts_of_speech(s2)
    
    # assign the words in a sentence into each of the 11 POS groups
    # substitute each word in each POS group with the word's embedding vector
    # sum the vectors in each POS group of a sentence
    # so we will have a list of 11 vectors for each sentence
    
    emb_list_1 = sent_embedding(pos_li_1, w2v_vec_size)
    emb_list_2 = sent_embedding(pos_li_2, w2v_vec_size)
    # compute the cosine_similarity of the corresponding groups in the two sentences
    # so we will have a list of 11 cosine_similarity measures.
    
    return sent_cos_similarity(emb_list_1, emb_list_2)

#### category_similarity

In [13]:
def load_news_dataset():
    twenty_train = fetch_20newsgroups(subset='train', \
                                      remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=22) 
    twenty_test = fetch_20newsgroups(subset='test', \
                                     remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=22)
    return twenty_train, twenty_test

def extract_text_features(train_data, min_docs):
    
    sw = set(stopwords.words('english'))
    
    cv = CountVectorizer(stop_words = sw, min_df = min_docs)
    X_train_counts = cv.fit_transform(train_data)
    
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
    return X_train_tfidf, cv, tfidf_transformer

def extract_text_features_from_test_data(test_data, min_docs, cv, tfidf_transformer):
    X_test_counts = cv.transform(test_data)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    
    return X_test_tfidf
    
def train_multinomialNB(X_train, Y_train):
    clf = MultinomialNB().fit(X_train, Y_train) 
    return clf

def train_LR(X_train, Y_train):
    learner = lr()
    learner.fit(X_train, Y_train)
    return learner

twenty_train, twenty_test = load_news_dataset()
X_train_tfidf, cv, tfidf_transformer = extract_text_features(twenty_train.data, 1)
category_lr = train_LR(X_train_tfidf, twenty_train.target)

#### read training data

In [6]:
train_data = pd.read_csv('data/train.csv')
cleaned = stemmer(train_data)

#### create matrix of attributes

In [38]:
# doc_limit = 404000
doc_limit = 10000
feature_count = 3+len(pos_idx)+1+1;
Xtr = np.zeros((doc_limit, feature_count))
Ytr = []


# with open('data/train.csv','r',  encoding='utf-8') as tr_data:
#     tr_csv = csv.reader(tr_data)
#     next(tr_csv)
for i, row in enumerate(cleaned["question1"]):
    q_left = row
    q_right = cleaned["question2"][i];

    if type(q_left) == str and type(q_right) == str:
        cos_sim = cos_similarity_vector(q_left, q_right, vec_size)
        nz = [i for i in cos_sim if i > 0]
        if len(nz) == 0:
            cos_sim_avg = 0
        else:
            cos_sim_avg = sum(cos_sim)/(len(nz))

        left_cat = category_lr.predict(extract_text_features_from_test_data([q_left], 1, cv, tfidf_transformer))
        right_cat = category_lr.predict(extract_text_features_from_test_data([q_right], 1, cv, tfidf_transformer))

        Xtr[i,:] = np.array([is_ending_symbol_identical(q_left,q_right), is_first_word_identical(q_left,q_right),\
                              count_shared_words(q_left,q_right), cos_sim_avg, \
                             int(left_cat == right_cat)]+cos_sim)
#         if type(q_left) == str and type(q_right) == str:
#             Xtr[i,:] = np.array([dif_length(q_left,q_right), dif_word_count(q_left,q_right),\
#                                   is_ending_symbol_identical(q_left,q_right), is_first_word_identical(q_left,q_right),\
#                                   count_shared_words(q_left,q_right)])
        Ytr.append(row[-1]) 

    if i >= doc_limit-1:
        break;

#### Train and predict with logistic regression

In [39]:
training_percentage = .8
sep = int(doc_limit*training_percentage)

xtr, ytr, xte, yte = Xtr[:sep,:],Ytr[:sep], Xtr[sep:,:], Ytr[sep:]

learner = lr()
learner.fit(xtr, ytr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
predicted_LR = learner.predict(xte)

#### print result

In [41]:
# a matching pair means a correct prediction
#print('result',list(zip(predicted_LR,yte)))
print('accuracy:', sum([int(l==r) for l,r in zip(predicted_LR,yte)]), "/" ,len(predicted_LR), "=", \
      sum([int(l==r) for l,r in zip(predicted_LR,yte)])/len(predicted_LR))

accuracy: 1974 / 2000 = 0.987


In [42]:
from sklearn.model_selection import cross_val_score

In [43]:
scores = cross_val_score(learner, Xtr, Ytr, cv = 10)
print(scores.mean())



0.9901579725542419


In [48]:
test_data = pd.read_csv('data/test.csv')
cleaned_test = stemmer(test_data)

In [None]:
col_1 = cleaned_test["question1"]
col_2 = cleaned_test["question2"]
doc_count = len(col_1)

Xte = np.zeros((doc_count, feature_count))

for i, _ in enumerate(col_1):
    q_left = col_1[i]
    q_right = col_2[i]
    
    if type(q_left) == str and type(q_right) == str:
            cos_sim = cos_similarity_vector(q_left, q_right, vec_size)
            nz = [i for i in cos_sim if i > 0]
            if len(nz) == 0:
                cos_sim_avg = 0
            else:
                cos_sim_avg = sum(cos_sim)/(len(nz))

            left_cat = category_lr.predict(extract_text_features_from_test_data([q_left], 1, cv, tfidf_transformer))
            right_cat = category_lr.predict(extract_text_features_from_test_data([q_right], 1, cv, tfidf_transformer))

            Xte[i,:] = np.array([is_ending_symbol_identical(q_left,q_right), is_first_word_identical(q_left,q_right),\
                                  count_shared_words(q_left,q_right), cos_sim_avg, \
                                 int(left_cat == right_cat)]+cos_sim)
    
    

In [259]:
predicted_kaggle = learner.predict(Xte)

In [260]:
myData = ["test_id", "is_duplicate"]
 
myFile = open('kaggle_submit1.csv', 'w')
with myFile:
    writer = csv.writer(myFile)
    writer.writerow(myData)
    for i, result in enumerate(predicted_kaggle):
        writer.writerow([i, result])
print("Writing complete")

Writing complete
