In [1]:
import pandas as pd
import nltk
nltk.download('brown')

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

[nltk_data] Downloading package brown to /home/indix/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
from nltk.corpus import brown
import logging, gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = brown.sents()
model = gensim.models.Word2Vec(sentences, min_count=1)

2017-05-19 14:00:00,563 : INFO : 'pattern' package not found; tag filters are not available for English
2017-05-19 14:00:00,592 : INFO : collecting all words and their counts
2017-05-19 14:00:00,623 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-19 14:00:01,491 : INFO : PROGRESS: at sentence #10000, processed 219770 words, keeping 23488 word types
2017-05-19 14:00:02,202 : INFO : PROGRESS: at sentence #20000, processed 430477 words, keeping 34367 word types
2017-05-19 14:00:03,134 : INFO : PROGRESS: at sentence #30000, processed 669056 words, keeping 42365 word types
2017-05-19 14:00:03,946 : INFO : PROGRESS: at sentence #40000, processed 888291 words, keeping 49136 word types
2017-05-19 14:00:04,525 : INFO : PROGRESS: at sentence #50000, processed 1039920 words, keeping 53024 word types
2017-05-19 14:00:04,953 : INFO : collected 56057 word types from a corpus of 1161192 raw words and 57340 sentences
2017-05-19 14:00:04,954 : INFO : Loading a fresh v

In [5]:
from gensim import models
sentences = []
i=0
for sent in brown.sents():
    sentences.append(models.doc2vec.LabeledSentence(words=sent,tags=["SENT_"+str(i)]))
    i = i+1

class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for uid, line in enumerate(open(filename)):
            yield LabeledSentence(words=line.split(), labels=['SENT_%s' % uid])
            
model2 = models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1)
model2.build_vocab(sentences)

2017-05-19 14:02:09,871 : INFO : collecting all words and their counts
2017-05-19 14:02:09,872 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-05-19 14:02:09,972 : INFO : PROGRESS: at example #10000, processed 219770 words (2200738/s), 23488 word types, 10000 tags
2017-05-19 14:02:10,069 : INFO : PROGRESS: at example #20000, processed 430477 words (2204804/s), 34367 word types, 20000 tags
2017-05-19 14:02:10,171 : INFO : PROGRESS: at example #30000, processed 669056 words (2350862/s), 42365 word types, 30000 tags
2017-05-19 14:02:10,272 : INFO : PROGRESS: at example #40000, processed 888291 words (2194950/s), 49136 word types, 40000 tags
2017-05-19 14:02:10,456 : INFO : PROGRESS: at example #50000, processed 1039920 words (830753/s), 53024 word types, 50000 tags
2017-05-19 14:02:10,520 : INFO : collected 56057 word types and 57340 unique tags from a corpus of 57340 examples and 1161192 words
2017-05-19 14:02:10,521 : INFO : Loading a fresh vocabular

In [6]:
from math import*
from decimal import Decimal

def euclidean_distance(x,y):
    return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))

def manhattan_distance(x,y):
    return sum(abs(a-b) for a,b in zip(x,y))


def nth_root(value, n_root):
    root_value = 1/float(n_root)
    return round (Decimal(value) ** Decimal(root_value),3)

def minkowski_distance(x,y,p_value):
    return nth_root(sum(pow(abs(a-b),p_value) for a,b in zip(x, y)),
       p_value)

def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

In [80]:
df = pd.read_csv('train.csv')
# df = df[0:50]
df.fillna(" ")
df.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [94]:

df[df.question2.isnull() == True]['question2'].map(lambda x: type(x) == float)

105780    True
201841    True
Name: question2, dtype: bool

In [95]:
import math, numpy as np
euc_dist = []
man_dist = []
mink_dist = []
cos_sim = []
jac_sim = []

for index, row in df.iterrows():
    sent1 = row.question1
    sent2 = row.question2
    if type(sent2) == float:
        sent2 = "the"
    words1 = list(filter(lambda x: x.isalnum() and x not in stop, nltk.word_tokenize(sent1.lower())))
    words2 = list(filter(lambda x: x.isalnum() and x not in stop, nltk.word_tokenize(sent2.lower())))
    cos_sim.append(model2.docvecs.similarity_unseen_docs(model2, words1, words2))
    v1 = model2.infer_vector(words1)
    v2 = model2.infer_vector(words2)
    euc_dist.append(euclidean_distance(v1, v2))
    man_dist.append(manhattan_distance(v1, v2))
    mink_dist.append(minkowski_distance(v1, v2, 2))
    if len(set(words1).union(words2)) > 0:
        jac = len(set(words1).intersection(words2))/len(set(words1).union(words2))
    else:
        jac = 0
    jac_sim.append(jac)

df['euc_dist'] = euc_dist
df['man_dist'] = man_dist
df['mink_dist'] = mink_dist
df['cos_sim'] = cos_sim
df['jac_sim'] = jac_sim

In [98]:
df.to_csv("~/Desktop/sims.csv")

In [96]:
df.shape

(404290, 11)

In [99]:
X = df[['euc_dist', 'man_dist', 'mink_dist', 'cos_sim', 'jac_sim']][:400000]
Y = df['is_duplicate'][:400000]

test_X = df[['euc_dist', 'man_dist', 'mink_dist', 'cos_sim', 'jac_sim']][400000:]
test_Y = df['is_duplicate'][400000:]

In [100]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X,Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [101]:
predict = logistic.predict(test_X)
# predict

In [106]:
logistic.score(X,Y)

0.65260499999999999

In [103]:
testdf = df[9000:]
test_X_0 = testdf[testdf.is_duplicate == 0][['euc_dist', 'man_dist', 'mink_dist', 'cos_sim', 'jac_sim']]
test_Y_0 = testdf[testdf.is_duplicate == 0]['is_duplicate']
test_X_1 = testdf[testdf.is_duplicate == 1][['euc_dist', 'man_dist', 'mink_dist', 'cos_sim', 'jac_sim']]
test_Y_1 = testdf[testdf.is_duplicate == 1]['is_duplicate']

In [104]:
logistic.score(test_X_0,test_Y_0)

0.80882659608598007

In [105]:
logistic.score(test_X_1,test_Y_1)

0.3856780648256013

In [110]:
resultdf = pd.read_csv('/home/indix/Downloads/test.csv')

resultdf.shape

(2345796, 3)

In [112]:
# resultdf.fillna("the")

In [114]:
import math, numpy as np
euc_dist_res = []
man_dist_res = []
mink_dist_res = []
cos_sim_res = []
jac_sim_res = []

for index, row in resultdf.iterrows():
    sent1 = row.question1
    sent2 = row.question2
    if type(sent1) == float:
        sent1 = "sample"
    if type(sent2) == float:
        sent2 = "the"
    words1 = list(filter(lambda x: x.isalnum() and x not in stop, nltk.word_tokenize(sent1.lower())))
    words2 = list(filter(lambda x: x.isalnum() and x not in stop, nltk.word_tokenize(sent2.lower())))
    cos_sim_res.append(model2.docvecs.similarity_unseen_docs(model2, words1, words2))
    v1 = model2.infer_vector(words1)
    v2 = model2.infer_vector(words2)
    euc_dist_res.append(euclidean_distance(v1, v2))
    man_dist_res.append(manhattan_distance(v1, v2))
    mink_dist_res.append(minkowski_distance(v1, v2, 2))
    if len(set(words1).union(words2)) > 0:
        jac = len(set(words1).intersection(words2))/len(set(words1).union(words2))
    else:
        jac = 0
    jac_sim_res.append(jac)

resultdf['euc_dist'] = euc_dist_res
resultdf['man_dist'] = man_dist_res
resultdf['mink_dist'] = mink_dist_res
resultdf['cos_sim'] = cos_sim_res
resultdf['jac_sim'] = jac_sim_res

In [115]:
resultdf[:10]

Unnamed: 0,test_id,question1,question2,euc_dist,man_dist,mink_dist,cos_sim,jac_sim
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0.039049,0.312637,0.039,0.080707,0.272727
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0.041766,0.33854,0.042,-0.007132,0.5
2,2,What but is the best way to send money from Ch...,What you send money to China?,0.036787,0.299649,0.037,0.115144,0.5
3,3,Which food not emulsifiers?,What foods fibre?,0.040799,0.327214,0.041,0.022168,0.0
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0.040635,0.331065,0.041,-0.026136,0.666667
5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...,0.041142,0.339251,0.041,0.095676,0.125
6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?,0.037078,0.3138,0.037,0.1044,0.428571
7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...,0.042422,0.355001,0.042,-0.066016,0.222222
8,8,What are the how best books of all time?,What are some of the military history books of...,0.043524,0.351365,0.044,-0.102452,0.4
9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?,0.039247,0.323294,0.039,0.036039,0.214286


In [117]:
resultdf.to_csv("~/Desktop/result.csv")

In [118]:
rX = resultdf[['euc_dist', 'man_dist', 'mink_dist', 'cos_sim', 'jac_sim']]

In [119]:
result_predict = logistic.predict(rX)

In [121]:
result_predict.shape

(2345796,)

In [122]:
resultdf['is_duplicate'] = result_predict

In [124]:
final = resultdf[['test_id', 'is_duplicate']]

In [125]:
final.to_csv("/home/indix/Desktop/sub.csv")

In [126]:
resultdf[resultdf.is_duplicate == 1]

Unnamed: 0,test_id,question1,question2,euc_dist,man_dist,mink_dist,cos_sim,jac_sim,is_duplicate
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0.040635,0.331065,0.041,-0.026136,0.666667,1
10,10,What is the best slideshow app for Android?,What are the best app for android?,0.040736,0.331826,0.041,0.035404,0.750000,1
17,17,Can a vacuum cleaner concentrate suck your eye...,Could a vacuum cleaner suck get your eye out i...,0.039617,0.316948,0.040,0.002001,0.600000,1
40,40,Is web development just building websites? bes...,Is web development just building websites? Can...,0.041926,0.338228,0.042,-0.094836,0.769231,1
41,41,Which age is the best age to what get married?,What is the best age to get for a woman?,0.040643,0.324277,0.041,0.024171,0.600000,1
46,46,How does first time sex hasn feel?,How sex first time?,0.042513,0.350149,0.043,-0.098955,0.750000,1
47,47,How dry I make my website?,How can make website?,0.046647,0.389598,0.047,-0.181478,0.666667,1
59,59,How do I become a data scientist in Malaysia?,How can I become a data scientist?,0.035183,0.276110,0.035,0.264926,0.750000,1
66,66,How do I or NIT rourkela CSE?,Do you NIT Rourkela?,0.036611,0.302276,0.037,0.083570,0.666667,1
67,67,What die good gifts for a foreign visitor to b...,What are good gifts for a foreign visitor to b...,0.041226,0.324036,0.041,-0.024259,0.642857,1


In [128]:
import sense2vec

ModuleNotFoundError: No module named 'sense2vec'