### Cosine Similarity for different documents

In [1]:
import pandas as pd
import numpy as np
import gensim.utils as utils
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

##### Read data

In [2]:
columns = ['index', 'text', 'tag']


def add_row(text, classes, df):
    """Append one document to ``df`` in place.

    The row is written at label ``len(df)`` and holds, in column order, that
    same number, the document text and ``classes``.  ``text`` is first run
    through gensim's ``utils.to_utf8(..., errors='replace')`` and decoded back
    from UTF-8 before being stored.

    Returns None; the caller's DataFrame is mutated.
    """
    row_label = len(df)
    cleaned_text = utils.to_utf8(text, errors='replace').decode("utf8")
    df.loc[row_label] = [row_label, cleaned_text, classes]

In [3]:
import textract
from os import listdir
from os.path import isfile, join


data = pd.DataFrame(columns=columns)

# (sub-directory of data/train/, tag stored for every file found in it).
# Some directories deliberately share a tag: java/python -> 'technical',
# mobilescreen/mobilememorycard -> 'mobile'.
train_sources = [
    ('shortstory', 'shortstory'),
    ('java', 'technical'),
    ('python', 'technical'),
    ('medicine', 'medicine'),
    ('mobilescreen', 'mobile'),
    ('mobilememorycard', 'mobile'),
    ('lcd', 'lcd'),
    ('hippa', 'hippa'),
    ('srs', 'srs'),
]

# One loop replaces nine copy-pasted list comprehensions that were used only
# for their side effect (add_row returns None) and whose result lists were
# partly bound to the wrong name.  Rows are appended in the order the
# directories are listed above and, within a directory, in listdir() order.
for subdir, tag in train_sources:
    dirpath = join('data/train', subdir)
    for fname in listdir(dirpath):
        path = join(dirpath, fname)
        if isfile(path):  # ignore nested directories
            add_row(textract.process(path), tag, data)



In [4]:
columns = ['index','text', 'tag']
test = pd.DataFrame(columns=columns)

# (sub-directory of data/test/, tag stored for every file found in it).
# The list mirrors the directories the original cell read, in the same order.
test_sources = [
    ('medicine', 'medicine'),
    ('mobilescreen', 'mobile'),
    ('mobilememorycard', 'mobile'),
    ('lcd', 'lcd'),
    ('java', 'technical'),
    ('hippa', 'hippa'),
    ('mobile', 'mobile'),
    ('srs', 'srs'),
]

# A plain loop instead of eight side-effect-only list comprehensions; this
# also stops the cell from echoing a meaningless list of None values.
for subdir, tag in test_sources:
    dirpath = join('data/test', subdir)
    for fname in listdir(dirpath):
        path = join(dirpath, fname)
        if isfile(path):  # ignore nested directories
            add_row(textract.process(path), tag, test)


[None, None, None]

##### Tokenizers for different data types

In [71]:
def verify_tokenize_xml(text):
    """Tokenize ``text`` as XML-like markup, or return ``[]`` if it is not markup.

    The text counts as markup when it contains at least one
    ``<tag>content</tag>`` run, e.g. ``<fname>Terje</fname>``.  In that case a
    space is inserted before every ``<`` and after every ``>`` and the result
    is split on single spaces and newlines, so tags and text become separate
    tokens.  Empty strings produced by adjacent separators are kept.

    Returns a list of string tokens, or an empty list when no tag pair is found.
    """
    # Raw string: '\/' is not a valid Python string escape and triggers a
    # warning on modern interpreters when written in a normal literal.
    xml_pattern = r'<([^<>]+)>([^<>]*)<(\/[^<>]+)>'
    if not re.search(xml_pattern, text):
        return []
    # Literal replacements, so plain str.replace is enough.
    spaced = text.replace('<', ' <').replace('>', '> ')
    return re.split(' |\n', spaced)


In [72]:
def verify_tokenize_hippa(text):
    """Tokenize ``text`` as a ``*``/``~`` delimited record stream, or return ``[]``.

    Sample of the intended input:
    ``ISA*00*...*:~GS*HP*EMEDNYBAT*ETIN*20100101*1050*6000600*X*005010X221A1~ST*835*1740~``

    Detection uses ``re.match`` (anchored at the start of ``text``) with
    ``hippa_pattern``; the pattern contains a literal ``~``, so text without a
    ``~`` is never accepted.  On a match, every ``*`` is rewritten to
    backslash-star-space and every ``~`` to `` ~ ``, then the text is split on
    single spaces and newlines.  Tokens therefore end in ``\\*`` (for example
    ``ISA\\*``) and empty strings from adjacent separators are kept.

    Returns a list of string tokens, or an empty list when the pattern does
    not match.
    """
    # NOTE(review): the nested '[' inside the character classes are literal
    # members, not nested sets (Python warns about this); the pattern is kept
    # exactly as written - confirm it expresses the intended format check.
    hippa_pattern = '[[^a-zA-Z0-9]*/*[[^a-zA-Z0-9., -_]*/*[^a-zA-Z0-9., -_]*]*~]*'
    if not re.match(hippa_pattern, text):
        return []
    # Raw strings: '\*' is not a valid Python string escape.  The replacement
    # template r'\* ' leaves the backslash in the output, as before.
    text = re.sub(r'\*', r'\* ', text)
    text = re.sub('~', ' ~ ', text)
    return re.split(' |\n', text)
  

In [73]:
def verify_tokenize_log(text):
    """Tokenize ``text`` as a log line, or return ``[]`` if it does not look like one.

    Sample inputs the author had in mind:
        172.16.0.3 - - [25/Sep/2002:14:04:19 +0200] "GET / HTTP/1.1" 401 - "" "Mozilla/5.0 ..."
        192.168.2.32 - - [10/Feb/2017:13:02:23 +0530] "GET / HTTP/1.0" 200 45

    Four alternative layouts are OR-ed into one regular expression.  When
    ``re.search`` finds any of them, the text is split on that expression
    extended with "single space" and "newline".  Because the alternatives
    contain capture groups, the result also carries the captured groups, with
    ``None`` for groups that did not take part in a match.
    """
    colon_fields = r"(.*): (.*): (.*)"
    bracketed_then_colon = r"((\[[^\]]+\]) (\[[^\]]+\]) (\[[^\]]+\]) (.*): (.*))"
    bracketed_whole_line = r"^(\[[^\]]+\]) (\[[^\]]+\]) (\[[^\]]+\]) (.*)$"
    dash_then_bracket = r"(.*?)-\[(.*?)\](.*)"
    # A fifth candidate, r"\"(.*?)\"|\[(.*?)\]|(\S+)", was tried and left disabled.
    layouts = (colon_fields, bracketed_then_colon, bracketed_whole_line, dash_then_bracket)
    pattern = "|".join(layouts)

    if not re.search(pattern, text):
        return []
    return re.split(pattern + "| |\n", text)

verify_tokenize_log('Hi This is I am')

[]

In [74]:
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's English stop-word list once and keep it in a module-level list
# that the tokenizers below consult.
cashedstopwords = stopwords.words('english')
# Additionally filter out stray punctuation, contraction fragments and the empty string.
cashedstopwords.extend(["\'",',','.','...', 'i\'m', 'n\'t','\'m',''])

def tokenize_text(text):
    """Plain-text tokenizer.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a
    space, the result is lower-cased and split on whitespace, and any token
    present in ``cashedstopwords`` is discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    alnum_only = re.sub("[^a-zA-Z0-9]", " ", text)
    return [tok for tok in alnum_only.lower().split() if tok not in cashedstopwords]


In [75]:
def review_to_wordlist( text):
    """Tokenize ``text`` with the first format-specific tokenizer that accepts it.

    The tokenizers are tried in the order XML, HIPAA-style records, log lines.
    Each returns an empty list when the text does not match its format, in
    which case the next one is tried.  If none accepts the text it is treated
    as plain prose and handed to ``tokenize_text`` (this branch used to repeat
    that function's body line for line).

    ``None`` entries, which the log tokenizer can produce for unmatched
    capture groups, are removed from the result.

    Returns a list of string tokens.
    """
    # `or` moves on to the next tokenizer exactly when the previous list is empty.
    words = (verify_tokenize_xml(text)
             or verify_tokenize_hippa(text)
             or verify_tokenize_log(text))
    if not words:
        words = tokenize_text(text)
    return [word for word in words if word is not None]


In [76]:
import nltk.data

# Load the pickled English Punkt tokenizer shipped with NLTK; review_to_sentences
# calls its tokenize() method on each document.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
import string

# Set of characters in string.printable; review_to_sentences drops everything else.
printable = set(string.printable)

def review_to_sentences( text):
    """Turn a raw document into one flat list of tokens.

    Steps:
      1. strip surrounding whitespace and drop every character that is not in
         ``printable``;
      2. split the cleaned text with the module-level Punkt ``tokenizer``;
      3. run ``review_to_wordlist`` on each non-empty piece and concatenate
         the resulting tokens.

    Despite the name, the return value is a flat list of tokens (not a list
    of sentences), which is the shape a scikit-learn ``tokenizer=`` callable
    has to return.
    """
    # Build a real string: on Python 3 `filter()` returns a lazy iterator,
    # which tokenizer.tokenize() cannot process.  The join gives the same
    # text on Python 2.
    text = "".join(ch for ch in text.strip() if ch in printable)
    sentences = []
    for raw_sentence in tokenizer.tokenize(text):
        if len(raw_sentence) > 0:
            sentences.extend(review_to_wordlist(raw_sentence))
    return sentences

##### Term Frequency using CountVectorizer - Cosine Similarity


In [77]:
# Raw term counts.  Tokenization is delegated entirely to review_to_sentences;
# no extra preprocessor and no stop-word list are configured on the vectorizer.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=review_to_sentences,
    preprocessor=None,
    stop_words=None,
)
train_data_features = vectorizer.fit_transform(data['text'])

In [78]:
# Cosine similarity between every pair of training documents, and the
# corresponding distance (1 - similarity).
pairwise_similarity = cosine_similarity(train_data_features)
dist = 1 - pairwise_similarity

print(pairwise_similarity.shape)
print(dist.shape)

(116, 116)
(116, 116)


In [80]:
# List every (row, column) pair whose similarity exceeds 0.6, together with the
# tags of both documents.  The diagonal (a document against itself) is included.
n_docs = len(pairwise_similarity)
for i in range(n_docs):
    for j in range(n_docs):
        score = pairwise_similarity[i][j]
        if score > 0.6:
            print(i , j,  score, data['tag'][i], data['tag'][j])

(0, 0, 1.0000000000000155, 'shortstory', 'shortstory')
(0, 1, 0.66550071448199644, 'shortstory', 'shortstory')
(0, 2, 0.63067213902944208, 'shortstory', 'shortstory')
(0, 3, 0.62927429400527113, 'shortstory', 'shortstory')
(0, 5, 0.68384797283727783, 'shortstory', 'shortstory')
(0, 6, 0.69449758671910788, 'shortstory', 'shortstory')
(0, 9, 0.60481769896147286, 'shortstory', 'shortstory')
(1, 0, 0.66550071448199644, 'shortstory', 'shortstory')
(1, 1, 0.99999999999997635, 'shortstory', 'shortstory')
(1, 2, 0.66355526366197926, 'shortstory', 'shortstory')
(1, 3, 0.72255794365432158, 'shortstory', 'shortstory')
(1, 5, 0.74182290906846249, 'shortstory', 'shortstory')
(1, 6, 0.77417140882202362, 'shortstory', 'shortstory')
(1, 8, 0.60512632809738498, 'shortstory', 'shortstory')
(1, 9, 0.62376298376350636, 'shortstory', 'shortstory')
(2, 0, 0.63067213902944208, 'shortstory', 'shortstory')
(2, 1, 0.66355526366197926, 'shortstory', 'shortstory')
(2, 2, 0.99999999999998079, 'shortstory', 'shorts

##### TF-IDF using TfidfVectorizer - Cosine Similarity

In [81]:
# TF-IDF features.  min_df / max_df are document-frequency proportions: only
# terms appearing in at least 15% and at most 20% of the documents are kept.
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(
    tokenizer=review_to_sentences,
    stop_words=stop_words,
    lowercase=True,
    min_df=0.15,
    max_df=0.2,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

train_data_features = vectorizer.fit_transform(data['text'].str.lower())

In [82]:
# (number of training documents, number of terms kept by the TF-IDF vectorizer)
train_data_features.shape

(116, 1006)

In [83]:
# Same pairwise comparison as before, now on the TF-IDF matrix; this rebinds
# pairwise_similarity and dist.
pairwise_similarity = cosine_similarity(train_data_features)
dist = 1 - pairwise_similarity

print(pairwise_similarity.shape)
print(dist.shape)

(116, 116)
(116, 116)


In [85]:
# List every pair of distinct documents whose TF-IDF cosine similarity exceeds
# 0.6, together with both tags.
n_docs = len(pairwise_similarity)
for i in range(n_docs):
    for j in range(n_docs):
        if i == j:
            continue  # a document is trivially similar to itself
        score = pairwise_similarity[i][j]
        if score > 0.6:
            print(i , j,  score, data['tag'][i], data['tag'][j])

(0, 1, 0.69630642233848838, 'shortstory', 'shortstory')
(0, 2, 0.70720766180967509, 'shortstory', 'shortstory')
(0, 3, 0.65840718323412561, 'shortstory', 'shortstory')
(0, 5, 0.68548549129755454, 'shortstory', 'shortstory')
(0, 6, 0.69164610062197218, 'shortstory', 'shortstory')
(0, 8, 0.63975304268292821, 'shortstory', 'shortstory')
(0, 9, 0.62271600454542952, 'shortstory', 'shortstory')
(1, 0, 0.69630642233848838, 'shortstory', 'shortstory')
(1, 2, 0.68612306662679823, 'shortstory', 'shortstory')
(1, 3, 0.79754384259729383, 'shortstory', 'shortstory')
(1, 4, 0.60884721724825808, 'shortstory', 'shortstory')
(1, 5, 0.83990873945867883, 'shortstory', 'shortstory')
(1, 6, 0.87502328471925628, 'shortstory', 'shortstory')
(1, 8, 0.62047338352472503, 'shortstory', 'shortstory')
(1, 9, 0.67415629275265598, 'shortstory', 'shortstory')
(2, 0, 0.70720766180967509, 'shortstory', 'shortstory')
(2, 1, 0.68612306662679823, 'shortstory', 'shortstory')
(2, 3, 0.64450292448456126, 'shortstory', 'short

In [86]:
# Vectorize the test documents with the vectorizer fitted on the training set
# (transform only, no refit), so both matrices share the same columns.
test_vector_features = vectorizer.transform(test.text)

In [87]:
# Rows are test documents, columns are training documents.
test_similarity = cosine_similarity(test_vector_features, train_data_features)

In [88]:
# (number of test documents, number of training documents)
test_similarity.shape

(30, 116)

In [89]:
# Highest similarity between the first test document and any training document.
max(test_similarity[0])

0.28160892067130783

In [90]:
# For each test document print: its own tag, the similarity of its closest
# training document, and that training document's tag.
for row in range(len(test_similarity)):
    best = test_similarity[row].argmax()
    print(test.tag[row], test_similarity[row][best], data.tag[best])

('medicine', 0.28160892067130783, 'lcd')
('medicine', 0.3264902881038278, 'medicine')
('medicine', 0.3279560874334661, 'medicine')
('medicine', 0.2243675586581832, 'mobile')
('medicine', 0.29840640894894116, 'medicine')
('medicine', 0.99737932985280975, 'medicine')
('mobile', 0.52420535792838574, 'mobile')
('mobile', 0.33918958986805348, 'mobile')
('mobile', 0.47303192314922526, 'mobile')
('mobile', 0.35181005699549156, 'technical')
('mobile', 0.5855545552280903, 'mobile')
('mobile', 0.50325686923745672, 'mobile')
('mobile', 0.24394214011619797, 'mobile')
('mobile', 0.20744370493104908, 'mobile')
('lcd', 0.35888398229643048, 'lcd')
('lcd', 0.42959703353783679, 'lcd')
('lcd', 0.29827008927396043, 'mobile')
('lcd', 0.48200513596024336, 'lcd')
('technical', 0.36524896774816068, 'technical')
('hippa', 1.0000000000000002, 'hippa')
('hippa', 1.0, 'hippa')
('hippa', 1.0, 'hippa')
('hippa', 1.0, 'hippa')
('hippa', 1.0000000000000002, 'hippa')
('mobile', 0.23346183469117593, 'medicine')
('mobil

In [106]:
# Inspect the token list of test document 26.
# NOTE(review): the `new_text` column is only created by the
# `test['new_text'] = ...` cell further down; on a fresh kernel run that first.
test.new_text[26]

[u'mobile',
 u'unit',
 u'locating',
 u'system',
 u'description',
 u'june',
 u'30',
 u'1970',
 u'filed',
 u'feb',
 u'13',
 u'1969',
 u'moorehead',
 u'etal',
 u'mobile',
 u'unit',
 u'locating',
 u'system',
 u'2',
 u'sheets',
 u'sheet',
 u'1',
 u'23',
 u'w',
 u'second',
 u'l5',
 u'unit',
 u'1',
 u'thir',
 u'receiving',
 u'receiving',
 u'station',
 u'1',
 u'station',
 u'5',
 u'retransmitter',
 u'53',
 u'j',
 u'rei',
 u'ransmi',
 u'iter',
 u'l',
 u'recfehirsitng',
 u'station',
 u'auxiliary',
 u'20',
 u'receiver',
 u'3617',
 u'reset',
 u'36',
 u'360',
 u'36',
 u'u',
 u'programmer',
 u'380a',
 u'digital',
 u'l',
 u'inter',
 u'face',
 u'3',
 u'computer',
 u'x',
 u'8',
 u'32',
 u'34',
 u'l',
 u'g',
 u'display',
 u'inventors',
 u'dan',
 u'w',
 u'patterson',
 u'donovan',
 u'lmoorehead',
 u'attorneys',
 u'june',
 u'30',
 u'1970',
 u'l',
 u'moorehead',
 u'et',
 u'al',
 u'3',
 u'518',
 u'674',
 u'mobile',
 u'unit',
 u'locating',
 u'system',
 u'filed',
 u'feb',
 u'15',
 u'1969',
 u'2',
 u'sheets',
 u

In [96]:
# Three training documents most similar to the first test document, best
# match first: argsort is ascending, so take the last three and reverse them.
top3_idx = test_similarity[0].argsort()[-3:][::-1]
ans = pd.DataFrame({
    'tag': data.tag[top3_idx],
    'cosine': test_similarity[0][top3_idx],
})

In [99]:
# Tags of the eight training documents most similar to the first test document, best match first.
data.tag[test_similarity[0].argsort()[-8:][::-1]] 

80         lcd
87         lcd
58      mobile
37    medicine
88         lcd
53      mobile
71      mobile
36    medicine
Name: tag, dtype: object

In [92]:
# Display the top-3 table built in the `ans = pd.DataFrame(...)` cell.
ans

Unnamed: 0,cosine,tag
80,0.35181,
87,0.344635,
58,0.341304,


In [101]:
# Store each test document's token list (as produced by review_to_sentences) in a new column.
test['new_text'] = test.text.apply(review_to_sentences)

In [103]:
# Inspect the token list of the first test document.
test.new_text[0]

[u'medicine',
 u'supply',
 u'apparatus',
 u'abstract',
 u'disclosed',
 u'medicine',
 u'supply',
 u'apparatus',
 u'improve',
 u'dispensing',
 u'operation',
 u'efficiency',
 u'medicine',
 u'supply',
 u'apparatus',
 u'constituted',
 u'storing',
 u'plurality',
 u'tablet',
 u'cases',
 u'main',
 u'body',
 u'case',
 u'storage',
 u'part',
 u'tablet',
 u'case',
 u'comprising',
 u'container',
 u'containing',
 u'medicine',
 u'discharge',
 u'drum',
 u'discharging',
 u'medicine',
 u'container',
 u'comprises',
 u'motor',
 u'driving',
 u'discharge',
 u'drum',
 u'controller',
 u'controlling',
 u'operation',
 u'motor',
 u'controller',
 u'comprises',
 u'means',
 u'rotating',
 u'forward',
 u'motor',
 u'discharge',
 u'medicine',
 u'detecting',
 u'medicine',
 u'sticking',
 u'medicine',
 u'sticking',
 u'occurs',
 u'motor',
 u'rotated',
 u'backward',
 u'rotated',
 u'forward',
 u'description',
 u'technical',
 u'field',
 u'present',
 u'invention',
 u'relates',
 u'medicine',
 u'supply',
 u'apparatus',
 u'suppli

##### Word2Vec Cosine Similarity

In [None]:
# Train Word2Vec on the tokenized training documents, keeping every word
# (min_count=1) and using a 10-token context window.
# NOTE(review): data['new_text'] is only created by the
# `data['new_text'] = ...` cell further down; run that first on a fresh kernel.
w2v_model = Word2Vec(data['new_text'],  min_count=1, window=10, workers=2)

In [None]:
# Show the trained model.  The cell previously held an unfinished attribute
# access (`w2v_model.`), which is a SyntaxError and stops "Run All".
w2v_model

In [None]:
import gensim
# Train a second Word2Vec model, with gensim's default settings, on the tokenized training documents.
# NOTE(review): data.new_text is only created in the next cell; run that first on a fresh kernel.
model = gensim.models.Word2Vec(data.new_text)

In [None]:
# Store each training document's token list (as produced by review_to_sentences)
# in a new column; the Word2Vec cells above read this column.
data['new_text'] = data.text.apply(review_to_sentences)

In [None]:
# Similarity between two tokens of the record-style documents.  The first token
# carries a backslash because verify_tokenize_hippa rewrites '*' as '\* '.
model.similarity('ISA\*', '276')

In [None]:
def avg_feature_vector(words, model, num_features):
    """Average the vectors of all in-vocabulary words of a document.

    Parameters:
        words: iterable of tokens.
        model: object exposing ``index2word`` (the vocabulary) and
            ``model[word]`` (the vector of a word).
        num_features: length of the returned vector; presumably has to equal
            the length of the model's word vectors - confirm at the call site.

    Returns a 1-D array: the mean of the vectors of the words found in the
    vocabulary (repeated words count each time), or a float32 zero vector of
    length ``num_features`` when none of the words is known.
    """
    vocabulary = set(model.index2word)
    known_words = [word for word in words if word in vocabulary]

    total = np.zeros((num_features,), dtype="float32")
    for word in known_words:
        total = np.add(total, model[word])

    if not known_words:
        return total
    return np.divide(total, len(known_words))

In [None]:
# `spatial` is not imported anywhere else in the notebook, so import it here.
from scipy import spatial

# The averaged vector must have the same length as the model's word vectors;
# the previously hard-coded 3000 cannot be added to them (shape mismatch).
num_features = model.vector_size

# average vector for the first training document
sentence_1 = data.new_text[0]
sentence_1_avg_vector = avg_feature_vector(sentence_1, model=model, num_features=num_features)

# average vector for the first test document
sentence_2 = test.new_text[0]
sentence_2_avg_vector = avg_feature_vector(sentence_2, model=model, num_features=num_features)

# cosine similarity = 1 - cosine distance
sen1_sen2_similarity =  1 - spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector)

In [None]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

In [None]:


from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

# Number of CPUs reported by the OS; passed as `workers` when the Doc2Vec model is created.
cores = multiprocessing.cpu_count()
# Stop early when gensim's FAST_VERSION flag is not above -1.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the training tags as integer class ids.
labels = LabelEncoder()
y = labels.fit_transform(data.tag)
print(y.shape)

In [None]:
# Untrained Doc2Vec model: 5-dimensional vectors, 10-token window, every word kept (min_count=1).
# NOTE(review): `size=` looks like the older gensim keyword for the vector length - confirm against the installed version.
doc2vec_model = gensim.models.doc2vec.Doc2Vec(min_count=1, window=10, size=5, sample=1e-4, negative=5, workers=cores)

In [None]:
# Build the model vocabulary from the training corpus.
# NOTE(review): `train_corpus` is not defined in any visible cell of this notebook;
# it has to be built (presumably like `test_corpus` below) before this cell can run.
doc2vec_model.build_vocab(train_corpus)

In [None]:
# Train the Doc2Vec model on the training corpus.
# NOTE(review): `train_corpus` is not defined in any visible cell of this notebook.
doc2vec_model.train(train_corpus)

In [None]:
# Smoke test: infer a document vector for an ad-hoc token list.
doc2vec_model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])


In [None]:
# Wrap every test document as a TaggedDocument: tokens from gensim's
# simple_preprocess plus the document's tag.
test_corpus = []
for i, line in enumerate(test.text):
    words = gensim.utils.simple_preprocess(line)
    # `tags` must be a list; a bare string would be read as a sequence of
    # one-character tags.
    test_corpus.append(gensim.models.doc2vec.TaggedDocument(words, [test.tag[i]]))

In [None]:
# For every training document, re-infer its vector and rank all stored
# document vectors by similarity to it.
len_doc_vec = len(doc2vec_model.docvecs)
for doc_id in range(len(train_corpus)):
    # Index with the loop variable; the cell used to read `i`, a leftover from
    # an earlier cell, so the same document was inferred on every iteration.
    inferred_vector = doc2vec_model.infer_vector(train_corpus[doc_id].words)
    sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=len_doc_vec)
    print(sims)

In [None]:
# Same ranking for the first training document only.
# Relies on `len_doc_vec` from the previous cell.
inferred_vector = doc2vec_model.infer_vector(train_corpus[0].words)
doc2vec_model.docvecs.most_similar([inferred_vector], topn=len_doc_vec)

In [None]:
# Number of document vectors held by the model.
doc2vec_model.docvecs.count

# Scratch: split on whitespace, or on ',' / '.' when the separator is neither
# preceded nor followed by a digit (so "3.4" and "5,6" stay whole).
# Alternative sample that exercises the digit rule (it was assigned here and
# immediately overwritten, so it never took effect):
#   s = "<one-one> two 3.4 5,6 seven.eight nine,ten"
s = '<guestbook><guest><fname>Terje</fname><lname>Beck</lname></guest><guest><fname>Jan</fname><lname>Refsnes</lname></guest><guest><fname>Torleif</fname><lname>Rasmussen</lname></guest>'
# Raw string: '\s' and '\d' are not valid Python string escapes.
parts = re.split(r'\s|(?<!\d)[,.](?!\d)', s)
parts

# Scratch: the same padding-and-splitting that verify_tokenize_xml performs,
# tried on a sample whose element text contains spaces, digits and dots.
s = '<guestbook><guest><fname>Terje Prarrot</fname><lname>Beck</lname></guest><guest><fname>2.54</fname><lname>Ref58j_jjk .vfd.bdfsnes</lname></guest><guest><fname>Torleif</fname><lname>Rasmussen</lname></guest>'
# Pad every tag with a space on the outside so tags and text separate cleanly.
s = s.replace('<', ' <').replace('>', '> ')
re.split(' |\n', s)

import re

# One named group per token kind.  Alternatives are tried in the order they are
# written, so "${" is recognised before a lone "{" and "\n" before generic
# whitespace.  The pattern is compiled in VERBOSE mode, so the line breaks
# inside the literal are ignored.
token_pattern = r"""
(?P<identifier>[a-zA-Z_][a-zA-Z0-9_]*)
|(?P<integer>[0-9]+)
|(?P<dot>\.)
|(?P<open_variable>[$][{])
|(?P<open_curly>[{])
|(?P<close_curly>[}])
|(?P<newline>\n)
|(?P<whitespace>\s+)
|(?P<equals>[=])
|(?P<slash>[/])
"""

token_re = re.compile(token_pattern, re.VERBOSE)


class TokenizerException(Exception):
    """Raised by ``tokenize`` when part of the input matches no token kind."""


def tokenize(text):
    """Lazily yield ``(token_name, token_text)`` pairs covering ``text``.

    Tokens are matched back to back starting at position 0; ``token_name`` is
    the name of the group in ``token_pattern`` that matched.  If matching
    stops before the end of the text, ``TokenizerException`` is raised after
    the tokens found so far have been yielded.
    """
    total = len(text)
    pos = 0
    match = token_re.match(text, pos)
    while match is not None:
        kind = match.lastgroup
        yield kind, match.group(kind)
        pos = match.end()
        match = token_re.match(text, pos)
    if pos != total:
        raise TokenizerException('tokenizer stopped at pos %r of %r' % (
            pos, total))