# CSI 4107, Winter 2021
## Assignment 1 - Microblog information retrieval system

### Step 1: Preprocessing

In [1]:
import pandas as pd
import numpy as np
import math
import pandas_read_xml as pdx
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk import word_tokenize
import numpy

In [2]:
queriesDf = pdx.read_xml("../queries.xml", ['queries', 'top'])
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432


In [3]:
#read tweets and transform into dataframe
data = []
with open('../Trec_microblog11.txt', 'r', encoding='utf-8', errors='replace') as infile:
    lines = infile.readlines()
    for i in lines:
        data.append(i.split('\t'))
        
data = np.array(data) #2d numpy array
        
df = pd.DataFrame({'ID': data[:, 0], 'text': data[:, 1]})
df.head()

Unnamed: 0,ID,text
0,﻿34952194402811904,Save BBC World Service from Savage Cuts http:/...
1,34952186328784896,a lot of people always make fun about the end ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...
4,34952008683229185,Latest:: Top World Releases http://globalclass...


In [4]:
stop = stopwords.words('english')

In [5]:
df['text_without_stopwords'] = df['text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
df['text_without_stopwords']=df['text_without_stopwords'].apply(str.lower)

queriesDf['text_without_stopwords'] = queriesDf['title'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].apply(str.lower)

  df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
  queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')


In [6]:
queriesDf['topic_id'] = queriesDf['num'].apply(lambda x : x.split(' '))
queriesDf['topic_id'] = queriesDf['topic_id'].apply(lambda x: x[1])

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
pstemmer = PorterStemmer()

#tokenize and stem a string
#not used cuz got worse results with stemmed words
def tokenizer_stemmer(str):
    tokens = tokenizer.tokenize(str)
    for i in range(len(tokens)):
        tokens[i] = pstemmer.stem(tokens[i])
    return tokens

#tokenize dataframes
df['token'] = df['text_without_stopwords'].apply(lambda x: tokenizer.tokenize(x))
queriesDf['token'] = queriesDf['text_without_stopwords'].apply(lambda x: tokenizer.tokenize(x))

In [8]:
# detokenize for easier query/document vectorization
df['detoken']=df['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))
queriesDf['detoken']=queriesDf['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))

In [9]:
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime,text_without_stopwords,topic_id,token,detoken
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904,bbc world service staff cuts,MB001,"[bbc, world, service, staff, cuts]",bbc world service staff cuts
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040,fifa soccer,MB002,"[fifa, soccer]",fifa soccer
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665,haiti aristide return,MB003,"[haiti, aristide, return]",haiti aristide return
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448,mexico drug war,MB004,"[mexico, drug, war]",mexico drug war
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432,nist computer security,MB005,"[nist, computer, security]",nist computer security


In [10]:
df.head()


Unnamed: 0,ID,text,text_without_stopwords,token,detoken
0,﻿34952194402811904,Save BBC World Service from Savage Cuts http:/...,save bbc world service savage cuts http://www....,"[save, bbc, world, service, savage, cuts, http...",save bbc world service savage cuts http www pe...
1,34952186328784896,a lot of people always make fun about the end ...,lot people always make fun end world question ...,"[lot, people, always, make, fun, end, world, q...",lot people always make fun end world question ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...,rethink group positive outlook: technology sta...,"[rethink, group, positive, outlook, technology...",rethink group positive outlook technology staf...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...,'zombie' fund manager phoenix appoints new ceo...,"[zombie, fund, manager, phoenix, appoints, new...",zombie fund manager phoenix appoints new ceo p...
4,34952008683229185,Latest:: Top World Releases http://globalclass...,latest:: top world releases http://globalclass...,"[latest, top, world, releases, http, globalcla...",latest top world releases http globalclassifie...


In [11]:

#df['token'][26]#test to see if numbers have been removed
#print(df['token'][12898])

### Step 2: Indexing

In [12]:


# key [doc]
# reverse index
def create_index (data):
        index = defaultdict(list)
        count=0
        for i, tokens in enumerate(data):
            for token in tokens:
                index[token].append(i)
                #index[save].append(doc)
        return index



def count_occurrences(arr):
    return list(Counter(arr).items())



# key, [(doc#, count)]
def final_index(index):
    final=defaultdict(list)
    for key in index.keys():
        final[key] = count_occurrences(index[key])
    return final

# each token (save, bbc) as your dict key
# append the count_occurrences based on index
#final_index creates the final hash table for inverted index. {word: [(doc#, number of occurrences in that document)]}
# doc# in this case is the line number of the table, not the message ID.
    
    

In [13]:
test=create_index(df['token'])
test2=count_occurrences([0, 1, 44, 44, 443])
rev_index=final_index(test)

#print(len(rev_index.keys()))

In [14]:
print(list(rev_index.keys())[0:100]) # get a sample of 100 tokens from vocab

['save', 'bbc', 'world', 'service', 'savage', 'cuts', 'http', 'www', 'petitionbuzz', 'com', 'petitions', 'savews', 'lot', 'people', 'always', 'make', 'fun', 'end', 'question', 'is', 'are', 'u', 'ready', 'for', 'it', 'rethink', 'group', 'positive', 'outlook', 'technology', 'staffing', 'specialist', 'expects', 'revenues', 'marg', 'bit', 'ly', 'hfjtmy', 'zombie', 'fund', 'manager', 'phoenix', 'appoints', 'new', 'ceo', 'buys', 'funds', 'closed', 'business', 'dxrlh', 'latest', 'top', 'releases', 'globalclassified', 'net', 'cdt', 'presents', 'alice', 'in', 'wonderland', 'catonsville', 'dinner', 'posted', 'the', 'fb', 'me', 'gmicayt', 'territory', 'location', 'calgary', 'alberta', 'canada', 'job', 'category', 'bu', 'eomt', 'jobs', 'news', 'today', 'free', 'school', 'funding', 'plans', 'lack', 'transparency', 'co', 'uk', 'hi', 'newsid_', 'stm', 'manchester', 'city', 'council', 'details', 'saving', 'plan', 'fypypc', 'depressing', 'apparently', 'we']


## Step 3: Retrieval and Ranking

In [15]:
# query: list of tokens for a query
# docs: inverted hash index with key:value of token:[(doc_num,num_occurances,...)]
def related_documents_to_query(query, docs):
    related_docs = []
    for token in query:
        if token in docs:
            related_docs+=docs[token]

    nums_only = []
    for x in related_docs:
        nums_only.append(x[0])

    return set(nums_only)


#get vocabulary
vocab= []
for arr in df['token']:
    vocab += arr
vocab = set(vocab)

#create td-idf matrix
corpus = df['detoken']
docVectorizer = TfidfVectorizer(vocabulary = vocab, sublinear_tf=True)
docMatrix = docVectorizer.fit_transform(corpus)
docVocabulary = docVectorizer.vocabulary_
#docMatrix.toarray()


#find cosine similarity
#returns an array of (doc_num, similarity) tuples
def similarity_cosine(query_vector, docs):
    sim_arr = []
    
    for doc in docs: 
        similarity = pairwise.cosine_similarity(doc[1], query_vector)
        sim_arr.append((doc[0], similarity[0][0]))
    return sim_arr 


print(len(docVocabulary)) 
    

87685


In [16]:

# final results: array of length # of queries with tuples of (query, top_1000_docs)
# where top_1000_docs is an array of tuples of (doc_num, similarity)
final_results = []

for index, q_row in queriesDf.iterrows():
    # for each query, get set of related docs (doc #, num of occurences)
    related_docs = related_documents_to_query(q_row['token'], rev_index)
    
    related_docs_matrix = []
    #get vectors of related docs
    for doc_num in related_docs:
        related_docs_matrix.append((doc_num, docMatrix[doc_num]) )

    # get similarities
    qvec = docVectorizer.transform([q_row['detoken']]) # vectorize query
        
    sim_arr = similarity_cosine(qvec, related_docs_matrix)
    
    #sort by similarity
    ranked = sorted(sim_arr, key = lambda x : x[1], reverse = True)
    
    # rank top 1000
    top_1000 = ranked[0:1000]
    
    # add to final results array
    final_results.append( (q_row['num'], top_1000) )
    


    
    

In [17]:
docresults=[] #1d list of doc nums only
doc_results_w_num = [] #list of lists containing docnums for each query result
for i in range(len(final_results)):
    doc_results_w_num.append([])
    for j in range(len(final_results[i][1])): #array of 1000 docs in tuples (doc_row, similarity)
        docno = df['ID'][final_results[i][1][j][0]]
        docresults.append(docno)
        doc_results_w_num[i].append(docno)
        

In [29]:
#using sent2vec code from https://pypi.org/project/sent2vec/
from sent2vec.vectorizer import Vectorizer

vectorizer = Vectorizer()

vectorizer.bert(queriesDf['detoken'])
vectors_tmp = vectorizer.vectors

print(vectors_tmp.shape)

(49, 768)


In [34]:
# convert queries to vectors and write to file
query_vecs_file = open("query_vecs.txt", "w+", encoding="utf-8") 
query_vecs_file.write("\n".join(" ".join(map(str, x)) for x in vectors_tmp))
query_vecs_file.close()

In [40]:
# RUN ONLY ONCE, takes a long time
# convert each document to a vector and write to file, do in batches of 25 docs to avoid using too much RAM
docs_vecs_file = open("doc_vecs.txt", "w+", encoding="utf-8") 
for i in range(0, len(df['detoken']), 25): #process all docs in  batches of 50
    vectors_tmp = []
    print(i)
    vectorizer.bert(df['detoken'][i:i+25])
    vectors_tmp = vectorizer.vectors
    #print(vectors_tmp.shape)
    docs_vecs_file.write("\n".join(" ".join(map(str, x)) for x in vectors_tmp))
    docs_vecs_file.write("\n")
docs_vecs_file.close()

0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975
1000
1025
1050
1075
1100
1125
1150
1175
1200
1225
1250
1275
1300
1325
1350
1375
1400
1425
1450
1475
1500
1525
1550
1575
1600
1625
1650
1675
1700
1725
1750
1775
1800
1825
1850
1875
1900
1925
1950
1975
2000
2025
2050
2075
2100
2125
2150
2175
2200
2225
2250
2275
2300
2325
2350
2375
2400
2425
2450
2475
2500
2525
2550
2575
2600
2625
2650
2675
2700
2725
2750
2775
2800
2825
2850
2875
2900
2925
2950
2975
3000
3025
3050
3075
3100
3125
3150
3175
3200
3225
3250
3275
3300
3325
3350
3375
3400
3425
3450
3475
3500
3525
3550
3575
3600
3625
3650
3675
3700
3725
3750
3775
3800
3825
3850
3875
3900
3925
3950
3975
4000
4025
4050
4075
4100
4125
4150
4175
4200
4225
4250
4275
4300
4325
4350
4375
4400
4425
4450
4475
4500
4525
4550
4575
4600
4625
4650
4675
4700
4725
4750
4775
4800
4825
4850
4875
4900
4925
4950
4975
5000
5025
5050
5075
5100
5125
5150
5175
5200


36000
36025
36050
36075
36100
36125
36150
36175
36200
36225
36250
36275
36300
36325
36350
36375
36400
36425
36450
36475
36500
36525
36550
36575
36600
36625
36650
36675
36700
36725
36750
36775
36800
36825
36850
36875
36900
36925
36950
36975
37000
37025
37050
37075
37100
37125
37150
37175
37200
37225
37250
37275
37300
37325
37350
37375
37400
37425
37450
37475
37500
37525
37550
37575
37600
37625
37650
37675
37700
37725
37750
37775
37800
37825
37850
37875
37900
37925
37950
37975
38000
38025
38050
38075
38100
38125
38150
38175
38200
38225
38250
38275
38300
38325
38350
38375
38400
38425
38450
38475
38500
38525
38550
38575
38600
38625
38650
38675
38700
38725
38750
38775
38800
38825
38850
38875
38900
38925
38950
38975
39000
39025
39050
39075
39100
39125
39150
39175
39200
39225
39250
39275
39300
39325
39350
39375
39400
39425
39450
39475
39500
39525
39550
39575
39600
39625
39650
39675
39700
39725
39750
39775
39800
39825
39850
39875
39900
39925
39950
39975
40000
40025
40050
40075
40100
40125
4015

In [42]:
#converts a str line of decimals, seperated by spaces, to array of decimals
def str_to_vec(line):
    line = line.rstrip().split(" ")
    return [float(x) for x in line]

# read query vectors from file
query_vectors = []
with open("query_vecs.txt", "r", encoding="utf-8") as f:
    query_vectors = [str_to_vec(line) for line in f]

# read doc vectors from file
doc_vectors = []
with open("doc_vecs.txt", "r", encoding="utf-8") as f:
    doc_vectors = [str_to_vec(line) for line in f]


#print(query_vectors[0])
print(len(query_vectors))
print(len(doc_vectors))

49
45899


In [55]:
final_results2 = []

for index, q_row in queriesDf.iterrows():
    # for each query, get top_1000 docs previously retreived
    related_docs =  doc_results_w_num[index]
    related_docs_matrix = []
    
    #get vectors of related docs
    for doc_num in related_docs:
        # get vector of each doc using new model
        #related_docs_tokens = related_docs_tokens.append((df.loc[df['ID']==doc_num]['detoken']).item())
        doc_index =  df.loc[df['ID']==doc_num]['detoken'].index
        doc_vec = doc_vectors[doc_index[0]]
        related_docs_matrix.append((doc_num, [doc_vec]))  
            
    # get query vector
    qvec = query_vectors[index]
    sim_arr = similarity_cosine([qvec], related_docs_matrix)
    
    #sort by similarity
    ranked = sorted(sim_arr, key = lambda x : x[1], reverse = True)
    
    # add to final results array
    final_results2.append( (q_row['num'], ranked) )
    

## Step 4: Write to file

In [53]:
# sample line: topic_id Q0 docno rank score tag
# topic_id: queriesDf['topic_id'][i]
# docno: df['ID'][resultsMatrix[i][j][0]]
# rank: j from resultsMatrix[i], +1 because highest rank is 1 not 0
# score: resultsMatrix[i][j][1]
# tag: passed in at function call

def write_results(tag, resultMatrix):
    
    f = open("results.txt", "w+", encoding="utf-8")
    
    for i in range(len(resultMatrix)):
        topic_id = queriesDf['topic_id'][i][2:].lstrip('0') #[2:].lstrip('0') to strip "MB" and leading 0s to end up with plain number

        for j in range(len(resultMatrix[i][1])): #array of 1000 docs in tuples (doc_row, similarity)
            
            docno = resultMatrix[i][1][j][0]
            rank = j +1
            score = resultMatrix[i][1][j][1]
            new_line = topic_id + " Q0 " + str(docno) + " " + str(rank) + " " + str(score) + " " + tag + "\n"
            f.write(new_line)
        
    f.close()
    

In [54]:
write_results('STANDARD',final_results2)
print('done')

done
