# CSI 4107, Winter 2021
## Assignment 1 - Microblog information retrieval system

### Step 1: Preprocessing

In [3]:
import pandas as pd
import numpy as np
import math
import pandas_read_xml as pdx
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise
from scipy import spatial

In [4]:
queriesDf = pdx.read_xml("../queries.xml", ['queries', 'top'])
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432


In [5]:
#read tweets and transform into dataframe
data = []
with open('../Trec_microblog11.txt', 'r', encoding='utf-8', errors='replace') as infile:
    lines = infile.readlines()
    for i in lines:
        data.append(i.split('\t'))
        
data = np.array(data) #2d numpy array
        
df = pd.DataFrame({'ID': data[:, 0], 'text': data[:, 1]})
df.head()

Unnamed: 0,ID,text
0,﻿34952194402811904,Save BBC World Service from Savage Cuts http:/...
1,34952186328784896,a lot of people always make fun about the end ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...
4,34952008683229185,Latest:: Top World Releases http://globalclass...


In [6]:
stop = stopwords.words('english')

In [7]:
df['text_without_stopwords'] = df['text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
df['text_without_stopwords']=df['text_without_stopwords'].apply(str.lower)

queriesDf['text_without_stopwords'] = queriesDf['title'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].apply(str.lower)

  df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
  queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')


In [8]:
queriesDf['topic_id'] = queriesDf['num'].apply(lambda x : x.split(' '))
queriesDf['topic_id'] = queriesDf['topic_id'].apply(lambda x: x[1])

In [9]:
tokenizer = RegexpTokenizer(r'\w+')
pstemmer = PorterStemmer()

#tokenize and stem a string
#not used cuz got worse results with stemmed words
def tokenizer_stemmer(str):
    tokens = tokenizer.tokenize(str)
    for i in range(len(tokens)):
        tokens[i] = pstemmer.stem(tokens[i])
    return tokens

#tokenize dataframes
df['token'] = df['text_without_stopwords'].apply(lambda x: tokenizer.tokenize(x))
queriesDf['token'] = queriesDf['text_without_stopwords'].apply(lambda x: tokenizer.tokenize(x))

In [10]:
# detokenize for easier query/document vectorization
df['detoken']=df['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))
queriesDf['detoken']=queriesDf['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))

In [11]:
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime,text_without_stopwords,topic_id,token,detoken
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904,bbc world service staff cuts,MB001,"[bbc, world, service, staff, cuts]",bbc world service staff cuts
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040,fifa soccer,MB002,"[fifa, soccer]",fifa soccer
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665,haiti aristide return,MB003,"[haiti, aristide, return]",haiti aristide return
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448,mexico drug war,MB004,"[mexico, drug, war]",mexico drug war
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432,nist computer security,MB005,"[nist, computer, security]",nist computer security


In [12]:
df.head()


Unnamed: 0,ID,text,text_without_stopwords,token,detoken
0,﻿34952194402811904,Save BBC World Service from Savage Cuts http:/...,save bbc world service savage cuts http://www....,"[save, bbc, world, service, savage, cuts, http...",save bbc world service savage cuts http www pe...
1,34952186328784896,a lot of people always make fun about the end ...,lot people always make fun end world question ...,"[lot, people, always, make, fun, end, world, q...",lot people always make fun end world question ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...,rethink group positive outlook: technology sta...,"[rethink, group, positive, outlook, technology...",rethink group positive outlook technology staf...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...,'zombie' fund manager phoenix appoints new ceo...,"[zombie, fund, manager, phoenix, appoints, new...",zombie fund manager phoenix appoints new ceo p...
4,34952008683229185,Latest:: Top World Releases http://globalclass...,latest:: top world releases http://globalclass...,"[latest, top, world, releases, http, globalcla...",latest top world releases http globalclassifie...


In [13]:

#df['token'][26]#test to see if numbers have been removed
#print(df['token'][12898])

### Step 2: Indexing

In [14]:


# key [doc]
# reverse index
def create_index (data):
        index = defaultdict(list)
        count=0
        for i, tokens in enumerate(data):
            for token in tokens:
                index[token].append(i)
                #index[save].append(doc)
        return index



def count_occurrences(arr):
    return list(Counter(arr).items())



# key, [(doc#, count)]
def final_index(index):
    final=defaultdict(list)
    for key in index.keys():
        final[key] = count_occurrences(index[key])
    return final

# each token (save, bbc) as your dict key
# append the count_occurrences based on index
#final_index creates the final hash table for inverted index. {word: [(doc#, number of occurrences in that document)]}
# doc# in this case is the line number of the table, not the message ID.
    
    

In [15]:
test=create_index(df['token'])
test2=count_occurrences([0, 1, 44, 44, 443])
rev_index=final_index(test)

#print(len(rev_index.keys()))

In [16]:
print(list(rev_index.keys())[0:100]) # get a sample of 100 tokens from vocab

['save', 'bbc', 'world', 'service', 'savage', 'cuts', 'http', 'www', 'petitionbuzz', 'com', 'petitions', 'savews', 'lot', 'people', 'always', 'make', 'fun', 'end', 'question', 'is', 'are', 'u', 'ready', 'for', 'it', 'rethink', 'group', 'positive', 'outlook', 'technology', 'staffing', 'specialist', 'expects', 'revenues', 'marg', 'bit', 'ly', 'hfjtmy', 'zombie', 'fund', 'manager', 'phoenix', 'appoints', 'new', 'ceo', 'buys', 'funds', 'closed', 'business', 'dxrlh', 'latest', 'top', 'releases', 'globalclassified', 'net', 'cdt', 'presents', 'alice', 'in', 'wonderland', 'catonsville', 'dinner', 'posted', 'the', 'fb', 'me', 'gmicayt', 'territory', 'location', 'calgary', 'alberta', 'canada', 'job', 'category', 'bu', 'eomt', 'jobs', 'news', 'today', 'free', 'school', 'funding', 'plans', 'lack', 'transparency', 'co', 'uk', 'hi', 'newsid_', 'stm', 'manchester', 'city', 'council', 'details', 'saving', 'plan', 'fypypc', 'depressing', 'apparently', 'we']


## Step 3: Retrieval and Ranking

#### Initializing docvectorizer, and doc matrix

In [17]:
# query: list of tokens for a query
# docs: inverted hash index with key:value of token:[(doc_num,num_occurances,...)]
def related_documents_to_query(query, docs):
    related_docs = []
    for token in query:
        if token in docs:
            related_docs+=docs[token]

    nums_only = []
    for x in related_docs:
        nums_only.append(x[0])

    return set(nums_only)


#get vocabulary
vocab= []
for arr in df['token']:
    vocab += arr
vocab = set(vocab)

#create td-idf matrix
corpus = df['detoken']
docVectorizer = TfidfVectorizer(vocabulary = vocab, sublinear_tf=True)
docMatrix = docVectorizer.fit_transform(corpus)
docVocabulary = docVectorizer.vocabulary_
#docMatrix.toarray()


#find cosine similarity
#returns an array of (doc_num, similarity) tuples
def similarity_cosine(query_vector, docs):
    sim_arr = []
    
    for doc in docs: 
        similarity = pairwise.cosine_similarity(doc[1], query_vector)
        sim_arr.append((doc[0], similarity[0][0]))
    return sim_arr 


print(len(docVocabulary)) 
    

87685


#### Expanding queries with Pre trained neural model words embeddings

In [18]:
##Query expansion using trained neural model

# followed tutorial from https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db
embeddings_dict = {}
# load word vectors
# pre trained twitter word vectors downloaded from http://nlp.stanford.edu/data/glove.twitter.27B.zip
with open("glovetwitter27B/glove.twitter.27B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [19]:

# return list of most similar words and eucladean distancec to a given word embedding vector
def find_closest_embeddings(embedding):
    distances = []
    for word in embeddings_dict.keys():
        try:
            dist = spatial.distance.euclidean(embeddings_dict[word], embedding)
            distances.append( (word, dist) )
        except Exception as e:
            continue
            #print(word)
    return sorted(distances, key=lambda tup: tup[1])
    #return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

# return expanded list containing similar words given  list of tokens
def expand_query_tokens( q_tokens ):
    expanded = q_tokens[:]
    for token in q_tokens:
        if token in embeddings_dict:
            closest = find_closest_embeddings(embeddings_dict[token])[1:4] # get 3 closest words
            expanded+= [tup[0] for tup in closest]
    return expanded

#print(expand_query_tokens(['hello','King'])) #King not in dict, expected ['hello', 'King', 'welcome', 'hey', 'dear']

In [20]:
f = open("expanded_queries.txt", "w+", encoding="utf-8") # to write expanded queries to file so as not to have to calculate again

#get an array containing the expanded queries
expanded_queries_arr = []
for index, q_row in queriesDf.iterrows():
    #expand the query vector
    expanded_query = expand_query_tokens( q_row['token'] )
    expanded_query = " ".join(str(x) for x in expanded_query) #detokenize
    expanded_queries_arr.append(expanded_query)
    f.write(expanded_query+'\n')  

f.close()

[('cnn', 2.745408773422241), ('headline', 2.880249261856079), ('nbc', 2.8921194076538086)]
[('earth', 2.754417896270752), ('dream', 2.7935903072357178), ('one', 2.8911826610565186)]
[('services', 2.6111581325531006), ('office', 2.843423366546631), ('client', 2.888474225997925)]
[('department', 2.9020767211914062), ('office', 2.931246280670166), ('group', 3.0342190265655518)]
[('raises', 3.110835075378418), ('costs', 3.321230411529541), ('cut', 3.38932204246521)]
35108223539224576	www.crack4djs.net/thegoodfellas New releases coming out on Thursday!! Currently ranked #1... Go check it out!!
 ['bbc', 'world', 'service', 'staff', 'cuts', 'cnn', 'headline', 'nbc', 'earth', 'dream', 'one', 'services', 'office', 'client', 'department', 'office', 'group', 'raises', 'costs', 'cut']
[('sport', 3.3742122650146484), ('league', 3.3807621002197266), ('xbox', 3.4306488037109375)]
[('football', 2.1471357345581055), ('volleyball', 2.4756181240081787), ('softball', 2.520517110824585)]
[('tanzania', 2.75

[('superbowl', 2.8949451446533203), ('buffalo', 3.615056037902832), ('halftime', 3.6355602741241455)]
[('seat', 3.1654200553894043), ('row', 3.3320138454437256), ('rooms', 3.7179458141326904)]
[('ped', 3.217222213745117), ('coe', 3.262561082839966), ('ems', 3.2670984268188477)]
[('incheon', 3.7860827445983887), ('bangkok', 3.8627419471740723), ('haneda', 3.8904106616973877)]
[('screenings', 2.920501232147217), ('airs', 3.1761252880096436), ('exhibition', 3.263402223587036)]
[('bring', 2.7794084548950195), ('our', 2.7977209091186523), ('will', 2.8255105018615723)]
[('inflation', 2.9625184535980225), ('debt', 2.9788923263549805), ('jobless', 3.1102092266082764)]
[('consume', 3.232283115386963), ('produce', 3.292715549468994), ('diabetes', 3.577742338180542)]
[('fuel', 3.0813217163085938), ('boost', 3.25868558883667), ('supply', 3.3655455112457275)]
[('increases', 3.4026997089385986), ('usage', 3.5274658203125), ('reducing', 3.6390697956085205)]
[('chicago', 1.5348154306411743), ('memphis

#### Getting the Ranked Results

In [21]:
def rank_by_similarity(query_str, related_docs_matrix):
    # get similarities
    qvec = docVectorizer.transform([query_str]) # vectorize query
    sim_arr = similarity_cosine(qvec, related_docs_matrix)
    
    #sort by similarity
    ranked = sorted(sim_arr, key = lambda x : x[1], reverse = True)
    return ranked
    
# final results: array of length # of queries with tuples of (query, top_1000_docs)
# where top_1000_docs is an array of tuples of (doc_num, similarity)
final_results = []
for index, q_row in queriesDf.iterrows():
    # for each query, get set of related docs (doc #, num of occurences)
    related_docs = related_documents_to_query(q_row['token'], rev_index)
    
    related_docs_matrix = []
    #get vectors of related docs
    for doc_num in related_docs:
        related_docs_matrix.append((doc_num, docMatrix[doc_num]) )
    
    # get ranked related docs
    ranked = rank_by_similarity(q_row['detoken'],related_docs_matrix )   
    # rank top 1000
    top_1000 = ranked[0:1000]
    
    #rerank top 1000 after expanded query vectors
    related_docs_matrix_1000 = []
    docnums_1000 = [tup[0] for tup in top_1000]
    for doc_num in docnums_1000:
        related_docs_matrix_1000.append((doc_num, docMatrix[doc_num]) )
        
    reranked_1000 = rank_by_similarity(expanded_queries_arr[index] ,related_docs_matrix_1000 )
    
    # add to final results array
    final_results.append( (q_row['num'], reranked_1000) )

## Step 4: Write to file

In [23]:
# sample line: topic_id Q0 docno rank score tag
# topic_id: queriesDf['topic_id'][i]
# docno: df['ID'][resultsMatrix[i][j][0]]
# rank: j from resultsMatrix[i], +1 because highest rank is 1 not 0
# score: resultsMatrix[i][j][1]
# tag: passed in at function call

def write_results(tag, resultMatrix):
    
    f = open("results.txt", "w+", encoding="utf-8")
    
    for i in range(len(resultMatrix)):
        topic_id = queriesDf['topic_id'][i][2:].lstrip('0') #[2:].lstrip('0') to strip "MB" and leading 0s to end up with plain number

        for j in range(len(resultMatrix[i][1])): #array of 1000 docs in tuples (doc_row, similarity)
            
            docno = df['ID'][resultMatrix[i][1][j][0]]
            rank = j +1
            score = resultMatrix[i][1][j][1]
            new_line = topic_id + " Q0 " + str(docno) + " " + str(rank) + " " + str(score) + " " + tag + "\n"
            f.write(new_line)
        
    f.close()
    

In [24]:
write_results('STANDARD',final_results)