# CSI 4107, Winter 2021
## Assignment 1 - Microblog information retrieval system

### Step 1: Preprocessing

In [43]:
import pandas as pd
import numpy as np
import math
import pandas_read_xml as pdx
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise

In [44]:
queriesDf = pdx.read_xml("queries.xml", ['queries', 'top'])
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432


In [45]:
df=pd.read_excel("messages.xls", 
                  header=None, usecols=[0, 1], names=['ID', 'text'])
df.head()

Unnamed: 0,ID,text
0,34952194402811904,Save BBC World Service from Savage Cuts http:/...
1,34952186328784896,a lot of people always make fun about the end ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...
4,34952008683229185,Latest:: Top World Releases http://globalclass...


In [46]:
stop = stopwords.words('english')

In [47]:
df['text_without_stopwords'] = df['text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
df['text_without_stopwords']=df['text_without_stopwords'].apply(str.lower)

queriesDf['text_without_stopwords'] = queriesDf['title'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].apply(str.lower)

  df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
  queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')


In [48]:
queriesDf['topic_id'] = queriesDf['num'].apply(lambda x : x.split(' '))
queriesDf['topic_id'] = queriesDf['topic_id'].apply(lambda x: x[1])

In [49]:
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime,text_without_stopwords,topic_id
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904,bbc world service staff cuts,MB001
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040,fifa soccer,MB002
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665,haiti aristide return,MB003
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448,mexico drug war,MB004
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432,nist computer security,MB005


In [50]:
df.head()

Unnamed: 0,ID,text,text_without_stopwords
0,34952194402811904,Save BBC World Service from Savage Cuts http:/...,save bbc world service savage cuts http://www....
1,34952186328784896,a lot of people always make fun about the end ...,lot people always make fun end world question ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...,rethink group positive outlook: technology sta...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...,'zombie' fund manager phoenix appoints new ceo...
4,34952008683229185,Latest:: Top World Releases http://globalclass...,latest:: top world releases http://globalclass...


In [51]:
tokenizer = RegexpTokenizer(r'\w+')
df['token'] = df['text_without_stopwords'].apply(tokenizer.tokenize)
queriesDf['token'] = queriesDf['text_without_stopwords'].apply(tokenizer.tokenize)

In [52]:
# detokenize for easier query/document vectorization
df['detoken']=df['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))
queriesDf['detoken']=queriesDf['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))

In [53]:
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime,text_without_stopwords,topic_id,token,detoken
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904,bbc world service staff cuts,MB001,"[bbc, world, service, staff, cuts]",bbc world service staff cuts
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040,fifa soccer,MB002,"[fifa, soccer]",fifa soccer
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665,haiti aristide return,MB003,"[haiti, aristide, return]",haiti aristide return
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448,mexico drug war,MB004,"[mexico, drug, war]",mexico drug war
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432,nist computer security,MB005,"[nist, computer, security]",nist computer security


In [54]:
df.head()


Unnamed: 0,ID,text,text_without_stopwords,token,detoken
0,34952194402811904,Save BBC World Service from Savage Cuts http:/...,save bbc world service savage cuts http://www....,"[save, bbc, world, service, savage, cuts, http...",save bbc world service savage cuts http www pe...
1,34952186328784896,a lot of people always make fun about the end ...,lot people always make fun end world question ...,"[lot, people, always, make, fun, end, world, q...",lot people always make fun end world question ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...,rethink group positive outlook: technology sta...,"[rethink, group, positive, outlook, technology...",rethink group positive outlook technology staf...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...,'zombie' fund manager phoenix appoints new ceo...,"[zombie, fund, manager, phoenix, appoints, new...",zombie fund manager phoenix appoints new ceo p...
4,34952008683229185,Latest:: Top World Releases http://globalclass...,latest:: top world releases http://globalclass...,"[latest, top, world, releases, http, globalcla...",latest top world releases http globalclassifie...


In [55]:
#test to see if numbers have been removed
df['token'][26]


['bbc',
 'news',
 'charles',
 'taylor',
 'lawyer',
 'storms',
 'war',
 'crimes',
 'trial',
 'http',
 'www',
 'bbc',
 'co',
 'uk',
 'news',
 'world',
 'africa']

In [56]:
df['token'][404]

['come',
 'have',
 'heart',
 'save',
 'our',
 'homes',
 'no',
 'cuts',
 'to',
 'hud',
 'monday',
 'february',
 'pm',
 'pm',
 'help',
 'save',
 'http',
 'fb',
 'me',
 'kdjvjr']

In [57]:
df['token'][28]


['yay', 'one', 'favorite', 'episodes', 'boy', 'meets', 'world']

In [58]:
df['token'][22452]

['poll',
 'finds',
 'oppose',
 'new',
 'airport',
 'security',
 'measures',
 'money',
 'company',
 'los',
 'angeles',
 'times',
 'http',
 'latimesblogs',
 'http',
 'pi',
 'pe',
 'rwkc']

### Step 2: Indexing

In [59]:


# key, [(doc, count)]
# key [doc]


def create_index (data):
        index = defaultdict(list)
        count=0
        for i, tokens in enumerate(data):
            for token in tokens:
                index[token].append(i)
                #index[save].append(doc)
        return index



def count_occurrences(arr):
    return list(Counter(arr).items())

#    for d in dictionary:
#        return list(Counter(dictionary[d]).items())

def final_index(index):
    final=defaultdict(list)
    for key in index.keys():
        final[key] = count_occurrences(index[key])
    return final

# each token (save, bbc) as your dict key
# append the count_occurrences based on index
#final_index creates the final hash table for inverted index. {word: [(doc#, number of occurrences in that document)]}
# doc# in this case is the line number of the table, not the message ID.
    
    

In [60]:
test=create_index(df['token'])
test2=count_occurrences([0, 1, 44, 44, 443])
#test2
test3=final_index(test)
#test3

In [101]:
print(list(test3.keys())[0:100]) # get a sample of 100 tokens from vocab

51065
['save', 'bbc', 'world', 'service', 'savage', 'cuts', 'http', 'www', 'petitionbuzz', 'com', 'petitions', 'savews', 'lot', 'people', 'always', 'make', 'fun', 'end', 'question', 'is', 'are', 'u', 'ready', 'for', 'it', 'rethink', 'group', 'positive', 'outlook', 'technology', 'staffing', 'specialist', 'expects', 'revenues', 'marg', 'bit', 'ly', 'hfjtmy', 'zombie', 'fund', 'manager', 'phoenix', 'appoints', 'new', 'ceo', 'buys', 'funds', 'closed', 'business', 'dxrlh', 'latest', 'top', 'releases', 'globalclassified', 'net', 'cdt', 'presents', 'alice', 'in', 'wonderland', 'catonsville', 'dinner', 'posted', 'the', 'fb', 'me', 'gmicayt', 'territory', 'location', 'calgary', 'alberta', 'canada', 'job', 'category', 'bu', 'eomt', 'jobs', 'news', 'today', 'free', 'school', 'funding', 'plans', 'lack', 'transparency', 'co', 'uk', 'hi', 'newsid_', 'stm', 'manchester', 'city', 'council', 'details', 'saving', 'plan', 'fypypc', 'depressing', 'apparently', 'we']


### Step 3: Retrieval and Ranking
#### 3.1. Vectorization of the corpus and extraction of the vocabulary

In [61]:
# creating tf-idfs matrix
corpus = df['detoken']


# use X[doc_id, term_id] to get the td-idf for the term and document.
docVectorizer = TfidfVectorizer()
docMatrix = docVectorizer.fit_transform(corpus)

# getting the vocabulary (set of terms in the corpus)
docVocabulary = docVectorizer.vocabulary_
# use vocabulary.get('some_term') which would return None if the word doesn't exist

docMatrixArr = docMatrix.toarray()

# print(vocabulary)
#print(df['detoken'].shape)



In [108]:
print(len(docVocabulary)) # size of vocab

50953


#### 3.2. Function to vectorize a query using the document vocabulary and queries vectorization

The function below returns an array of len = # queries. Each element in the array has a len = # of terms in the corpus and tf-idf values for each term in the query only terms that are both in the query and the corpus will have tf-idf values since the vocabulary of the corpus is being used for the vectorization of the query. In the assignment, we have a total of 50953 terms that will be used as part of the vocabulary. 

In [62]:
# functions for query processing
def transform_query (q, voc):
    queryVectorizer = TfidfVectorizer(vocabulary = voc)
    qMatrix = queryVectorizer.fit_transform(q)
    qMatrixArr = qMatrix.toarray()
    return qMatrixArr[0]

#sample_query = 'BBC World Service staff cuts'
#sample_query_transformation = transform_query([sample_query], docVocabulary)
#print(len(sample_query_transformation)) #should return 50953 which is the total number of terms

In [63]:
# queries vectorization
#queriesDf['title'][0]

queriesVectArr = []

for i in range (len(queriesDf['title'])):
    currQuery = queriesDf['title'][i]
    currQueryVect = transform_query([currQuery], docVocabulary)
    queriesVectArr.append(currQueryVect)


In [64]:
# print(len(queriesVectArr[0])) # this should return 50953

#### 3.3. Calculating similarity 

Similarity between each query and the list of documents 
The results will be an array of 49 array, each array will be of len = # documents.

In [65]:
testQueriesMatrix = transform_query([queriesDf['title'][0]], docVocabulary)
similarityMatrix = pairwise.cosine_similarity(queriesVectArr, docMatrixArr)

In [66]:
# len(similarityMatrix[0])
# similarityMatrix[0][0]
# type(similarityMatrix)  # numpy.ndarray

#### 3.4. Ranking results
Building an 2D - array containing the top 1000 documents for each query

In [67]:
def get_top_results(simArr):
    top_1000 = [] # array of doc ids of top 1000 results
    simArr2D = [] # creating a 2d array of type [[doc_id, similarity value],...]
    
    for i in range(len(simArr)):
        simArr2D.append([i + 1, simArr[i]])
        
    # print("simArr2D")
    # print(simArr2D)
    
    simArr2DSorted = sorted(simArr2D, key = lambda x : x[1], reverse = True)
    
    #print("simArr2D sorted")
    #print(simArr2DSorted)
    
    for i in range(1000):
        top_1000.append(simArr2DSorted[i])
    
    return top_1000

In [68]:
# test
# print(get_top_results(similarityMatrix[0]))

In [69]:
resultsMatrix = []
for i in range(len(similarityMatrix)):
    currQueryMatrix = similarityMatrix[i]
    currQueryTopResults = get_top_results(currQueryMatrix)
    resultsMatrix.append(currQueryTopResults)

In [70]:
# test
# print(len(resultsMatrix[47]))
# print(resultsMatrix[0])
# print(resultsMatrix[3])
# print(resultsMatrix[7])
# print(resultsMatrix[43])
# print(len(resultsMatrix))
# print(len(resultsMatrix[0]))
# print(len(resultsMatrix[0][0]))

### 4 Writing results to file


In [71]:
# sample line: topic_id Q0 docno rank score tag
# topic_id: queriesDf['topic_id'][i]
# docno: df['ID'][resultsMatrix[i][j][0]]
# rank: j from resultsMatrix[i], +1 because highest rank is 1 not 0
# score: resultsMatrix[i][j][1]
# tag: passed in at function call

    
def write_results(tag, resultMatrix):
    
    f = open("results.txt", "w+")
    
    for i in range(len(resultMatrix)):
        topic_id = queriesDf['topic_id'][i][2:].lstrip('0') #[2:].lstrip('0') to strip "MB" and leading 0s to end up with plain number
        for j in range(len(resultMatrix[i])):
            docno = df['ID'][resultsMatrix[i][j][0]]
            rank = j +1
            score = resultsMatrix[i][j][1]
            new_line = topic_id + " Q0 " + str(docno) + " " + str(rank) + " " + str(score) + " " + tag + "\n"
            f.write(new_line)
        
    f.close()
    

In [72]:
# test
write_results("hello", resultsMatrix)
