# CSI 4107, Winter 2021
## Assignment 1 - Microblog information retrieval system

### Step 1: Preprocessing

In [34]:
import pandas as pd
import numpy as np
import math
import pandas_read_xml as pdx
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise

In [35]:
queriesDf = pdx.read_xml("queries.xml", ['queries', 'top'])
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432


In [36]:
#read tweets and transform into dataframe
data = []
with open('Trec_microblog11.txt', 'r', encoding='utf-8', errors='replace') as infile:
    lines = infile.readlines()
    for i in lines:
        data.append(i.split('\t'))
        
data = np.array(data) #2d numpy array
        
df = pd.DataFrame({'ID': data[:, 0], 'text': data[:, 1]})
df.head()

Unnamed: 0,ID,text
0,﻿34952194402811904,Save BBC World Service from Savage Cuts http:/...
1,34952186328784896,a lot of people always make fun about the end ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...
4,34952008683229185,Latest:: Top World Releases http://globalclass...


In [37]:
stop = stopwords.words('english')

In [38]:
df['text_without_stopwords'] = df['text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
df['text_without_stopwords']=df['text_without_stopwords'].apply(str.lower)

queriesDf['text_without_stopwords'] = queriesDf['title'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')
queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].apply(str.lower)

  df['text_without_stopwords'] = df['text_without_stopwords'].str.replace('\d+', '')
  queriesDf['text_without_stopwords'] = queriesDf['text_without_stopwords'].str.replace('\d+', '')


In [39]:
queriesDf['topic_id'] = queriesDf['num'].apply(lambda x : x.split(' '))
queriesDf['topic_id'] = queriesDf['topic_id'].apply(lambda x: x[1])

In [40]:
#queriesDf.head()
#df.head()

In [41]:
tokenizer = RegexpTokenizer(r'\w+')
df['token'] = df['text_without_stopwords'].apply(tokenizer.tokenize)
queriesDf['token'] = queriesDf['text_without_stopwords'].apply(tokenizer.tokenize)

In [42]:
# detokenize for easier query/document vectorization
df['detoken']=df['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))
queriesDf['detoken']=queriesDf['token'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))

In [43]:
queriesDf.head()

Unnamed: 0,num,title,querytime,querytweettime,text_without_stopwords,topic_id,token,detoken
0,Number: MB001,BBC World Service staff cuts,Tue Feb 08 12:30:27 +0000 2011,34952194402811904,bbc world service staff cuts,MB001,"[bbc, world, service, staff, cuts]",bbc world service staff cuts
1,Number: MB002,2022 FIFA soccer,Tue Feb 08 18:51:44 +0000 2011,35048150574039040,fifa soccer,MB002,"[fifa, soccer]",fifa soccer
2,Number: MB003,Haiti Aristide return,Tue Feb 08 21:32:13 +0000 2011,35088534306033665,haiti aristide return,MB003,"[haiti, aristide, return]",haiti aristide return
3,Number: MB004,Mexico drug war,Wed Feb 02 17:22:14 +0000 2011,32851298193768448,mexico drug war,MB004,"[mexico, drug, war]",mexico drug war
4,Number: MB005,NIST computer security,Fri Feb 04 17:44:09 +0000 2011,33581589627666432,nist computer security,MB005,"[nist, computer, security]",nist computer security


In [44]:
df.head()


Unnamed: 0,ID,text,text_without_stopwords,token,detoken
0,﻿34952194402811904,Save BBC World Service from Savage Cuts http:/...,save bbc world service savage cuts http://www....,"[save, bbc, world, service, savage, cuts, http...",save bbc world service savage cuts http www pe...
1,34952186328784896,a lot of people always make fun about the end ...,lot people always make fun end world question ...,"[lot, people, always, make, fun, end, world, q...",lot people always make fun end world question ...
2,34952041415581696,ReThink Group positive in outlook: Technology ...,rethink group positive outlook: technology sta...,"[rethink, group, positive, outlook, technology...",rethink group positive outlook technology staf...
3,34952018120409088,'Zombie' fund manager Phoenix appoints new CEO...,'zombie' fund manager phoenix appoints new ceo...,"[zombie, fund, manager, phoenix, appoints, new...",zombie fund manager phoenix appoints new ceo p...
4,34952008683229185,Latest:: Top World Releases http://globalclass...,latest:: top world releases http://globalclass...,"[latest, top, world, releases, http, globalcla...",latest top world releases http globalclassifie...


In [45]:

#df['token'][26]#test to see if numbers have been removed
print(df['token'][12898])

['australia', 's', 'greatest', 'olympian', 'ian', 'thorpe', 'expected', 'reportedly', 'announce', 'comeback', 'international', 'swim', 'http', 'tf', 'to', 'fmg']


### Step 2: Indexing

In [46]:


# key, [(doc, count)]
# key [doc]


def create_index (data):
        index = defaultdict(list)
        count=0
        for i, tokens in enumerate(data):
            for token in tokens:
                index[token].append(i)
                #index[save].append(doc)
        return index



def count_occurrences(arr):
    return list(Counter(arr).items())

#    for d in dictionary:
#        return list(Counter(dictionary[d]).items())

def final_index(index):
    final=defaultdict(list)
    for key in index.keys():
        final[key] = count_occurrences(index[key])
    return final

# each token (save, bbc) as your dict key
# append the count_occurrences based on index
#final_index creates the final hash table for inverted index. {word: [(doc#, number of occurrences in that document)]}
# doc# in this case is the line number of the table, not the message ID.
    
    

In [47]:
test=create_index(df['token'])

test2=count_occurrences([0, 1, 44, 44, 443])
#test2
test3=final_index(test)
#test3

print(len(test3.keys()))
# print(test["thorpe"])
# for row_num in test["thorpe"]:
#     print(df['detoken'][row_num])

87685


In [48]:
#print(list(test3.keys())[0:100]) # get a sample of 100 tokens from vocab

## Step 3: Retrieval and Ranking

In [49]:
# query: list of tokens for a query
# docs: inverted hash index with key:value of token:[(doc_num,num_occurances,...)]
def related_documents_to_query(query, docs):
    related_docs = []
    for token in query:
        if token in docs:
            related_docs+=docs[token]

    nums_only = []
    for x in related_docs:
        nums_only.append(x[0])

    return set(nums_only)
                    
    
##create td-idf matrix
# creating tf-idfs matrix
corpus = df['detoken']
# use X[doc_id, term_id] to get the td-idf for the term and document.
docVectorizer = TfidfVectorizer()
docMatrix = docVectorizer.fit_transform(corpus)
docVocabulary = docVectorizer.vocabulary_
#docMatrix.toarray()


#find cosine similarity
#returns an array of (doc_num, similarity)
def similarity_cosine(query, docs):
    sim_arr = []
    qvec = docVectorizer.transform([query])
    
    for doc in docs: 
        similarity = pairwise.cosine_similarity(doc[1], qvec)
        sim_arr.append((doc[0], similarity[0][0]))
    return sim_arr 


  
    

In [None]:
#print(docMatrix.toarray())

##for each query, get set of related docs (doc #, num of occurences)
final_results = []
for index, q_row in queriesDf.iterrows():
    
    related_docs = related_documents_to_query(q_row['token'], test3)
    
    related_docs_matrix = []
    #get vectors of related docs
    for doc_num in related_docs:
        related_docs_matrix.append((doc_num, docMatrix[doc_num]) )
        
    # get similarities
    sim_arr = similarity_cosine(q_row['detoken'], related_docs_matrix)
    ranked = sorted(sim_arr, key = lambda x : x[1], reverse = True)
    
    # rank top 1000
    top_1000 = ranked[0:1000]
    #print(top_1000)
    #print(df['detoken'][top_1000[0][0]])
    
    # add to final results array
    final_results.append( (q_row['num'], top_1000) )

## Step 4

In [50]:
# sample line: topic_id Q0 docno rank score tag
# topic_id: queriesDf['topic_id'][i]
# docno: df['ID'][resultsMatrix[i][j][0]]
# rank: j from resultsMatrix[i], +1 because highest rank is 1 not 0
# score: resultsMatrix[i][j][1]
# tag: passed in at function call

def write_results(tag, resultMatrix):
    
    f = open("results1.txt", "w+", encoding="utf-8")
    
    for i in range(len(resultMatrix)):
        topic_id = queriesDf['topic_id'][i][2:].lstrip('0') #[2:].lstrip('0') to strip "MB" and leading 0s to end up with plain number

        for j in range(len(resultMatrix[i][1])): #array of 1000 docs in tuples (doc_row, similarity)
            
            docno = df['ID'][resultMatrix[i][1][j][0]]
            rank = j +1
            score = resultMatrix[i][1][j][1]
            new_line = topic_id + " Q0 " + str(docno) + " " + str(rank) + " " + str(score) + " " + tag + "\n"
            f.write(new_line)
        
    f.close()
    

In [None]:
write_results('STANDARD',final_results)

In [None]:
return

### Step 3: Retrieval and Ranking
#### 3.1. Vectorization of the corpus and extraction of the vocabulary

#### 3.2. Function to vectorize a query using the document vocabulary and queries vectorization

The function below returns an array of len = # queries. Each element in the array has a len = # of terms in the corpus and tf-idf values for each term in the query only terms that are both in the query and the corpus will have tf-idf values since the vocabulary of the corpus is being used for the vectorization of the query. In the assignment, we have a total of 50953 terms that will be used as part of the vocabulary. 

In [None]:
# functions for query processing
def transform_query (q, voc):
    queryVectorizer = TfidfVectorizer(vocabulary = voc)
    qMatrix = queryVectorizer.fit_transform(q)
    qMatrixArr = qMatrix.toarray()
    return qMatrixArr[0]

#sample_query = 'BBC World Service staff cuts'
#sample_query_transformation = transform_query([sample_query], docVocabulary)
#print(len(sample_query_transformation)) #should return 50953 which is the total number of terms

In [None]:
# queries vectorization
#queriesDf['title'][0]

queriesVectArr = []

for i in range (len(queriesDf['title'])):
    currQuery = queriesDf['title'][i]
    currQueryVect = transform_query([currQuery], docVocabulary)
    queriesVectArr.append(currQueryVect)


In [None]:
# print(len(queriesVectArr[0])) # this should return 50953

In [None]:
testQueriesMatrix = transform_query([queriesDf['title'][0]], docVocabulary)
similarityMatrix = pairwise.cosine_similarity(queriesVectArr, docMatrixArr)

In [None]:
# len(similarityMatrix[0])
# similarityMatrix[0][0]
# type(similarityMatrix)  # numpy.ndarray