In [2]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

### Loading data

In [7]:
corpus = []
for d in range(1400):
    f = open("./Indexing/cranfield/d/" + str(d + 1) + ".txt")
    corpus.append(f.read())

In [8]:
queries = []
for q in range(1, 225, 1):
    f = open("./Indexing/cranfield/q/" + str(q) + ".txt")
    queries.append(f.read())

In [9]:
references = []
for r in range(1, 225, 1):
    f = open("./Indexing/cranfield/r/" + str(q) + ".txt")
    references.append([int(x) for x in f.read().split('\n')[:-1]])

In [10]:
references[0]

[656, 1313, 1317, 1316, 1318, 1319, 1157, 1274, 1286]

In [11]:
for r in references:
    if len(r) != 9:
        print(r.count('\n')) # => all references contain 9 values

## TF-IDF

In [3]:
tfidf_vectorizer = TfidfVectorizer()

#### Cosine meassure

In [57]:
precision_list, recall_list, f_measure_list = [], [], []
for i in range(len(queries)):
    query = queries[i]
    reference = references[i]
    
    corpus.append(query)
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    tfidf_sim = np.array(cosine_similarity(tfidf_matrix[len(corpus) - 1], tfidf_matrix[0:(len(corpus) - 1)])[0])
    tfidf_topRelevant = tfidf_sim.argsort()[-9:][::-1] + 1
    
    precision = precision_score(reference, tfidf_topRelevant, average='micro')
    precision_list.append(precision)
    recall = recall_score(reference, tfidf_topRelevant, average='micro')
    recall_list.append(recall)
    f_measure = f1_score(reference, tfidf_topRelevant, average='micro')
    f_measure_list.append(f_measure)
    
    corpus.pop()

In [60]:
tfidf_cos_precision = sum(precision_list)/len(precision_list)
tfidf_cos_recall = sum(recall_list)/len(recall_list)
tfidf_cos_f_measure = sum(f_measure_list)/len(f_measure_list)

In [61]:
print('precision: ', tfidf_cos_precision ,', recall: ', tfidf_cos_recall, ', f-measure: ' ,tfidf_cos_f_measure)

precision:  0.0014880952381 , recall:  0.0014880952381 , f-measure:  0.0014880952381


#### Euclidian distance

In [63]:
precision_list, recall_list, f_measure_list = [], [], []
for i in range(len(queries)):
    query = queries[i]
    reference = references[i]
    
    corpus.append(query)
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    tfidf_sim = np.array(euclidean_distances(tfidf_matrix[len(corpus) - 1], tfidf_matrix[0:(len(corpus) - 1)])[0])
    tfidf_topRelevant = tfidf_sim.argsort()[-9:][::-1] + 1
    
    precision = precision_score(reference, tfidf_topRelevant, average='micro')
    precision_list.append(precision)
    recall = recall_score(reference, tfidf_topRelevant, average='micro')
    recall_list.append(recall)
    f_measure = f1_score(reference, tfidf_topRelevant, average='micro')
    f_measure_list.append(f_measure)
    
    corpus.pop()

In [64]:
tfidf_eukl_precision = sum(precision_list)/len(precision_list)
tfidf_eukl_recall = sum(recall_list)/len(recall_list)
tfidf_eukl_f_measure = sum(f_measure_list)/len(f_measure_list)

In [65]:
print('precision: ', tfidf_eukl_precision ,', recall: ', tfidf_eukl_recall, ', f-measure: ' ,tfidf_eukl_f_measure)

precision:  0.0 , recall:  0.0 , f-measure:  0.0


### Term Frequency

In [5]:
count_vectorizer = CountVectorizer()

#### Cosine meassure

In [12]:
precision_list, recall_list, f_measure_list = [], [], []
for i in range(len(queries)):
    query = queries[i]
    reference = references[i]
    
    corpus.append(query)
    
    count_matrix = count_vectorizer.fit_transform(corpus)
    count_sim = np.array(cosine_similarity(count_matrix[len(corpus) - 1], count_matrix[0:(len(corpus) - 1)])[0])
    count_topRelevant = count_sim.argsort()[-9:][::-1] + 1
    
    precision = precision_score(reference, count_topRelevant, average='micro')
    precision_list.append(precision)
    recall = recall_score(reference, count_topRelevant, average='micro')
    recall_list.append(recall)
    f_measure = f1_score(reference, count_topRelevant, average='micro')
    f_measure_list.append(f_measure)
    
    corpus.pop()

In [13]:
count_cos_precision = sum(precision_list)/len(precision_list)
count_cos_recall = sum(recall_list)/len(recall_list)
count_cos_f_measure = sum(f_measure_list)/len(f_measure_list)

In [14]:
print('precision: ', count_cos_precision ,', recall: ', count_cos_recall, ', f-measure: ' ,count_cos_f_measure)

precision:  0.0 , recall:  0.0 , f-measure:  0.0


#### Euclidian distance

In [15]:
precision_list, recall_list, f_measure_list = [], [], []
for i in range(len(queries)):
    query = queries[i]
    reference = references[i]
    
    corpus.append(query)
    
    count_matrix = count_vectorizer.fit_transform(corpus)
    count_sim = np.array(euclidean_distances(count_matrix[len(corpus) - 1], count_matrix[0:(len(corpus) - 1)])[0])
    count_topRelevant = count_sim.argsort()[-9:][::-1] + 1
    
    precision = precision_score(reference, count_topRelevant, average='micro')
    precision_list.append(precision)
    recall = recall_score(reference, count_topRelevant, average='micro')
    recall_list.append(recall)
    f_measure = f1_score(reference, count_topRelevant, average='micro')
    f_measure_list.append(f_measure)
    
    corpus.pop()

In [16]:
count_cos_precision = sum(precision_list)/len(precision_list)
count_cos_recall = sum(recall_list)/len(recall_list)
count_cos_f_measure = sum(f_measure_list)/len(f_measure_list)

In [17]:
print('precision: ', count_cos_precision ,', recall: ', count_cos_recall, ', f-measure: ' ,count_cos_f_measure)

precision:  0.111111111111 , recall:  0.111111111111 , f-measure:  0.111111111111


### Binary representation

#### Cosine meassure

In [None]:
precision_list, recall_list, f_measure_list = [], [], []
for i in range(len(queries)):
    query = queries[i]
    reference = references[i]
    
    corpus.append(query)

    count_matrix = count_vectorizer.fit_transform(corpus)
    binary_matrix = [[True if x>0 else False for x in row] for row in count_matrix.toarray()]
    binary_sim = np.array(cosine_similarity(binary_matrix[len(corpus) - 1], binary_matrix[0:(len(corpus) - 1)])[0])
    binary_topRelevant = binary_sim.argsort()[-9:][::-1] + 1
    
    precision = precision_score(reference, binary_topRelevant, average='micro')
    precision_list.append(precision)
    recall = recall_score(reference, binary_topRelevant, average='micro')
    recall_list.append(recall)
    f_measure = f1_score(reference, binary_topRelevant, average='micro')
    f_measure_list.append(f_measure)
    
    corpus.pop()



In [None]:
binary_cos_precision = sum(precision_list)/len(precision_list)
binary_cos_recall = sum(recall_list)/len(recall_list)
binary_cos_f_measure = sum(f_measure_list)/len(f_measure_list)

In [None]:
print('precision: ', binary_cos_precision ,', recall: ', binary_cos_recall, ', f-measure: ' ,binary_cos_f_measure)

#### Euclidian distance

In [None]:
precision_list, recall_list, f_measure_list = [], [], []
for i in range(len(queries)):
    query = queries[i]
    reference = references[i]
    
    corpus.append(query)

    count_matrix = count_vectorizer.fit_transform(corpus)
    binary_matrix = [[True if x>0 else False for x in row] for row in count_matrix.toarray()]
    binary_sim = np.array(euclidean_distances(binary_matrix[len(corpus) - 1], binary_matrix[0:(len(corpus) - 1)])[0])
    binary_topRelevant = binary_sim.argsort()[-9:][::-1] + 1
    
    precision = precision_score(reference, binary_topRelevant, average='micro')
    precision_list.append(precision)
    recall = recall_score(reference, binary_topRelevant, average='micro')
    recall_list.append(recall)
    f_measure = f1_score(reference, binary_topRelevant, average='micro')
    f_measure_list.append(f_measure)
    
    corpus.pop()

In [None]:
binary_cos_precision = sum(precision_list)/len(precision_list)
binary_cos_recall = sum(recall_list)/len(recall_list)
binary_cos_f_measure = sum(f_measure_list)/len(f_measure_list)

In [None]:
print('precision: ', binary_cos_precision ,', recall: ', binary_cos_recall, ', f-measure: ' ,binary_cos_f_measure)