# Contextually Propagated Term Weights for Document Representation

## word2vec

In [23]:
import gensim

# Location of the pretrained vectors, stored in the binary word2vec format.
word2vec_path = 'data/GoogleNews-vectors-negative300.bin.gz'
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
)

## Load Reuters training/test data

In [2]:
# Read the raw training split; every element of train_file is one unparsed
# line of the file (trailing newline included).
with open('data/reuters/r8-train-all-terms.txt') as f:
    train_file = list(f)

In [3]:
# Parse the training split: the first whitespace-separated token of each line
# is the class label, the remaining tokens are re-joined with single spaces to
# form the document text.
train_labels = []
train_data = []
for raw_line in train_file:
    tokens = raw_line.split()
    train_labels.append(tokens[0])
    train_data.append(" ".join(tokens[1:]))

In [4]:
# Read the raw test split; every element of test_file is one unparsed line of
# the file (trailing newline included).
with open('data/reuters/r8-test-all-terms.txt') as f:
    test_file = list(f)

In [5]:
# Parse the test split the same way as the training split: first token is the
# class label, the rest of the line (whitespace-normalized) is the document.
test_labels = []
test_data = []
for raw_line in test_file:
    tokens = raw_line.split()
    test_labels.append(tokens[0])
    test_data.append(" ".join(tokens[1:]))

## TF-IDF baseline

In [9]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# --- Featurization -----------------------------------------------------------
# Term counts first, then TF-IDF re-weighting. Both the vocabulary and the IDF
# statistics are learned from the training documents only; the test documents
# are merely projected through the already-fitted transformers.
cv = CountVectorizer()
tfidf = TfidfTransformer()

x_train = cv.fit_transform(train_data)
train_tfidf = tfidf.fit_transform(x_train)

x_test = cv.transform(test_data)
x_test_tfidf = tfidf.transform(x_test)

# --- 1-nearest-neighbour search ----------------------------------------------
# Index the training vectors (default distance metric) and, for every test
# document, look up the index of its single closest training document.
knn = NearestNeighbors(n_neighbors=1).fit(train_tfidf)
results = knn.kneighbors(x_test_tfidf, return_distance=False)

### TF-IDF Scores

In [10]:
from sklearn.metrics import f1_score

# `results` holds one row per test document, and each row contains the index of
# that document's single nearest training document (the search was run with
# n_neighbors=1). Pick element 0 of the row explicitly instead of calling int()
# on the whole row: converting an array with ndim > 0 to a Python scalar is
# deprecated since NumPy 1.25 and is slated to become an error.
pred_labels = [train_labels[neighbors[0]] for neighbors in results]

# The predicted label of a test document is the label of its nearest training
# document; report micro- and macro-averaged F1 against the gold test labels.
print("micro:", f1_score(test_labels, pred_labels, average="micro"))
print("macro:", f1_score(test_labels, pred_labels, average="macro"))

micro: 0.840109639104614
macro: 0.7862565745939916


## CPTW