# Contextually Propagated Term Weights for Document Representation

In [1]:
import gensim
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

## word2vec

In [2]:
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

## Load reuters training/test data

In [3]:
def read_data(file):
    data = []
    labels = []
    for line in file:
        splitted = line.split()
        labels.append(splitted[0])
        words = " ".join(splitted[1:])
        data.append(words)
    return data,labels

In [4]:
with open('data/reuters/r8-train-all-terms.txt') as f:
    train_file = f.readlines()

In [5]:
# load training data
train_data,train_labels = read_data(train_file)
train_data = pd.concat([pd.DataFrame(train_data, columns=['text']), 
                        pd.DataFrame(train_labels, columns=['label'])], 
                       axis=1)
train_data.head()

Unnamed: 0,text,label
0,champion products ch approves stock split cham...,earn
1,computer terminal systems cpml completes sale ...,acq
2,cobanco inc cbco year net shr cts vs dlrs net ...,earn
3,am international inc am nd qtr jan oper shr lo...,earn
4,brown forman inc bfd th qtr net shr one dlr vs...,earn


In [6]:
with open('data/reuters/r8-test-all-terms.txt') as f:
    test_file = f.readlines()

In [7]:
#loading test data
test_data, test_labels = read_data(test_file)
test_data = pd.concat([pd.DataFrame(test_data, columns=['text']), 
                        pd.DataFrame(test_labels, columns=['label'])], 
                       axis=1)
test_data.head()

Unnamed: 0,text,label
0,asian exporters fear damage from u s japan rif...,trade
1,china daily says vermin eat pct grain stocks a...,grain
2,australian foreign ship ban ends but nsw ports...,ship
3,sumitomo bank aims at quick recovery from merg...,acq
4,amatil proposes two for five bonus share issue...,earn


In [8]:
# merge test / training data
all_data = pd.concat([train_data,test_data],axis=0)
all_data.head()

Unnamed: 0,text,label
0,champion products ch approves stock split cham...,earn
1,computer terminal systems cpml completes sale ...,acq
2,cobanco inc cbco year net shr cts vs dlrs net ...,earn
3,am international inc am nd qtr jan oper shr lo...,earn
4,brown forman inc bfd th qtr net shr one dlr vs...,earn


## TF-IDF baseline

In [9]:
def tf_idf(train,test):
    cv = CountVectorizer()
    x_train = cv.fit_transform(train)
    tfidf = TfidfTransformer() 
    train_tfidf = tfidf.fit_transform(x_train)
    x_test = cv.transform(test)
    x_test_tfidf = tfidf.transform(x_test)
    return (train_tfidf,x_test_tfidf)

## CPTW

In [10]:
all_words = [w for row in all_data['text'].values for w in row.split() if w in model.vocab]
unique_words = {words for words in all_words}
len(unique_words)

15587

In [11]:
model.init_sims()
def unique_words_model(model, unique_words):
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    new_vectors_norm = []

    for idx in range(len(model.vocab)):
        word = model.index2entity[idx]
        vec = model.vectors[idx]
        vocab = model.vocab[word]
        vec_norm = model.vectors_norm[idx]
        if word in unique_words:
            vocab.index = len(new_index2entity)
            new_index2entity.append(word)
            new_vocab[word] = vocab
            new_vectors.append(vec)
            new_vectors_norm.append(vec_norm)

    model.vocab = new_vocab
    model.vectors = np.array(new_vectors)
    model.index2entity = np.array(new_index2entity)
    model.index2word = np.array(new_index2entity)
    model.vectors_norm = np.array(new_vectors_norm)

In [12]:
unique_words_model(model, unique_words)

In [13]:
def similarity_matrix():
    return cosine_similarity(model.vectors)

In [14]:
def cptw(document, similarities, t = 0.8):
    result = np.zeros(len(model.vocab))
    for idx in range(len(model.vocab)):
        word = model.index2entity[idx]
        # frq would be 0 so cptw would also be 0
        if word in document.split():
            df = pd.DataFrame(similarities[idx], columns=["cossim"])
            df = df[df["cossim"] > t] # get all words which are more similar then threshold
            df['word'] = df.apply(lambda row: model.index2entity[row.name], axis=1)
            df['gamma'] = df.apply(lambda row: document.split().count(row.word)*row.cossim, axis=1)
            gamma = df['gamma'].sum()
            cos_sims = df['cossim'].sum()
            alpha_j = 1 / cos_sims
            result[idx] = alpha_j * gamma
        else:
            result[idx] = 0.0
    return result

# Experiments

In [15]:
# split 80% training, 20% test
labels = all_data['label'].values
texts = all_data['text'].values
X_data_train, X_data_test, y_data_train, y_data_test = train_test_split(texts, labels, test_size=0.20)

In [16]:
similarities = similarity_matrix()

In [17]:
k_parameters = list(range(1,20))

## TF-IDF experiments

In [18]:
# TF IDF
kf = KFold(n_splits=5, shuffle=True)
X = X_data_train
y = y_data_train
kf.get_n_splits(X)
tf_idf_scores = []
for train_index, test_index in kf.split(X):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for k in k_parameters:
            train, test = tf_idf(X_train,X_test)
            clf = KNeighborsClassifier(k,weights='distance',n_jobs=30)
            clf.fit(train, y_train)

            tf_idf_scores.append((accuracy_score(y_test, clf.predict(test)),k))

In [19]:
sorted_tf_idf_scores = sorted(tf_idf_scores, key=lambda tup: tup[0], reverse=True)
_, best_k = sorted_tf_idf_scores[0]

In [20]:
# run on whole train and then test_data
train, test = tf_idf(X_data_train,X_data_test)
clf = KNeighborsClassifier(best_k,weights='distance',n_jobs=30)
clf.fit(train, y_data_train)
print("macro:", f1_score(y_data_test, clf.predict(test), average='macro'))
print("micro:",f1_score(y_data_test, clf.predict(test), average='micro'))
print("accuracy:",accuracy_score(y_data_test, clf.predict(test)))

macro: 0.8449271132860058
micro: 0.9263843648208469
accuracy: 0.9263843648208469


## CPTW experiments

In [21]:
k_parameters = list(range(1,3))
tau_parameters = [0.8,0.9]

In [22]:
kf = KFold(n_splits=5, shuffle=True)
X = X_data_train[:30]
y = y_data_train[:30]
kf.get_n_splits(X)
cptw_scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    for k in k_parameters:
        for t in tau_parameters:
            print(t,k)
            train_scores = []
            for document in X_train:
                train_scores.append(cptw(document, similarities,t=t))

            test_scores = []
            for document in X_test:
                test_scores.append(cptw(document, similarities,t=t))

            clf = KNeighborsClassifier(k,weights='distance',n_jobs=40)
            clf.fit(train_scores, y_train)
            acc = accuracy_score(y_test, clf.predict(test_scores))
            cptw_scores.append((acc,k,t))  

0.8 1
0.9 1
0.8 2
0.9 2
0.8 1
0.9 1
0.8 2
0.9 2
0.8 1
0.9 1
0.8 2
0.9 2
0.8 1
0.9 1
0.8 2
0.9 2
0.8 1
0.9 1
0.8 2
0.9 2


In [23]:
sorted_cptw_scores = sorted(cptw_scores, key=lambda tup: tup[0], reverse=True)
_, best_k_cp, best_t_cp = sorted_cptw_scores[0]

In [27]:
train_scores = []
for document in X_data_train[:100]:
    train_scores.append(cptw(document, similarities,t=best_t_cp))

test_scores = []
for document in X_data_test[:20]:
    test_scores.append(cptw(document, similarities,t=best_t_cp))

clf = KNeighborsClassifier(best_k_cp,weights='distance',n_jobs=40)
clf.fit(train_scores, y_data_train[:100])

print("accuracy:", accuracy_score(y_data_test[:20], clf.predict(test_scores)))
print("macro:", f1_score(y_data_test[:20], clf.predict(test_scores), average='macro'))
print("micro:",f1_score(y_data_test[:20], clf.predict(test_scores), average='micro'))

accuracy: 0.65
macro: 0.22979797979797978
micro: 0.65


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
