In [1]:
%load_ext autoreload
%autoreload 2
from utils.data import load_data, get_word_tokenized_corpus, get_data_property, get_data_chunks
from utils.embeddings import train_fasttext_embedding, get_chunk_embeddings, save_fasttext, load_fasttext
from utils.features import get_speed, get_volume, get_circuitousness
from main import setup_chunk_embeddings, setup

In [2]:
%reload_ext autoreload

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
import glob
import numpy as np
import pandas as pd
from os.path import exists
from sklearn.linear_model import Lasso

In [5]:
# Configuration for the "train / full setup" run.
class Args:
    """Plain attribute bag handed to ``setup`` and ``setup_chunk_embeddings``.

    Only the values are defined here; how each one is interpreted is decided
    by the project code that receives the instance.
    """

    # --- mode -------------------------------------------------------------
    # NOTE(review): training is switched on while ``model_name`` below points
    # at what looks like a pretrained binary (cc.en.300.bin) -- confirm the
    # setup code does not overwrite that file.
    train_model = True

    # --- paths ------------------------------------------------------------
    # Previously used alternative: 'fasttext_model/ft_model.model'
    model_name = 'fasttext_model/cc.en.300.bin'
    data_file = 'data/dblp-ref-0.json'
    # The literal string 'none', not the None object.
    chunk_embs_file = 'none'
    proj_dir = './saved/'

    # --- sizes ------------------------------------------------------------
    # presumably a cap on how many records are read -- TODO confirm in utils.data
    limit = 30000
    # presumably the number of chunks per abstract -- TODO confirm in utils.data
    T = 20

# Instantiate the settings object consumed by the project entry points.
args = Args()

# setup() hands back three values; keep the tuple, then unpack it in order.
setup_outputs = setup(args)
ft_model, abstracts, citation_counts = setup_outputs

# The chunk-embedding step receives the settings plus the model and the
# abstracts obtained from setup() above.
chunk_embs = setup_chunk_embeddings(
    args,
    ft_model,
    abstracts,
)

Loading data...
Loading JSON...


KeyboardInterrupt: 

In [7]:
# Configuration for the "load precomputed chunk embeddings" run.
# NOTE(review): this re-declares ``Args``; once this cell runs it replaces any
# ``Args`` class defined by an earlier cell.
class Args:
    """Plain attribute bag passed to the project loaders (no training)."""

    # --- mode -------------------------------------------------------------
    train_model = False

    # --- paths ------------------------------------------------------------
    model_name = 'fasttext_model/cc.en.300.bin'
    data_file = 'data/dblp-ref-0.json'
    chunk_embs_file = 'data/chunk_embs.txt'
    proj_dir = './saved/'

    # --- sizes ------------------------------------------------------------
    # presumably a cap on how many records are read -- TODO confirm in utils.data
    limit = 30000
    # presumably the number of chunks per abstract -- TODO confirm in utils.data
    T = 20


args = Args()

# Set to True when the chunk embeddings come from the MATLAB export;
# leave False to obtain them through setup_chunk_embeddings instead.
from_MATLAB = True

In [8]:
# Obtain ``chunk_embs`` either from the project loader or from the MATLAB export.
if not from_MATLAB:
    # No model / abstracts are supplied on this path, only the settings object.
    chunk_embs = setup_chunk_embeddings(args, None, None)
else:
    # One comma-separated line per document; missing cells become 0 and the
    # result is a plain 2-D float array (``.values``), not a DataFrame.
    # NOTE(review): the path is hard-coded here rather than taken from ``args``.
    df = (
        pd.read_table('./saved/data/toubia_embs.txt', dtype=float, header=None, sep=',')
        .fillna(0)
        .values
    )
    # For every row: strip trailing zeros, view the remainder as 300 rows,
    # then transpose so the 300-long axis comes last.
    # (300 presumably matches the embedding width -- TODO confirm.)
    # NOTE(review): because padding was turned into 0 above, trim_zeros cannot
    # distinguish padding from genuine trailing zeros in the data; if a row
    # really ends in 0.0 the reshape will fail or drop values.
    chunk_embs = [
        np.trim_zeros(flat_row, 'b').reshape(300, -1).T
        for flat_row in df
    ]

In [32]:
# Load the dataset described by ``args``, then extract the two per-record
# properties used later ("abstract" first, "n_citation" second).
data = load_data(args)
abstracts, citation_counts = (
    get_data_property(data, field_name)
    for field_name in ("abstract", "n_citation")
)

In [10]:
# One list per feature, each with one entry per element of ``chunk_embs``.
# Entries are evaluated in the order written: speed, circuitousness, volume.
features = {
    # get_speed's result is indexed with [-1], i.e. only its last element is kept.
    'speed': [get_speed(emb)[-1] for emb in chunk_embs],
    'circuitousness': [get_circuitousness(emb) for emb in chunk_embs],
    'volume': [get_volume(emb) for emb in chunk_embs],
}

6.503984034351809
6.23665184074833
9.910414260139179
6.2875674865171565
1.4498403530923658
2.844508326158584
9.983229989926391
4.869660371268122
7.637076523917768
6.0618912684537145
6.0446629550795645
7.581659318456401
2.413717217397353
6.943354223747686
1.8671236236130038
3.647544647971237
6.928315770636235
4.288761175573236
1.5578933704627944
2.444685206062989
8.027385931935006
6.689785917119248
7.790798969596832
2.303196199770037
6.395569095918045
9.225376366512378
8.105619896972549
5.525770072812049
12.712920024433325
11.72548074204176
5.076189899257072
8.314291155043069
6.3534880887857765
10.029037393679033
10.26964216376402
3.5141989860245495
2.174912738217759
2.5597374430158966
7.10897021682157
7.190712403461959
2.107720686560286
10.125905373547992
14.146427859248252
9.700319955382964
6.921689688365714
1.6853077470839062
7.423995922489123
6.229226219490023
3.929885284334852
9.375731332475738
1.8063273531371191
6.520260190664519
5.665203403079996
8.882784063917423
5.0059507472389

In [34]:
# Fit one single-variable Lasso per feature and print its coefficient.
#
# In each fit the citation count is the (only) regressor and the feature is the
# response, so the printed coefficient is "change in feature per citation".
# NOTE(review): confirm this direction is intended rather than feature -> citations.
print('Getting coefficients...')

# Flatten every series to a 1-D float vector once, up front.
feature_vectors = {
    key: np.asarray(value, dtype=float).ravel()
    for key, value in features.items()
}
counts = np.asarray(citation_counts, dtype=float).ravel()

# Fail early, with a readable message, if a feature is not aligned
# one-to-one with the citation counts.
for key, vector in feature_vectors.items():
    if vector.shape[0] != counts.shape[0]:
        raise ValueError(
            f'feature {key!r} has {vector.shape[0]} samples but there are '
            f'{counts.shape[0]} citation counts'
        )

# A sample is dropped when ANY series is NaN/inf for it. Every fit therefore
# sees the same rows, and no feature can hand a NaN to Lasso.fit. (Previously
# only NaNs in features['volume'] were removed, and the mask was rebuilt on
# every loop iteration.)
invalid = ~np.isfinite(counts)
for vector in feature_vectors.values():
    invalid |= ~np.isfinite(vector)
nan_vals = np.argwhere(invalid)

# The design matrix is the same for every feature, so build it once.
X = np.delete(counts, nan_vals).reshape(-1, 1)

for key, vector in feature_vectors.items():
    clf = Lasso(alpha=0.1)
    clf.fit(X, np.delete(vector, nan_vals))
    print(f'{key} coeff {clf.coef_}')

Getting coefficients...
speed coeff [-0.00063021]
volume coeff [-0.00010053]


Getting coefficients...
speed coeff [-0.00064022]
volume coeff [-0.00021089]