## Preparation

#### Library

In [1]:
import os, sys, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Functions

In [2]:
from wordvector import corpus_by_date
from wordvector import clean_corpus
from wordvector import get_co_occurrence_matrix
from wordvector import reduce_dimension
from wordvector import get_k_vectors
from wordvector import get_v_vectors

## Vectorization

#### build a corpus

In [3]:
# Read every monthly headline file for 2013-2022 and accumulate three parallel
# lists: tokenized headlines, the frame's index values (used as dates), and the
# category labels.
corpus, date, category = [], [], []
for y in range(2013, 2023):
    for m in range(1, 13):
        # One CSV per month, named headlines/hl_<year>_<month>.csv; the first
        # column is loaded as the index.
        filename = os.path.join('headlines', 'hl_{}_{}.csv'.format(y, m))
        hl = pd.read_csv(filename, index_col=0)
        # Whitespace-tokenize each headline before appending it.
        corpus += [headline.split() for headline in hl["headline"].values]
        date += list(hl.index.values)
        category += list(hl["category"].values)
# Rebind `corpus` to whatever corpus_by_date builds from the tokenized headlines
# and their dates; later cells consume this value.
corpus = corpus_by_date(corpus, date)

#### co-occurrence-based vectorization

In [4]:
# Build the co-occurrence matrix M (plus the companion `word2ind` lookup) from the
# cleaned corpus.
# NOTE(review): the literal 5 is presumably the context-window size, and `word2ind`
# presumably maps each word to its row/column in M — confirm against
# wordvector.get_co_occurrence_matrix.
M, word2ind = get_co_occurrence_matrix(clean_corpus(corpus), 5)

  0%|          | 0/30754 [00:00<?, ?it/s]

In [63]:
# Pass M through reduce_dimension with k=100. The kvec frame displayed later in
# this notebook has columns 0-99, which is consistent with 100 output dimensions.
M_reduced = reduce_dimension(M, k=100)

In [64]:
# Compute the co-occurrence-based vectors from the corpus, the reduced matrix and
# the word2ind lookup.
# NOTE(review): this call receives the un-cleaned `corpus`, whereas M and word2ind
# were built from clean_corpus(corpus) — confirm that get_k_vectors cleans its
# input itself or tolerates tokens that are missing from word2ind.
kvec = get_k_vectors(corpus,M_reduced,word2ind)

  0%|          | 0/3652 [00:00<?, ?it/s]

In [65]:
# Display kvec as the cell result; the rendered table is abbreviated with "..."
# for the middle rows and columns.
kvec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.322307,0.075486,0.107992,0.029978,0.063987,0.044809,0.093137,0.073999,0.075459,0.00562,...,-0.001862,0.000127,-0.000446,0.000317,-0.000118,0.000157,0.000622,0.000003,0.000297,0.001887
1,0.31977,0.094476,0.09413,0.032062,0.057834,0.056586,0.089619,0.078469,0.071903,0.000915,...,-0.001182,-0.001033,-0.000894,0.000477,-0.001596,0.000925,0.000308,-0.00039,-0.000138,0.000593
2,0.325721,0.084546,0.086369,0.020996,0.060622,0.049983,0.08133,0.069404,0.065841,0.002757,...,-0.001189,-0.000968,-0.001401,0.000094,0.00008,0.000428,0.000059,-0.000941,-0.000148,0.001056
3,0.323075,0.076732,0.087429,0.028626,0.061418,0.052007,0.097761,0.077029,0.072735,0.000638,...,-0.001378,-0.000162,-0.000956,-0.000063,-0.001379,0.000906,0.000307,-0.000486,-0.000631,0.000543
4,0.290161,0.155054,0.070703,0.065815,0.069663,0.013538,0.043947,0.056959,0.05207,0.023681,...,-0.005118,-0.001206,0.003754,0.001107,-0.001687,0.003612,0.004441,-0.001252,0.000177,-0.000677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3647,0.328773,0.092181,0.096453,0.044858,0.074654,0.056142,0.103139,0.081078,0.076867,0.004752,...,-0.003886,-0.000525,0.0003,0.000381,-0.000025,0.000582,-0.000267,-0.000139,0.000803,0.00069
3648,0.347113,0.062727,0.091905,0.04101,0.052441,0.039182,0.101356,0.087111,0.083052,0.003426,...,-0.002048,-0.000698,-0.000999,0.000496,0.000497,0.000487,-0.00077,-0.00007,0.000046,0.000971
3649,0.354379,0.068304,0.091738,0.027881,0.059632,0.041099,0.108892,0.090114,0.077761,0.000912,...,-0.002454,-0.000418,-0.000349,-0.000185,-0.00079,0.000923,0.000383,-0.000567,0.000271,0.001497
3650,0.33829,0.070188,0.108124,0.038913,0.061469,0.045194,0.103952,0.088586,0.091542,0.004145,...,-0.002478,-0.00082,-0.000967,-0.000032,0.000625,0.0011,0.000422,0.000022,0.000886,0.001091


#### meaning-based vectorization

In [67]:
# Compute the meaning-based vectors for the same `corpus` object used for kvec.
# The vvec frame displayed in the next cell has columns 0-767.
vvec = get_v_vectors(corpus)

  0%|          | 0/3652 [00:00<?, ?it/s]

In [68]:
# Display vvec as the cell result; the rendered table is abbreviated with "..."
# for the middle rows and columns.
vvec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.144677,-0.178601,0.291130,0.080154,0.240732,0.000800,-0.099553,0.435326,-0.106868,-0.287485,...,-0.124826,-0.019366,0.125622,-0.128861,0.092092,-0.227611,-0.166696,0.016910,0.179582,-0.310120
1,-0.130094,-0.178177,0.236397,0.069069,0.271413,0.042298,-0.091184,0.388516,-0.057323,-0.158027,...,-0.070028,0.009132,0.135722,-0.145362,0.132197,-0.272009,-0.197425,-0.004172,0.098190,-0.173345
2,-0.156511,-0.183585,0.286810,0.074090,0.191668,0.086729,-0.107352,0.476699,-0.035141,-0.199644,...,-0.149571,-0.072721,0.124153,-0.158085,0.066159,-0.195173,-0.151533,-0.008131,0.208380,-0.236797
3,-0.119239,-0.161327,0.307250,0.064817,0.298752,0.017856,-0.049865,0.438223,-0.083116,-0.269743,...,-0.103645,0.001200,0.100431,-0.202385,0.078297,-0.205482,-0.172604,0.067207,0.173215,-0.164071
4,0.077108,-0.173336,0.211704,0.161706,0.220347,-0.192786,-0.131736,0.463637,-0.110008,-0.045480,...,0.137187,-0.064712,-0.048394,-0.412224,0.017646,-0.063779,-0.107611,-0.007679,0.141222,-0.191499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3647,-0.022872,-0.157255,0.193855,0.116880,0.295192,-0.029666,-0.057453,0.413164,-0.085900,-0.207291,...,-0.119553,0.089206,0.010060,-0.187933,0.107351,-0.110899,-0.041868,0.024566,0.114257,-0.160379
3648,-0.086713,-0.099205,0.291241,0.099464,0.277648,-0.051381,-0.096855,0.376678,-0.049691,-0.088312,...,-0.077161,0.115946,0.103897,-0.218030,0.172115,-0.179683,-0.058983,0.047482,0.129600,-0.098417
3649,-0.118584,-0.119206,0.249241,0.126262,0.321993,-0.100844,-0.077940,0.390332,-0.053299,-0.158032,...,-0.144494,0.105612,0.023059,-0.140309,0.120257,-0.196900,-0.041192,0.112067,0.177539,-0.117014
3650,-0.147468,-0.179974,0.246522,0.128213,0.286303,-0.031809,-0.068339,0.395862,-0.054966,-0.040738,...,-0.056348,0.085529,0.093920,-0.185504,0.123886,-0.211807,-0.060331,-0.016238,0.139593,-0.092242


#### save results

In [71]:
# Write the co-occurrence-based vectors to kvec.csv (relative path, so it lands in
# the kernel's current working directory). An existing file is overwritten.
kvec.to_csv("kvec.csv")

In [72]:
# Write the meaning-based vectors to vvec.csv (relative path, so it lands in the
# kernel's current working directory). An existing file is overwritten.
vvec.to_csv("vvec.csv")

In [73]:
# Save the distinct dates to date.csv.
# A bare set has no defined iteration order (and for strings it can change between
# kernel sessions because of hash randomization), so the rows of date.csv would be
# written in an arbitrary order. Sorting makes the file deterministic.
# NOTE(review): this assumes the date values sort chronologically (e.g. ISO
# "YYYY-MM-DD" strings) and that corpus_by_date orders the rows of kvec/vvec the
# same way — confirm against wordvector.corpus_by_date before joining on position.
pd.DataFrame(sorted(set(date))).to_csv("date.csv")