In [1]:
import pandas as pd
import numpy as np

In [2]:
# Index columns of the CORPUS table; prefixes of this list are used as "bags".
OHCO = [
    'doc_source',
    'doc_id',
    'sent_num',
    'token_num',
]

In [3]:
# Library table: one row per doc_id, with doc_date parsed to datetimes.
LIB = (
    pd.read_csv('LIB.csv')
    .set_index('doc_id')
    .assign(doc_date=lambda lib: pd.to_datetime(lib['doc_date']))
)

# Token table indexed by the full OHCO hierarchy.
CORPUS = pd.read_csv('CORPUS.csv').set_index(keys=OHCO)

# Vocabulary table keyed by term string.
VOCAB = pd.read_csv('VOCAB.csv').set_index(keys='term_str')

In [4]:
def create_bow(CORPUS, bag, item_type='term_str'):
    """Count how often each item occurs in each bag.

    Parameters
    ----------
    CORPUS : pandas.DataFrame
        Token table. Must contain the column `item_type`; every name in
        `bag` must be resolvable by `groupby` (a column or an index level).
    bag : str or sequence of str
        Grouping key(s) that define a bag. A single name, a list, a tuple
        or any other iterable of names is accepted.
    item_type : str, default 'term_str'
        Column whose values are counted within each bag.

    Returns
    -------
    pandas.DataFrame
        One column `n` holding the counts, indexed by the bag key(s)
        followed by `item_type`.
    """
    # Normalize `bag` so that `bag + [item_type]` no longer fails for tuples,
    # Index objects or a bare string.
    bag_keys = [bag] if isinstance(bag, str) else list(bag)
    group_keys = bag_keys + [item_type]
    BOW = CORPUS.groupby(group_keys)[item_type].count().to_frame('n')
    return BOW

In [5]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute TF-IDF and DF-IDF from a bag-of-words table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Must have a count column `n` and a MultiIndex; the last index level
        is unstacked into columns, giving a bag-by-item count matrix.
    tf_method : {'sum', 'max', 'log', 'raw', 'bool'}, default 'max'
        'sum'  - count divided by the bag's total count
        'max'  - count divided by the bag's largest count
        'log'  - log2(count + 1)
        'raw'  - the count itself
        'bool' - 1 if the item occurs in the bag, else 0
    df_method : {'standard', 'textbook', 'sklearn', 'sklearn_smooth'}, default 'standard'
        'standard'       - log2(N / DF)
        'textbook'       - log2(N / (DF + 1))
        'sklearn'        - log2(N / DF) + 1
        'sklearn_smooth' - log2((N + 1) / (DF + 1)) + 1
    item_type : str, default 'term_str'
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    TFIDF : pandas.DataFrame
        Bag-by-item matrix of TF * IDF, with missing cells filled with 0.
    DFIDF : pandas.Series
        DF * IDF per item.

    Raises
    ------
    ValueError
        If `tf_method` or `df_method` is not one of the names above.
    """
    # Bag-by-item count matrix; pairs absent from BOW become NaN.
    DTCM = BOW.n.unstack()

    # True where the item really occurs in the bag. NaN > 0 is False, so this
    # is correct for NaN-padded matrices and for explicit zero counts alike.
    present = DTCM.gt(0)

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(DTCM + 1)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        # Do not use DTCM.astype('bool'): NaN casts to True, which would mark
        # every absent item as present.
        TF = present.astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    # Document frequency: number of bags in which each item occurs.
    DF = present.sum()
    N_docs = len(DTCM)

    if df_method == 'standard':
        IDF = np.log2(N_docs / DF)
    elif df_method == 'textbook':
        IDF = np.log2(N_docs / (DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs / DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1) / (DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")

    # IDF is indexed by item and broadcasts across the columns of TF.
    TFIDF = (TF * IDF).fillna(0)
    DFIDF = DF * IDF

    return TFIDF, DFIDF

In [6]:
# Bag = the first OHCO level only, i.e. one bag per doc_source.
bag = OHCO[0:1]

In [7]:
# Term counts per bag.
BOW = create_bow(CORPUS, bag, item_type='term_str')

In [8]:
# Preview the bag-of-words table built in the previous cell.
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n
doc_source,term_str,Unnamed: 2_level_1
Breitbart,1,2
Breitbart,10,3
Breitbart,100,5
Breitbart,1000,2
Breitbart,100000,1
...,...,...
US News,zurich,6
US News,zuzana,1
US News,zverev,1
US News,zverevmolkino,1


In [9]:
# Same as the function defaults, spelled out so the weighting scheme is visible here.
TFIDF, DFIDF = get_tfidf(BOW, tf_method='max', df_method='standard')

In [10]:
# Document-term count matrix; pairs missing from BOW are written as 0.
DTM = BOW['n'].unstack(fill_value=0)

In [11]:
# Preview the zero-filled document-term count matrix.
DTM

term_str,0,00,000,0000,00001,001,007,01,01202017,02,...,zurawik,zurich,zurowski,zuzana,zverev,zverevmolkino,zycher,zygmunt,zz,zzzwall
doc_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Breitbart,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CNN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Daily Kos,1,0,0,20,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
Drudge Report,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Fox,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Google News,9,1,3,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Guardian,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NPR,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
New York Times,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Politico Magazine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Preview the TF-IDF matrix returned by get_tfidf.
TFIDF

term_str,0,00,000,0000,00001,001,007,01,01202017,02,...,zurawik,zurich,zurowski,zuzana,zverev,zverevmolkino,zycher,zygmunt,zz,zzzwall
doc_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Breitbart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CNN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Daily Kos,0.000147,0.0,0.0,0.004315,0.000363,0.0,0.0,0.000177,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000363,0.0
Drudge Report,0.000403,0.0,0.0,0.0,0.0,0.0,0.00059,0.000485,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000498,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google News,0.002321,0.000473,0.001907,0.0,0.0,0.0,0.000378,0.0,0.000636,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000636
Guardian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NPR,0.000689,0.0,0.0,0.0,0.0,0.0,0.001009,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
New York Times,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Politico Magazine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Column-wise mean of TFIDF (one value per term), aligned to VOCAB by term_str.
VOCAB['mean_tfidf'] = TFIDF.mean(axis=0)

In [14]:
# Settings for the reduced vocabulary selected in the next cell.
n_terms = 1000          # number of top-ranked terms to keep
vocab_filter = 'dfidf'  # VOCAB column the terms are ranked by
# Open categories with no proper nouns
pos_list = "NN NNS VB VBD VBG VBN VBP VBZ JJ JJR JJS RB RBR RBS".split()

In [15]:
# Terms whose max_pos is in pos_list, ranked by the vocab_filter column
# (highest first) and cut off at n_terms.
# NOTE(review): this relies on VOCAB already having a `dfidf` column; the DFIDF
# computed in this notebook is never written to VOCAB - confirm that is intended.
VIDX = (
    VOCAB[VOCAB['max_pos'].isin(pos_list)]
    .sort_values(by=vocab_filter, ascending=False)
    .iloc[:n_terms]
    .index
)

In [16]:
# Keep only the selected terms, then average per doc_source. MUST FILLNA
M = (
    TFIDF.loc[:, VIDX]
    .fillna(0)
    .groupby('doc_source')
    .mean()
)

In [17]:
# Preview the TF-IDF matrix restricted to the VIDX terms.
M

term_str,climbed,crush,kinds,slice,backtoback,compassion,slaves,tunnel,tunnels,slashing,...,ceremonies,contractor,mirrors,resigns,decisive,resilience,mourning,misery,fitting,violates
doc_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Breitbart,0.001345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001345,0.0,0.0,0.0
CNN,0.0,0.000741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000371,0.0,0.000741,0.000371
Daily Kos,0.000123,0.0,0.001106,0.000246,0.0,0.000369,0.000246,0.0,0.0,0.0,...,0.0,0.000614,0.000123,0.000123,0.000123,0.000123,0.000123,0.0,0.000246,0.000614
Drudge Report,0.0,0.001681,0.0,0.0,0.0,0.0,0.0,0.000672,0.001008,0.000336,...,0.0,0.0,0.0,0.001344,0.0,0.000672,0.0,0.000336,0.0,0.0
Fox,0.00069,0.000345,0.000345,0.0,0.000345,0.000345,0.000345,0.0,0.00069,0.000345,...,0.001035,0.0,0.0,0.0,0.0,0.0,0.00207,0.0,0.0,0.0
Google News,0.000215,0.001505,0.000215,0.000215,0.000215,0.0,0.0,0.0,0.000215,0.000215,...,0.000215,0.00043,0.0,0.004946,0.000215,0.0,0.000215,0.00043,0.000215,0.00043
Guardian,0.0,0.0,0.0,0.000672,0.0,0.000336,0.001008,0.000336,0.000336,0.0,...,0.000336,0.0,0.000336,0.000672,0.0,0.0,0.0,0.000672,0.0,0.001008
NPR,0.0,0.0,0.0,0.000575,0.000575,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000575,0.0,0.0,0.0,0.0,0.0,0.0,0.0
New York Times,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000735,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000735,0.0,0.0,0.0,0.0,0.0
Politico Magazine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006643,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
import re 
from numpy.linalg import norm
from scipy.spatial.distance import pdist, squareform

In [19]:
# Scale every row of M to unit Euclidean length: compute each row's norm,
# then divide that row by it.
TFIDF_L2 = M.div(M.apply(norm, axis=1), axis=0)

In [20]:
# Preview the row-normalized matrix.
TFIDF_L2

term_str,climbed,crush,kinds,slice,backtoback,compassion,slaves,tunnel,tunnels,slashing,...,ceremonies,contractor,mirrors,resigns,decisive,resilience,mourning,misery,fitting,violates
doc_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Breitbart,0.068199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.068199,0.0,0.0,0.0
CNN,0.0,0.084895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.042448,0.0,0.084895,0.042448
Daily Kos,0.011479,0.0,0.103312,0.022958,0.0,0.034437,0.022958,0.0,0.0,0.0,...,0.0,0.057395,0.011479,0.011479,0.011479,0.011479,0.011479,0.0,0.022958,0.057395
Drudge Report,0.0,0.097889,0.0,0.0,0.0,0.0,0.0,0.039156,0.058733,0.019578,...,0.0,0.0,0.0,0.078311,0.0,0.039156,0.0,0.019578,0.0,0.0
Fox,0.046274,0.023137,0.023137,0.0,0.023137,0.023137,0.023137,0.0,0.046274,0.023137,...,0.069412,0.0,0.0,0.0,0.0,0.0,0.138823,0.0,0.0,0.0
Google News,0.008298,0.058086,0.008298,0.008298,0.008298,0.0,0.0,0.0,0.008298,0.008298,...,0.008298,0.016596,0.0,0.190853,0.008298,0.0,0.008298,0.016596,0.008298,0.016596
Guardian,0.0,0.0,0.0,0.063758,0.0,0.031879,0.095637,0.031879,0.031879,0.0,...,0.031879,0.0,0.031879,0.063758,0.0,0.0,0.0,0.063758,0.0,0.095637
NPR,0.0,0.0,0.0,0.046474,0.046474,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.046474,0.0,0.0,0.0,0.0,0.0,0.0,0.0
New York Times,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068519,0.0,0.0,...,0.0,0.0,0.0,0.0,0.068519,0.0,0.0,0.0,0.0,0.0
Politico Magazine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.182574,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Attach each (doc_source, term_str) pair's TF-IDF value to BOW.
# The stacked TFIDF series shares BOW's index levels, so the column is aligned
# on the index. Unlike a self-merge, `assign` overwrites an existing `tfidf`
# column, so re-running this cell does not create `tfidf_x` / `tfidf_y`.
BOW = BOW.assign(tfidf=TFIDF.stack())

In [22]:
# Preview BOW with the added tfidf column.
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf
doc_source,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1
Breitbart,1,2,0.000203
Breitbart,10,3,0.000000
Breitbart,100,5,0.000506
Breitbart,1000,2,0.000655
Breitbart,100000,1,0.000455
...,...,...,...
US News,zurich,6,0.000659
US News,zuzana,1,0.000148
US News,zverev,1,0.000148
US News,zverevmolkino,1,0.000148


In [23]:
# Write each result table to a CSV file in the working directory.
for _frame, _path in [
    (BOW, 'BOW.csv'),
    (DTM, 'DTM.csv'),
    (TFIDF, 'TFIDF.csv'),
    (TFIDF_L2, 'TFIDF_L2.csv'),
]:
    _frame.to_csv(_path)