In [1]:
import nltk
import numpy as np
import pandas as pd
import re

In [2]:
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


1) Data Processing

In [3]:
# a) remove digits, punctuation and making all lowercase in df
reg_for_digits = r'\d+'
reg_for_punctuation = r'[^\w\s]'
df['text'] = df['text'].replace(reg_for_digits, '',regex=True)
df['text'] = df['text'].replace(reg_for_punctuation, '',regex=True)
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# b) tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vardanarakelyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vardanarakelyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vardanarakelyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vardanarakelyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
df['text'] = df['text'].apply(word_tokenize)

stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['text'] = df['text'].apply(remove_stopwords)

In [6]:
df.head()

Unnamed: 0,category,text
0,tech,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,"[worldcom, boss, left, books, alone, former, w..."
2,sport,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twe..."


Implement Bag of Words

In [7]:
# let's cut our df and take first 30 rows
new_df = df.iloc[:30].copy()
new_df.head()

Unnamed: 0,category,text
0,tech,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,"[worldcom, boss, left, books, alone, former, w..."
2,sport,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twe..."


In [8]:
new_df.shape

(30, 2)

In [9]:
# a) create vocabulary from the training set
word_list = []
for doc in new_df['text']:
    for word in doc:
        word_list.append(word)
vocabulary = set(word_list)

In [10]:
# we create here word : index pair to navigate in the future, enumerate helps with it
word_index = {word: i for i, word in enumerate(vocabulary)}
print(word_index)



In [11]:
# b) bow vector implementation
def bow_vector(tokens, word_index):
    vector = np.zeros(len(word_index))
    for token in tokens:
        if token in word_index:
            vector[word_index[token]] += 1
    return vector
# args bow vector adds second argument to function input, in pandas when we use apply, it automatically uses data as first argument, and using args (X,) we sent as tuple second argument
new_df['bow_vector'] = new_df['text'].apply(bow_vector, args=(word_index,))
new_df.head()


Unnamed: 0,category,text,bow_vector
0,tech,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, ..."
1,business,"[worldcom, boss, left, books, alone, former, w...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,sport,"[tigers, wary, farrell, gamble, leicester, say...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twe...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


TF-IDF

In [12]:
# implement tf
def calc_tf(bow_vector):
    total_words = bow_vector.sum()
    tf = bow_vector / total_words
    return tf
new_df['tf'] = new_df['bow_vector'].apply(calc_tf)
new_df.head()


Unnamed: 0,category,text,bow_vector,tf
0,tech,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, ...","[0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0..."
1,business,"[worldcom, boss, left, books, alone, former, w...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0053763440860..."
2,sport,"[tigers, wary, farrell, gamble, leicester, say...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.007692307692307693, 0.0, 0.0..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twe...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
def tf_vector_to_dict(tf_vector, index_map):
    """Converts a TF vector into a dictionary of {word: score}."""

    # Use a dictionary comprehension to build the result
    # It iterates through the vector, and for every score > 0,
    # it finds the word and adds it to the dictionary.
    return {
        index_map[index]: score
        for index, score in enumerate(tf_vector)
        if score > 0
    }
index_to_word = {index: word for word, index in word_index.items()}

new_df['tf_scores'] = new_df['tf'].apply(tf_vector_to_dict, args=(index_to_word,))
new_df.head()

Unnamed: 0,category,text,bow_vector,tf,tf_scores
0,tech,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, ...","[0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0...","{'plasma': 0.002457002457002457, 'viewers': 0...."
1,business,"[worldcom, boss, left, books, alone, former, w...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0053763440860...","{'transformed': 0.005376344086021506, 'investo..."
2,sport,"[tigers, wary, farrell, gamble, leicester, say...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.007692307692307693, 0.0, 0.0...","{'decide': 0.007692307692307693, 'progress': 0..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","{'hartlepool': 0.0038022813688212928, 'round':..."
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twe...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","{'five': 0.006060606060606061, 'roberts': 0.01..."


In [14]:
# idf

bow_matrix = np.stack(new_df['bow_vector'].values)
bow_matrix

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
doc_freq_arr = (bow_matrix > 0).sum(axis=0)
doc_freq = {index_to_word[i]: count for i, count in enumerate(doc_freq_arr)}
doc_freq

{'detention': np.int64(1),
 'surface': np.int64(1),
 'plasma': np.int64(1),
 'decide': np.int64(1),
 'unspecified': np.int64(1),
 'based': np.int64(1),
 'transformed': np.int64(2),
 'sixyearold': np.int64(1),
 'viewers': np.int64(1),
 'allow': np.int64(2),
 'overheads': np.int64(1),
 'putting': np.int64(2),
 'lose': np.int64(1),
 'investor': np.int64(1),
 'six': np.int64(4),
 'important': np.int64(4),
 'programmes': np.int64(1),
 'honour': np.int64(1),
 'discriminatory': np.int64(1),
 'hartlepool': np.int64(1),
 'vying': np.int64(1),
 'plays': np.int64(1),
 'weekayear': np.int64(1),
 'behaving': np.int64(1),
 'friday': np.int64(3),
 'round': np.int64(2),
 'consortium': np.int64(2),
 'progress': np.int64(2),
 'reflect': np.int64(1),
 'shot': np.int64(1),
 'co': np.int64(1),
 'promised': np.int64(1),
 'russell': np.int64(1),
 'bbc': np.int64(7),
 'may': np.int64(7),
 'beheaded': np.int64(1),
 'coordinator': np.int64(1),
 'successfully': np.int64(1),
 'political': np.int64(3),
 'firm': np

In [16]:
total_docs = len(new_df)

In [17]:
import math

In [18]:
idf = {}
for word, count in doc_freq.items():
    score = math.log((total_docs / count + 1) + 1)
    idf[word] = score

idf

{'detention': 3.4657359027997265,
 'surface': 3.4657359027997265,
 'plasma': 3.4657359027997265,
 'decide': 3.4657359027997265,
 'unspecified': 3.4657359027997265,
 'based': 3.4657359027997265,
 'transformed': 2.833213344056216,
 'sixyearold': 3.4657359027997265,
 'viewers': 3.4657359027997265,
 'allow': 2.833213344056216,
 'overheads': 3.4657359027997265,
 'putting': 2.833213344056216,
 'lose': 3.4657359027997265,
 'investor': 3.4657359027997265,
 'six': 2.2512917986064953,
 'important': 2.2512917986064953,
 'programmes': 3.4657359027997265,
 'honour': 3.4657359027997265,
 'discriminatory': 3.4657359027997265,
 'hartlepool': 3.4657359027997265,
 'vying': 3.4657359027997265,
 'plays': 3.4657359027997265,
 'weekayear': 3.4657359027997265,
 'behaving': 3.4657359027997265,
 'friday': 2.4849066497880004,
 'round': 2.833213344056216,
 'consortium': 2.833213344056216,
 'progress': 2.833213344056216,
 'reflect': 3.4657359027997265,
 'shot': 3.4657359027997265,
 'co': 3.4657359027997265,
 'pro

In [19]:
idf_val = np.array(list(idf.values()))

In [20]:
def multiply(tf):
    return np.array(tf) * idf_val
new_df['tf_idf'] = new_df['tf'].apply(multiply)

In [21]:
new_df.head()

Unnamed: 0,category,text,bow_vector,tf,tf_scores,tf_idf
0,tech,"[tv, future, hands, viewers, home, theatre, systems, plasma, highdefinition, tvs, digital, video, recorders, moving, living, room, way, people, watch, tv, radically, different, five, years, time, according, expert, panel, gathered, annual, consumer, electronics, show, las, vegas, discuss, new, technologies, impact, one, favourite, pastimes, us, leading, trend, programmes, content, delivered, viewers, via, home, networks, cable, satellite, telecoms, companies, broadband, service, providers, front, rooms, portable, devices, one, talkedabout, technologies, ces, digital, personal, video, recorders, dvr, pvr, settop, boxes, like, us, tivo, uk, sky, system, allow, people, record, store, play, pause, forward, wind, tv, programmes, want, essentially, technology, allows, much, personalised, tv, also, builtin, ...]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007371007371007371, 0.002457002457002457, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0, 0.002457002457002457, 0.007371007371007371, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0, 0.0, 0.007371007371007371, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002457002457002457, 0.002457002457002457, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004914004914004914, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002457002457002457, 0.0, 0.002457002457002457, 0.0, 0.002457002457002457, 0.0, 0.0, 0.002457002457002457, 0.0, 0.0, 0.002457002457002457, 0.0, 0.0, ...]","{'plasma': 0.002457002457002457, 'viewers': 0.007371007371007371, 'allow': 0.002457002457002457, 'putting': 0.002457002457002457, 'important': 0.002457002457002457, 'programmes': 0.007371007371007371, 'reflect': 0.002457002457002457, 'bbc': 0.007371007371007371, 'firm': 0.002457002457002457, 'five': 0.002457002457002457, 'revenues': 0.002457002457002457, 'windows': 0.002457002457002457, 'capability': 0.004914004914004914, 'lack': 0.002457002457002457, 'viewer': 0.002457002457002457, 'set': 0.002457002457002457, 'stacey': 0.002457002457002457, 'growing': 0.002457002457002457, 'brand': 0.007371007371007371, 'sets': 0.004914004914004914, 'senior': 0.002457002457002457, 'mr': 0.004914004914004914, 'tivo': 0.007371007371007371, 'telecoms': 0.002457002457002457, 'show': 0.007371007371007371, 'alacarte': 0.002457002457002457, 'recorded': 0.002457002457002457, 'talkedabout': 0.002457002457002457, 'japan': 0.002457002457002457, 'uptake': 0.002457002457002457, 'new': 0.007371007371007371, 'group': 0.002457002457002457, 'technologies': 0.007371007371007371, 'electronics': 0.002457002457002457, 'issue': 0.002457002457002457, 'tvwatching': 0.002457002457002457, 'services': 0.002457002457002457, 'impact': 0.002457002457002457, 'network': 0.004914004914004914, 'getting': 0.002457002457002457, 'essentially': 0.002457002457002457, 'hanlon': 0.007371007371007371, 'recorders': 0.004914004914004914, 'together': 0.002457002457002457, 'website': 0.002457002457002457, 'entertainment': 0.002457002457002457, 'consumer': 0.004914004914004914, 'record': 0.002457002457002457, 'guide': 0.002457002457002457, 'portable': 0.002457002457002457, 'jolna': 0.002457002457002457, 'identity': 0.002457002457002457, 'liquid': 0.002457002457002457, 'forget': 0.002457002457002457, 'buttons': 0.002457002457002457, 'control': 0.002457002457002457, 'annual': 0.002457002457002457, 'future': 0.004914004914004914, 'lost': 0.002457002457002457, 'providers': 0.002457002457002457, 'digital': 0.004914004914004914, 'tv': 0.029484029484029485, 'rewind': 0.002457002457002457, 'added': 0.002457002457002457, 'suggested': 0.002457002457002457, 'also': 0.007371007371007371, 'many': 0.004914004914004914, 'tell': 0.002457002457002457, 'delivered': 0.002457002457002457, 'abiding': 0.002457002457002457, 'put': 0.002457002457002457, 'want': 0.0171990171990172, 'radically': 0.002457002457002457, 'reality': 0.002457002457002457, 'see': 0.002457002457002457, 'kind': 0.002457002457002457, 'know': 0.002457002457002457, 'book': 0.002457002457002457, 'programming': 0.002457002457002457, 'system': 0.002457002457002457, 'us': 0.0171990171990172, 'big': 0.002457002457002457, 'everywhere': 0.002457002457002457, 'vegas': 0.002457002457002457, 'personalised': 0.002457002457002457, 'like': 0.004914004914004914, 'pvr': 0.002457002457002457, 'allows': 0.002457002457002457, 'ces': 0.004914004914004914, 'leading': 0.002457002457002457, 'find': 0.004914004914004914, 'taking': 0.002457002457002457, 'already': 0.002457002457002457, 'futurologist': 0.002457002457002457, 'companies': 0.007371007371007371, 'devices': 0.004914004914004914, 'help': 0.002457002457002457, 'advertising': 0.004914004914004914, 'increasing': 0.002457002457002457, 'familiar': 0.002457002457002457, ...}","[0.0, 0.0, 0.008515321628500556, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02554596488550167, 0.006961212147558271, 0.0, 0.006961212147558271, 0.0, 0.0, 0.0, 0.0055314294806056395, 0.02554596488550167, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008515321628500556, 0.0, 0.0, 0.0, 0.0, 0.013549971632896422, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004781106017334922, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004781106017334922, 0.008515321628500556, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008515321628500556, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017030643257001113, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008515321628500556, 0.0, 0.008515321628500556, 0.0, 0.004781106017334922, 0.0, 0.0, 0.008515321628500556, 0.0, 0.0, 0.0055314294806056395, 0.0, 0.0, ...]"
1,business,"[worldcom, boss, left, books, alone, former, worldcom, boss, bernie, ebbers, accused, overseeing, bn, bn, fraud, never, made, accounting, decisions, witness, told, jurors, david, myers, made, comments, questioning, defence, lawyers, arguing, mr, ebbers, responsible, worldcom, problems, phone, company, collapsed, prosecutors, claim, losses, hidden, protect, firm, shares, mr, myers, already, pleaded, guilty, fraud, assisting, prosecutors, monday, defence, lawyer, reid, weingarten, tried, distance, client, allegations, cross, examination, asked, mr, myers, ever, knew, mr, ebbers, make, accounting, decision, aware, mr, myers, replied, ever, know, mr, ebbers, make, accounting, entry, worldcom, books, mr, weingarten, pressed, replied, witness, mr, myers, admitted, ordered, false, accounting, entries, request, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005376344086021506, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005376344086021506, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.010752688172043012, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005376344086021506, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005376344086021506, 0.0, 0.0, 0.0, 0.0, 0.005376344086021506, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","{'transformed': 0.005376344086021506, 'investor': 0.005376344086021506, 'firm': 0.010752688172043012, 'never': 0.005376344086021506, 'distance': 0.005376344086021506, 'ceo': 0.005376344086021506, 'overseeing': 0.005376344086021506, 'jobs': 0.005376344086021506, 'mr': 0.06451612903225806, 'company': 0.005376344086021506, 'telecoms': 0.010752688172043012, 'witness': 0.010752688172043012, 'shareholders': 0.005376344086021506, 'trying': 0.005376344086021506, 'client': 0.005376344086021506, 'myers': 0.026881720430107527, 'boss': 0.016129032258064516, 'scott': 0.005376344086021506, 'two': 0.005376344086021506, 'left': 0.005376344086021506, 'asked': 0.005376344086021506, 'responsible': 0.005376344086021506, 'unknown': 0.005376344086021506, 'lost': 0.010752688172043012, 'monday': 0.005376344086021506, 'last': 0.005376344086021506, 'decisions': 0.005376344086021506, 'books': 0.010752688172043012, 'weingarten': 0.010752688172043012, 'prosecutors': 0.010752688172043012, 'testify': 0.005376344086021506, 'made': 0.010752688172043012, 'trial': 0.010752688172043012, 'aware': 0.005376344086021506, 'lawyer': 0.005376344086021506, 'faces': 0.005376344086021506, 'know': 0.005376344086021506, 'shares': 0.005376344086021506, 'assisting': 0.005376344086021506, 'petered': 0.005376344086021506, 'arguing': 0.005376344086021506, 'sullivan': 0.010752688172043012, 'workers': 0.005376344086021506, 'defence': 0.016129032258064516, 'protect': 0.005376344086021506, 'tried': 0.005376344086021506, 'entries': 0.005376344086021506, 'already': 0.005376344086021506, 'former': 0.016129032258064516, 'david': 0.005376344086021506, 'economist': 0.005376344086021506, 'claim': 0.005376344086021506, 'later': 0.005376344086021506, 'affable': 0.005376344086021506, 'mounted': 0.005376344086021506, 'lawyers': 0.010752688172043012, 'chief': 0.005376344086021506, 'request': 0.005376344086021506, 'reid': 0.005376344086021506, 'however': 0.005376344086021506, 'decision': 0.005376344086021506, 'collapsed': 0.010752688172043012, 'giant': 0.005376344086021506, 'declared': 0.005376344086021506, 'portray': 0.005376344086021506, 'cross': 0.005376344086021506, 'meanwhile': 0.005376344086021506, 'questioning': 0.005376344086021506, 'house': 0.005376344086021506, 'competition': 0.005376344086021506, 'relative': 0.005376344086021506, 'phone': 0.005376344086021506, 'graduate': 0.005376344086021506, 'increased': 0.005376344086021506, 'team': 0.005376344086021506, 'worldcom': 0.043010752688172046, 'replied': 0.010752688172043012, 'bn': 0.021505376344086023, 'alone': 0.005376344086021506, 'problems': 0.010752688172043012, 'cards': 0.005376344086021506, 'expected': 0.005376344086021506, 'fraud': 0.016129032258064516, 'looking': 0.005376344086021506, 'make': 0.010752688172043012, 'allegations': 0.005376344086021506, 'examination': 0.005376344086021506, 'jail': 0.005376344086021506, 'late': 0.005376344086021506, 'losses': 0.005376344086021506, 'firmly': 0.005376344086021506, 'mastermind': 0.005376344086021506, 'pleaded': 0.005376344086021506, 'abilities': 0.005376344086021506, 'finally': 0.005376344086021506, 'boom': 0.005376344086021506, 'entry': 0.005376344086021506, 'pe': 0.005376344086021506, 'paint': 0.005376344086021506, 'admission': 0.005376344086021506, ...}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015232329806753852, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018632988724729713, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02092376504360552, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01210371934734675, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018632988724729713, 0.0, 0.0, 0.0, 0.0, 0.018632988724729713, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]"
2,sport,"[tigers, wary, farrell, gamble, leicester, say, rushed, making, bid, andy, farrell, great, britain, rugby, league, captain, decide, switch, codes, anybody, else, involved, process, still, way, away, going, next, stage, tigers, boss, john, wells, told, bbc, radio, leicester, moment, still, lot, unknowns, andy, farrell, least, medical, situation, whoever, take, going, take, big, big, gamble, farrell, persistent, knee, problems, operation, knee, five, weeks, ago, expected, another, three, months, leicester, saracens, believed, head, list, rugby, union, clubs, interested, signing, farrell, decides, move, man, game, move, across, union, wells, believes, would, better, playing, backs, least, initially, sure, could, make, step, league, union, involved, centre, ...]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.007692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","{'decide': 0.007692307692307693, 'progress': 0.007692307692307693, 'bbc': 0.007692307692307693, 'five': 0.007692307692307693, 'process': 0.007692307692307693, 'situation': 0.007692307692307693, 'bringing': 0.007692307692307693, 'position': 0.007692307692307693, 'row': 0.007692307692307693, 'backs': 0.007692307692307693, 'forwards': 0.007692307692307693, 'bid': 0.007692307692307693, 'boss': 0.007692307692307693, 'skills': 0.007692307692307693, 'use': 0.007692307692307693, 'lot': 0.007692307692307693, 'leicester': 0.023076923076923078, 'interested': 0.007692307692307693, 'struck': 0.007692307692307693, 'codes': 0.007692307692307693, 'readymade': 0.007692307692307693, 'rushed': 0.007692307692307693, 'stage': 0.007692307692307693, 'signing': 0.007692307692307693, 'operation': 0.007692307692307693, 'big': 0.015384615384615385, 'back': 0.007692307692307693, 'union': 0.023076923076923078, 'captain': 0.007692307692307693, 'saracens': 0.007692307692307693, 'think': 0.007692307692307693, 'unknowns': 0.007692307692307693, 'making': 0.007692307692307693, 'rugby': 0.023076923076923078, 'league': 0.023076923076923078, 'cross': 0.007692307692307693, 'away': 0.007692307692307693, 'prefer': 0.007692307692307693, 'club': 0.007692307692307693, 'believes': 0.007692307692307693, 'switch': 0.007692307692307693, 'decides': 0.007692307692307693, 'going': 0.015384615384615385, 'moment': 0.007692307692307693, 'gamble': 0.023076923076923078, 'option': 0.007692307692307693, 'sure': 0.007692307692307693, 'medical': 0.007692307692307693, 'problems': 0.007692307692307693, 'replacement': 0.007692307692307693, 'tigers': 0.015384615384615385, 'three': 0.007692307692307693, 'believed': 0.007692307692307693, 'cost': 0.007692307692307693, 'andy': 0.015384615384615385, 'expected': 0.007692307692307693, 'take': 0.015384615384615385, 'another': 0.007692307692307693, 'divide': 0.007692307692307693, 'head': 0.007692307692307693, 'make': 0.015384615384615385, 'farrell': 0.038461538461538464, 'would': 0.015384615384615385, 'clubs': 0.007692307692307693, 'least': 0.015384615384615385, 'else': 0.007692307692307693, 'within': 0.007692307692307693, 'say': 0.007692307692307693, 'man': 0.007692307692307693, 'still': 0.015384615384615385, 'step': 0.007692307692307693, 'centre': 0.007692307692307693, 'could': 0.007692307692307693, 'way': 0.007692307692307693, 'whoever': 0.007692307692307693, 'wary': 0.007692307692307693, 'next': 0.007692307692307693, 'balance': 0.007692307692307693, 'initially': 0.007692307692307693, 'across': 0.007692307692307693, 'list': 0.007692307692307693, 'said': 0.007692307692307693, 'told': 0.007692307692307693, 'britain': 0.007692307692307693, 'better': 0.007692307692307693, 'playing': 0.007692307692307693, 'move': 0.015384615384615385, 'great': 0.007692307692307693, 'jury': 0.007692307692307693, 'radio': 0.007692307692307693, 'whether': 0.007692307692307693, 'wells': 0.023076923076923078, 'game': 0.007692307692307693, 'persistent': 0.007692307692307693, 'john': 0.007692307692307693, 'england': 0.007692307692307693, 'ago': 0.007692307692307693, 'weeks': 0.007692307692307693, 'involved': 0.015384615384615385, 'anybody': 0.007692307692307693, ...}","[0.0, 0.0, 0.0, 0.026659506944613283, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.021793948800432433, 0.0, 0.0, 0.0, 0.0, 0.0, 0.014140611422022676, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.014968539608117795, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]"
3,sport,"[yeading, face, newcastle, fa, cup, premiership, side, newcastle, united, face, trip, ryman, premier, league, leaders, yeading, fa, cup, third, round, game, arguably, highlight, draw, potential, moneyspinner, nonleague, yeading, beat, slough, second, round, conference, side, exeter, city, knocked, doncaster, saturday, travel, old, trafford, meet, holders, manchester, united, january, arsenal, drawn, home, stoke, chelsea, play, host, scunthorpe, nonleague, side, draw, hinckley, united, held, brentford, goalless, draw, sunday, meet, league, one, leaders, luton, win, replay, martin, allen, team, griffin, park, number, premiership, teams, face, difficult, away, games, championship, sides, weekend, january, thirdplaced, everton, visit, plymouth, liverpool, travel, burnley, crystal, palace, go, sunderland, fulham, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0038022813688212928, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0076045627376425855, 0.0, 0.0, 0.0, 0.0, 0.0038022813688212928, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0038022813688212928, 0.0, 0.0038022813688212928, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0038022813688212928, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0038022813688212928, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","{'hartlepool': 0.0038022813688212928, 'round': 0.0076045627376425855, 'co': 0.0038022813688212928, 'teams': 0.0038022813688212928, 'martin': 0.0038022813688212928, 'tottenham': 0.0038022813688212928, 'chester': 0.0038022813688212928, 'stoke': 0.0076045627376425855, 'doncaster': 0.0038022813688212928, 'nottm': 0.0038022813688212928, 'go': 0.0038022813688212928, 'sunday': 0.0038022813688212928, 'blackburn': 0.0076045627376425855, 'drawn': 0.011406844106463879, 'two': 0.0038022813688212928, 'notts': 0.0038022813688212928, 'brom': 0.0076045627376425855, 'carling': 0.0076045627376425855, 'leicester': 0.0038022813688212928, 'ham': 0.0076045627376425855, 'wolves': 0.0038022813688212928, 'january': 0.011406844106463879, 'sunderland': 0.0076045627376425855, 'exeter': 0.0076045627376425855, 'strugglers': 0.0038022813688212928, 'host': 0.0038022813688212928, 'moneyspinner': 0.0038022813688212928, 'villa': 0.0076045627376425855, 'derby': 0.0038022813688212928, 'watford': 0.0076045627376425855, 'blackpool': 0.0038022813688212928, 'knocked': 0.0038022813688212928, 'ryman': 0.0038022813688212928, 'side': 0.015209125475285171, 'third': 0.0038022813688212928, 'park': 0.0038022813688212928, 'utd': 0.0076045627376425855, 'played': 0.0038022813688212928, 'plymouth': 0.0076045627376425855, 'leaders': 0.0076045627376425855, 'face': 0.015209125475285171, 'saturday': 0.0038022813688212928, 'reading': 0.0038022813688212928, 'already': 0.0038022813688212928, 'middlesbrough': 0.0076045627376425855, 'united': 0.015209125475285171, 'goalless': 0.0038022813688212928, 'end': 0.0038022813688212928, 'allen': 0.0038022813688212928, 'newcastle': 0.011406844106463879, 'keynes': 0.0038022813688212928, 'meet': 0.011406844106463879, 'games': 0.0038022813688212928, 'sides': 0.0038022813688212928, 'trip': 0.0038022813688212928, 'premier': 0.0038022813688212928, 'millwall': 0.0038022813688212928, 'oldham': 0.0038022813688212928, 'league': 0.011406844106463879, 'yeading': 0.015209125475285171, 'cardiff': 0.0076045627376425855, 'v': 0.12167300380228137, 'away': 0.011406844106463879, 'luton': 0.0076045627376425855, 'forest': 0.0038022813688212928, 'second': 0.0038022813688212928, 'draw': 0.011406844106463879, 'fulham': 0.0076045627376425855, 'arguably': 0.0038022813688212928, 'crewe': 0.0038022813688212928, 'hinckleybrentford': 0.0038022813688212928, 'beat': 0.0038022813688212928, 'liverpool': 0.0076045627376425855, 'brighton': 0.0076045627376425855, 'thirdplaced': 0.0038022813688212928, 'griffin': 0.0038022813688212928, 'hinckley': 0.0038022813688212928, 'held': 0.0038022813688212928, 'team': 0.0038022813688212928, 'wigan': 0.0038022813688212928, 'leeds': 0.0038022813688212928, 'holders': 0.0038022813688212928, 'slough': 0.0038022813688212928, 'crystal': 0.0076045627376425855, 'either': 0.0038022813688212928, 'trafford': 0.0038022813688212928, 'semifinalists': 0.0038022813688212928, 'fa': 0.0076045627376425855, 'entertain': 0.0038022813688212928, 'swindon': 0.0038022813688212928, 'conference': 0.0038022813688212928, 'season': 0.0038022813688212928, 'sheff': 0.0038022813688212928, 'preston': 0.0076045627376425855, 'hull': 0.0038022813688212928, 'number': 0.0038022813688212928, 'potential': 0.0038022813688212928, 'aston': 0.0076045627376425855, 'win': 0.0038022813688212928, 'travel': 0.0076045627376425855, ...}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013177703052470444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.021545348624001644, 0.0, 0.0, 0.0, 0.0, 0.013177703052470444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013177703052470444, 0.0, 0.010772674312000822, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013177703052470444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013177703052470444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]"
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twelve, crime, caper, sequel, starring, george, clooney, brad, pitt, julia, roberts, gone, straight, number, one, us, box, office, chart, took, weekend, ticket, sales, according, studio, estimates, sequel, follows, master, criminals, try, pull, three, major, heists, across, europe, knocked, last, week, number, one, national, treasure, third, place, wesley, snipes, blade, trinity, second, taking, rounding, top, five, animated, fable, polar, express, starring, tom, hanks, festive, comedy, christmas, kranks, ocean, twelve, box, office, triumph, marks, fourthbiggest, opening, december, release, us, three, films, lord, rings, trilogy, sequel, narrowly, beat, predecessor, ocean, eleven, took, opening, weekend, total, remake, film, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006060606060606061, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012121212121212121, 0.0, 0.0, 0.0, 0.0, 0.006060606060606061, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006060606060606061, 0.006060606060606061, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","{'five': 0.006060606060606061, 'roberts': 0.012121212121212121, 'fourthbiggest': 0.006060606060606061, 'less': 0.006060606060606061, 'kranks': 0.006060606060606061, 'zetajones': 0.006060606060606061, 'polar': 0.006060606060606061, 'fun': 0.006060606060606061, 'straight': 0.006060606060606061, 'raids': 0.006060606060606061, 'starring': 0.01818181818181818, 'new': 0.006060606060606061, 'clooney': 0.012121212121212121, 'labelling': 0.006060606060606061, 'rings': 0.006060606060606061, 'triumph': 0.006060606060606061, 'los': 0.006060606060606061, 'criminals': 0.006060606060606061, 'eleven': 0.012121212121212121, 'elliott': 0.006060606060606061, 'tom': 0.006060606060606061, 'complimentary': 0.006060606060606061, 'steven': 0.006060606060606061, 'pitt': 0.012121212121212121, 'sinatra': 0.006060606060606061, 'gone': 0.006060606060606061, 'christmas': 0.006060606060606061, 'oscarwinning': 0.006060606060606061, 'reunites': 0.006060606060606061, 'last': 0.006060606060606061, 'trivial': 0.006060606060606061, 'crime': 0.006060606060606061, 'knocked': 0.006060606060606061, 'predecessor': 0.006060606060606061, 'films': 0.006060606060606061, 'remake': 0.006060606060606061, 'bros': 0.006060606060606061, 'third': 0.006060606060606061, 'us': 0.01818181818181818, 'december': 0.006060606060606061, 'taking': 0.006060606060606061, 'caper': 0.006060606060606061, 'total': 0.006060606060606061, 'rat': 0.006060606060606061, 'review': 0.006060606060606061, 'matt': 0.006060606060606061, 'milder': 0.006060606060606061, 'major': 0.006060606060606061, 'soderbergh': 0.012121212121212121, 'allstar': 0.006060606060606061, 'chart': 0.006060606060606061, 'animated': 0.006060606060606061, 'ocean': 0.030303030303030304, 'however': 0.006060606060606061, 'europe': 0.006060606060606061, 'follows': 0.006060606060606061, 'week': 0.006060606060606061, 'movie': 0.006060606060606061, 'office': 0.01818181818181818, 'second': 0.006060606060606061, 'sales': 0.006060606060606061, 'beat': 0.006060606060606061, 'damon': 0.006060606060606061, 'unabashedly': 0.006060606060606061, 'angeles': 0.006060606060606061, 'joins': 0.006060606060606061, 'directed': 0.006060606060606061, 'express': 0.006060606060606061, 'narrowly': 0.006060606060606061, 'place': 0.006060606060606061, 'pull': 0.006060606060606061, 'york': 0.006060606060606061, 'garcia': 0.006060606060606061, 'three': 0.012121212121212121, 'number': 0.012121212121212121, 'marks': 0.006060606060606061, 'andy': 0.006060606060606061, 'good': 0.006060606060606061, 'warner': 0.006060606060606061, 'blade': 0.006060606060606061, 'gould': 0.006060606060606061, 'trinity': 0.006060606060606061, 'snipes': 0.006060606060606061, 'top': 0.006060606060606061, 'fellman': 0.006060606060606061, 'rounding': 0.006060606060606061, 'ticket': 0.006060606060606061, 'master': 0.006060606060606061, 'heists': 0.006060606060606061, 'release': 0.006060606060606061, 'catherine': 0.006060606060606061, 'lord': 0.006060606060606061, 'director': 0.006060606060606061, 'twelve': 0.01818181818181818, 'dubbed': 0.006060606060606061, 'wesley': 0.006060606060606061, 'julia': 0.006060606060606061, 'returns': 0.006060606060606061, 'took': 0.012121212121212121, 'hanks': 0.006060606060606061, ...}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011793394842759474, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04200892003393608, 0.0, 0.0, 0.0, 0.0, 0.02100446001696804, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017170989963977066, 0.02100446001696804, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]"


In [22]:
for category in new_df['category'].unique():

    category_df = new_df[new_df['category'] == category]

    category_tfidf_vectors = np.array(list(category_df['tf_idf']))

    mean_tfidf = category_tfidf_vectors.mean(axis=0)

    # Match the scores with their words
    scores = pd.Series(mean_tfidf, index=vocabulary)

    # Sort the words by score and get the top 10
    top_10 = scores.sort_values(ascending=False).head(10)

    print(top_10)


fbi           0.031507
argonaut      0.027837
mobile        0.027203
virus         0.026256
emails        0.026256
g             0.025271
people        0.023143
attachment    0.021004
technology    0.019108
phones        0.018874
dtype: float64
virgin      0.051134
blue        0.051134
ufj         0.037133
prices      0.033007
card        0.030137
worldcom    0.029813
sumitomo    0.028881
ebbers      0.026086
lg          0.025832
oil         0.025672
dtype: float64
v            0.046854
henman       0.020855
moya         0.017820
injury       0.017304
wilkinson    0.016743
cup          0.016310
farrell      0.014811
cole         0.014811
england      0.014667
club         0.014519
dtype: float64
film         0.060517
dicaprio     0.043322
scholl       0.027413
hill         0.027316
star         0.022559
wars         0.022181
ocean        0.021004
carpenter    0.020487
halloween    0.020487
sequel       0.019962
dtype: float64
hague         0.042728
mr            0.042359
howard        

In [33]:
tf_matrix = np.stack(new_df['tf'].values)
max_tf_per_word = tf_matrix.max(axis=0)

word_scores_df = pd.DataFrame({
    "word": list(vocabulary),
    "max_tf": max_tf_per_word,
    "idf": [idf[w] for w in vocabulary]
})

tf_high = word_scores_df["max_tf"].quantile(0.9)
tf_low = word_scores_df["max_tf"].quantile(0.1)
idf_high = word_scores_df["idf"].quantile(0.9)
idf_low = word_scores_df["idf"].quantile(0.1)

highTF_lowIDF = word_scores_df[
    (word_scores_df["max_tf"] >= tf_high) &
    (word_scores_df["idf"] <= idf_low)
].sort_values(by="max_tf", ascending=False)

lowTF_highIDF = word_scores_df[
    (word_scores_df["max_tf"] <= tf_low) &
    (word_scores_df["idf"] >= idf_high)
].sort_values(by="idf", ascending=False)

print("High TF & Low IDF words")
display(highTF_lowIDF.head(10))

print("Low TF & High IDF ")
display(lowTF_highIDF.head(10))


High TF & Low IDF words:


Unnamed: 0,word,max_tf,idf
137,mr,0.064516,1.609438
2587,film,0.056497,2.079442
1892,people,0.04375,1.673976
39,firm,0.043478,1.94591
821,election,0.041667,2.251292
2339,howard,0.040346,2.484907
1123,club,0.039867,2.484907
2476,technology,0.034884,2.251292
620,shares,0.032787,2.484907
1627,year,0.032787,1.553348



Low TF & High IDF words:


Unnamed: 0,word,max_tf,idf
2,plasma,0.002457,3.465736
1684,opt,0.002882,3.465736
1723,shylock,0.002882,3.465736
1720,broadcasters,0.002457,3.465736
1715,issues,0.002457,3.465736
1706,threat,0.002882,3.465736
1704,mean,0.002882,3.465736
1699,tivotogo,0.002457,3.465736
1657,ultimately,0.002457,3.465736
1768,reports,0.002882,3.465736
