In [2]:
!pip install sklearn



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
pd.set_option("max_rows", 600)
from pathlib import Path  
import glob

In [4]:
#change directory path
#use text processed files
#change from .csv to .txt
directory_path = "Desktop/TFIDF/TFIDF_processed"

In [5]:
text_files = glob.glob(f"{directory_path}/*.txt")

In [6]:
text_files

['Desktop/TFIDF/TFIDF_processed/Factiva_Tampon_Tax.txt',
 'Desktop/TFIDF/TFIDF_processed/Twitter_Tampon_Tax_Political.txt',
 'Desktop/TFIDF/TFIDF_processed/Reddit_Tampon_Tax_Political.txt',
 'Desktop/TFIDF/TFIDF_processed/Twitter_Tampon_Tax_All.txt']

In [7]:
text_titles = [Path(text).stem for text in text_files]

In [8]:
text_titles

['Factiva_Tampon_Tax',
 'Twitter_Tampon_Tax_Political',
 'Reddit_Tampon_Tax_Political',
 'Twitter_Tampon_Tax_All']

In [9]:
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')

In [10]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [11]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names())

In [12]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [13]:
#change slice words
tfidf_slice = tfidf_df[['remove','sign', 'end','repeal', 'vote', 'gov', 'eliminate']]
tfidf_slice.sort_index().round(decimals=2)

Unnamed: 0,remove,sign,end,repeal,vote,gov,eliminate
00_Document Frequency,3.0,4.0,3.0,4.0,3.0,4.0,4.0
Factiva_Tampon_Tax,0.02,0.02,0.07,0.03,0.04,0.0,0.03
Reddit_Tampon_Tax_Political,0.22,0.24,0.16,0.11,0.0,0.18,0.19
Twitter_Tampon_Tax_All,0.03,0.08,0.0,0.04,0.04,0.01,0.06
Twitter_Tampon_Tax_Political,0.0,0.02,0.0,0.02,0.12,0.03,0.02


In [14]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')

In [15]:
tfidf_df.stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,Factiva_Tampon_Tax,1019,0.0
1,Factiva_Tampon_Tax,10am,0.0
2,Factiva_Tampon_Tax,10th,0.0
3,Factiva_Tampon_Tax,11,0.0
4,Factiva_Tampon_Tax,110,0.0
...,...,...,...
51067,Twitter_Tampon_Tax_All,zucker,0.0
51068,Twitter_Tampon_Tax_All,zuckerberg,0.0
51069,Twitter_Tampon_Tax_All,zukiewicz,0.0
51070,Twitter_Tampon_Tax_All,zumiez,0.0


In [16]:
tfidf_df = tfidf_df.stack().reset_index()

In [17]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})

In [18]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(15)

Unnamed: 0,document,term,tfidf
10055,Factiva_Tampon_Tax,sale,0.209666
10046,Factiva_Tampon_Tax,said,0.19571
9869,Factiva_Tampon_Tax,right,0.179218
3170,Factiva_Tampon_Tax,document,0.162368
4050,Factiva_Tampon_Tax,feminine,0.132578
12699,Factiva_Tampon_Tax,year,0.120616
8558,Factiva_Tampon_Tax,percent,0.120482
7111,Factiva_Tampon_Tax,make,0.118268
12717,Factiva_Tampon_Tax,york,0.115299
12611,Factiva_Tampon_Tax,word,0.109733


In [19]:
top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

In [20]:
top_tfidf[top_tfidf['term'].str.contains('petition')]

Unnamed: 0,document,term,tfidf


In [21]:
top_tfidf[top_tfidf['document'].str.contains('Reddit')]

Unnamed: 0,document,term,tfidf
27035,Reddit_Tampon_Tax_Political,business,0.275123
36904,Reddit_Tampon_Tax_Political,taxcut,0.244271
36023,Reddit_Tampon_Tax_Political,sign,0.236732
35177,Reddit_Tampon_Tax_Political,remove,0.222736
28322,Reddit_Tampon_Tax_Political,delete,0.220099
30547,Reddit_Tampon_Tax_Political,holiday,0.220099
29005,Reddit_Tampon_Tax_Political,eliminate,0.191206
30142,Reddit_Tampon_Tax_Political,gov,0.182101
26275,Reddit_Tampon_Tax_Political,asking,0.178189
36858,Reddit_Tampon_Tax_Political,tampon,0.165074


In [22]:
!pip install altair



In [23]:
import altair as alt
import numpy as np

In [24]:
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization - update accordingly
term_list = ['remove','sign', 'end', 'repeal', 'vote', 'eliminate', 'nygovcuomo','jerrybrowngov']

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 1300)