In [1]:
import pandas as pd
import numpy as np
import re
from bokeh.charts import Bar, Scatter, output_notebook, show, output_file
from bokeh.charts.attributes import CatAttr, color
from bokeh.models import HoverTool, Range1d, Span, LabelSet, ColumnDataSource, Title, NumeralTickFormatter
from bokeh.plotting import figure
import matplotlib.pyplot as plt



In [2]:
# Input locations: both tables live under the same data directory.
data_directory = "../Data/"
counts_file = data_directory + "counts.csv"
tfidf_file = data_directory + "tfidf.csv"

# Load each table and immediately discard rows whose 'term' is missing.
df_counts = pd.read_csv(counts_file).dropna(subset=['term'])
df_tfidf = pd.read_csv(tfidf_file).dropna(subset=['term'])

##### TF-IDF doesn't seem very useful, as the same words pop out for both sites

In [None]:
# Preview only the first rows; echoing the whole frame floods the notebook output.
df_tfidf.head(10)

In [None]:
# Express each site's word counts as a percentage of that site's total word count
df_counts['CNN'] = df_counts['CNN'].div(df_counts['CNN'].sum()).mul(100)
df_counts['Fox'] = df_counts['Fox'].div(df_counts['Fox'].sum()).mul(100)

In [None]:
# Peek at the first two rows of the normalised (still wide) table.
df_counts.head(2)

In [None]:
# Reshape wide -> long: every non-'term' column becomes a 'site' value,
# with its numbers collected in 'term_pct'.
df_counts = df_counts.melt(
    id_vars='term',
    var_name='site',
    value_name='term_pct',
)

In [None]:
# Peek at the first two rows of the long-format table.
df_counts.head(2)

In [None]:
# Number of highest-percentage terms to keep for each site.
TOP_NUMBER = 5

# Rank each site's rows by term percentage (largest first) and keep the top TOP_NUMBER.
top_CNN = df_counts[df_counts['site'] == 'CNN'].sort_values(by='term_pct', ascending=False).head(TOP_NUMBER)
top_Fox = df_counts[df_counts['site'] == 'Fox'].sort_values(by='term_pct', ascending=False).head(TOP_NUMBER)

top_CNN_term = top_CNN['term'].tolist()
top_Fox_terms = top_Fox['term'].tolist()
# Union of both lists; set() removes terms that made the top list for both sites.
top_terms = list(set(top_CNN_term + top_Fox_terms))

# Keep every row (both sites) for the selected terms. The explicit .copy() gives
# plot_data its own storage, so the title-casing assignment below does not raise
# SettingWithCopyWarning and cannot write through to df_counts.
plot_data = df_counts.loc[df_counts['term'].isin(top_terms)].copy()
plot_data['term'] = plot_data['term'].str.title()

In [None]:
# Let's plot this with Bokeh, making an HTML file
# NOTE(review): Bar and CatAttr are imported from bokeh.charts, which appears to have
# been removed from later Bokeh releases -- confirm the Bokeh version this notebook targets.
# The chart is given 'term' as the (sorted) category label, 'term_pct' as the bar
# values, and 'site' as the grouping column.
p = Bar(plot_data, label=CatAttr(columns=['term'], sort=True), values='term_pct',
         group = "site", legend = "top_right", tools="previewsave", height=600, width=900,
        title="Top Terms for CNN and Fox", xlabel="Term", ylabel="Percentage of Terms")

# Fix bar width issue
# Best effort: set the glyph width on every renderer that has one; renderers
# without a glyph (or whose glyph has no width) raise AttributeError and are skipped.
for r in p.renderers:
    try:
        r.glyph.width = 0.33
    except AttributeError:
        pass

# Attach a small left-aligned caption describing the data below the plot.
msg = """Note: Data are from CNN.com and Foxnews.com.  Common and one-letter words have been excluded."""
caption = Title(text=msg, align='left', text_font_size='8pt')
p.add_layout(caption, 'below')

# Direct Bokeh's output to an HTML file, then render the chart.
output_file("../Output/term_pct.html")
show(p)

In [None]:
# We can make a similar plot using Matplotlib (ggplot is buggy), producing a PNG image
%matplotlib inline

plot_data = plot_data.sort_values(by = 'term')
cnn_data = plot_data.loc[plot_data['site'] == 'CNN']
fox_data = plot_data.loc[plot_data['site'] == 'Fox']
cnn = cnn_data['term_pct'].tolist()
fox = fox_data['term_pct'].tolist()
ind = np.arange(len(cnn))
width = 0.35

fig, ax = plt.subplots()
rects1 = ax.bar(ind, cnn, width, color='r')
rects2 = ax.bar(ind + width, fox, width, color='y')

# add some text for labels, title and axes ticks
ax.set_title('Term Frequency by News Source', fontsize = 10)
ax.set_ylabel('Percentage of Terms', fontsize = 8)
ax.set_xticks(ind + width)
ax.set_xticklabels(tuple(cnn_data['term'].tolist()), fontsize = 4, rotation = 45)

ax.legend((rects1[0], rects2[0]), ('CNN', 'Fox'), prop={'size':6})

fig.savefig('../Output/term_pct.png', dpi = 250)

In [None]:
# Ratio of each (term, site) percentage to that term's mean percentage across sites.
# Step 1: one row per term holding its average percentage.
term_data = (
    df_counts
    .groupby('term')['term_pct']
    .mean()
    .rename('term_pct_ave')
    .reset_index()
)
# Step 2: attach the average to every (term, site) row and overwrite
# term_pct with the ratio.
term_data = df_counts.merge(term_data, how='inner', on='term')
term_data['term_pct'] = term_data['term_pct'].div(term_data['term_pct_ave'])

In [None]:
# Number of highest-ratio terms to keep for each site.
TOP_NUMBER = 5

# Rank each site's rows by term_pct (largest first) and keep the top TOP_NUMBER.
# In term_data, term_pct holds the site's percentage divided by the term's average.
top_CNN = term_data[term_data['site'] == 'CNN'].sort_values(by='term_pct', ascending=False).head(TOP_NUMBER)
top_Fox = term_data[term_data['site'] == 'Fox'].sort_values(by='term_pct', ascending=False).head(TOP_NUMBER)

top_CNN_term = top_CNN['term'].tolist()
top_Fox_terms = top_Fox['term'].tolist()
# Union of both lists; set() removes terms that made the top list for both sites.
top_terms = list(set(top_CNN_term + top_Fox_terms))

# Keep every row (both sites) for the selected terms. The explicit .copy() gives
# plot_data its own storage, so the title-casing assignment below does not raise
# SettingWithCopyWarning and cannot write through to term_data.
plot_data = term_data.loc[term_data['term'].isin(top_terms)].copy()
plot_data['term'] = plot_data['term'].str.title()

In [None]:
# Grouped bar chart of the relative term percentages, saved as a PNG image.
# The Fox values are plotted against the CNN term labels, so both subsets must
# share the same term order; sorting by term before splitting provides that.
plot_data = plot_data.sort_values(by = 'term')
cnn_data = plot_data.loc[plot_data['site'] == 'CNN']
fox_data = plot_data.loc[plot_data['site'] == 'Fox']
cnn = cnn_data['term_pct'].tolist()
fox = fox_data['term_pct'].tolist()
ind = np.arange(len(cnn))  # one x position per term
width = 0.35               # width of a single bar

fig, ax = plt.subplots()
# align is spelled out because the default changed from 'edge' (Matplotlib 1.x)
# to 'center' (Matplotlib 2.0+); the tick positions below depend on it.
rects1 = ax.bar(ind, cnn, width, color='r', align='center')
rects2 = ax.bar(ind + width, fox, width, color='y', align='center')

# add some text for labels, title and axes ticks
ax.set_title('Term Frequency by News Source', fontsize = 10)
ax.set_ylabel('Percentage of Terms divided by Average Percentage', fontsize = 8)
# The two bars of a term are centred at ind and ind + width, so the midpoint of
# the pair is ind + width / 2; placing the tick there centres the label under the group.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(cnn_data['term'].tolist(), fontsize = 6, rotation = 45)

ax.legend((rects1[0], rects2[0]), ('CNN', 'Fox'), prop={'size':6})

fig.savefig('../Output/term_pct_div_ave.png', dpi = 250)