In [55]:
import pandas as pd
from glob import glob
import seaborn as sns
from bokeh.io import curdoc
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, RangeTool, NumeralTickFormatter, Range1d, LabelSet, HoverTool
from bokeh.themes import Theme
from bokeh.embed import components
from bokeh.transform import cumsum
from bokeh.layouts import layout
from bokeh.models.widgets import Tabs, Panel
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

ModuleNotFoundError: No module named 'wordcloud'

In [24]:
# Styles for Bokeh Plots
THEME = Theme(json={
    'attrs': {
        'Figure': {
            'sizing_mode': 'stretch_width',
            'plot_height': 500,
            'background_fill_color': '#FFFFFF',
            'border_fill_color': '#FFFFFF',
            'outline_line_color': '#FFFFFF',
        },
        'Axis': {
            'major_label_text_font_size': '12pt',
            'minor_tick_line_color': "white",
            'axis_label_text_font_size': '14pt',
            'axis_label_text_font_style': 'bold',
            'axis_label_text_font_style': 'italic'
        },
        'Grid': {
            'grid_line_color': None
        },
        'Line': {
            'line_color': '#000000',
            'line_width': 2,
        },
        'Bar': {
            'fill_color': '#000000'
        },
        'Title': {
            'text_color': "#000000",
            'text_font_size': '16pt',
            'text_font_style': 'bold'
        }
    }
})
doc = curdoc()
doc.theme = THEME

# Colors for charts
PRIMARY_COLOR = '#000000'
SECONDARY_COLOR = '#000000'
GRAY = '#A9A9A9'
COLORS = ['#4101f5', '#21e68c', '#b22c99',
          '#8c1932', '#509bf5', '#fae62c', '#f59b22', '#f5729f', '#ff4935']
output_notebook()

# Combine Datasets

In [8]:
df = pd.DataFrame()
for csv in glob('data/*.csv'):
    this_df = pd.read_csv(csv)
    df = df.append(this_df)

df.set_index('date', inplace=True)
df.to_csv('combined_data.csv')

In [9]:
combined_df = pd.read_csv('combined_data.csv')
combined_df

Unnamed: 0,date,published_at,source_id,source_name,author,content,title,url,url_image
0,2021-08-02,2021-08-02T20:10:00,reuters,Reuters,Gertrude Chavez-dreyfuss,A representation of the virtual cryptocurrency...,Crypto sector sees outflows for 4th week in a ...,https://www.reuters.com/business/finance/crypt...,https://www.reuters.com/resizer/lmbIm3yyzD-3za...
1,2021-08-02,2021-08-02T15:56:45,the-verge,The Verge,Makena Kelly,It includes billions for broadband expansion\r...,Senate negotiators release bipartisan infrastr...,https://www.theverge.com/2021/8/2/22605917/sen...,https://cdn.vox-cdn.com/thumbor/-Ptu6iB68qfRKY...
2,2021-08-02,2021-08-02T10:53:05,vice-news,Vice News,"Rimal Farrukh, Sahar Habib Ghazi","After a fight with his best friend, an embitte...",The Viral Friendship Break Up Meme Just Sold F...,https://www.vice.com/en/article/m7ek5v/viral-f...,https://video-images.vice.com/articles/6107c17...
3,2021-08-02,2021-08-02T10:53:05,vice-news,Vice News,"Rimal Farrukh, Sahar Habib Ghazi","After a fight with his best friend, an embitte...",The Viral Friendship Break Up Meme Just Sold F...,https://www.vice.com/en/article/m7ek5v/viral-f...,https://video-images.vice.com/articles/6107c17...
4,2021-08-02,2021-08-02T04:16:41,,HYPEBEAST,"info@hypebeast.com (HYPEBEAST), HYPEBEAST",CryptoPunks have seen a resurgence once again ...,CryptoPunk Prices See 53% Increase After Ether...,https://hypebeast.com/2021/8/cryptopunk-price-...,https://image-cdn.hypb.st/https%3A%2F%2Fhypebe...
...,...,...,...,...,...,...,...,...,...
796,2021-08-08,2021-08-08T13:56:47,associated-press,Associated Press,Lisa Mascaro,WASHINGTON (AP) Senators will resume a weekend...,Senate slog to pass infrastructure bill goes o...,https://apnews.com/article/congress-senate-inf...,https://storage.googleapis.com/afs-prod/media/...
797,2021-08-08,2021-08-08T22:00:18,,Survivalblog.com,James Wesley Rawles,Here are the latest news items and commentary ...,Economics & Investing For Preppers,https://survivalblog.com/2021/08/09/economics-...,https://survivalblog.com/wp-content/uploads/20...
798,2021-08-08,2021-08-08T22:00:38,,Bitcoinist,Jacob Holiday,"Brian Brooks, CEO resigned as CEO of Binance.U...",Binance US CEO Brian Brooks Resigns After 4 Mo...,https://bitcoinist.com/binance-us-ceo-brian-br...,https://bitcoinist.com/wp-content/uploads/2021...
799,2021-08-08,2021-08-08T04:16:10,associated-press,Associated Press,Lisa Mascaro,WASHINGTON (AP) Senators will resume a weekend...,Senate slog to pass infrastructure bill goes o...,https://apnews.com/article/joe-biden-business-...,https://storage.googleapis.com/afs-prod/media/...


# Posts by Publication

In [26]:
by_publication = combined_df.groupby('source_name')[['url']].nunique().reset_index().sort_values(by='url').tail(20)
by_publication.columns = ['source', 'articles']

In [33]:
source = ColumnDataSource(by_publication)
p = figure(y_range = by_publication['source'], toolbar_location=None)
p.hbar(y='source', right='articles', source=source, color='#000000', height=.5)
p.x_range = Range1d(0, by_publication['articles'].max()*1.05)
show(p)

# By Author

In [36]:
by_author = combined_df.groupby('author')[['url']].nunique().reset_index().sort_values(by='url').tail(20)
by_author.columns = ['author', 'articles']
by_author['truncated'] = by_author['author'].apply(lambda x: x[0:45])

In [37]:
source = ColumnDataSource(by_author)
p = figure(y_range = by_author['truncated'], toolbar_location=None)
p.hbar(y='truncated', right='articles', source=source, color='#000000', height=.5)
p.x_range = Range1d(0, by_author['articles'].max()*1.05)
show(p)

# Word Counts

In [38]:
combined_df.head()

Unnamed: 0,date,published_at,source_id,source_name,author,content,title,url,url_image
0,2021-08-02,2021-08-02T20:10:00,reuters,Reuters,Gertrude Chavez-dreyfuss,A representation of the virtual cryptocurrency...,Crypto sector sees outflows for 4th week in a ...,https://www.reuters.com/business/finance/crypt...,https://www.reuters.com/resizer/lmbIm3yyzD-3za...
1,2021-08-02,2021-08-02T15:56:45,the-verge,The Verge,Makena Kelly,It includes billions for broadband expansion\r...,Senate negotiators release bipartisan infrastr...,https://www.theverge.com/2021/8/2/22605917/sen...,https://cdn.vox-cdn.com/thumbor/-Ptu6iB68qfRKY...
2,2021-08-02,2021-08-02T10:53:05,vice-news,Vice News,"Rimal Farrukh, Sahar Habib Ghazi","After a fight with his best friend, an embitte...",The Viral Friendship Break Up Meme Just Sold F...,https://www.vice.com/en/article/m7ek5v/viral-f...,https://video-images.vice.com/articles/6107c17...
3,2021-08-02,2021-08-02T10:53:05,vice-news,Vice News,"Rimal Farrukh, Sahar Habib Ghazi","After a fight with his best friend, an embitte...",The Viral Friendship Break Up Meme Just Sold F...,https://www.vice.com/en/article/m7ek5v/viral-f...,https://video-images.vice.com/articles/6107c17...
4,2021-08-02,2021-08-02T04:16:41,,HYPEBEAST,"info@hypebeast.com (HYPEBEAST), HYPEBEAST",CryptoPunks have seen a resurgence once again ...,CryptoPunk Prices See 53% Increase After Ether...,https://hypebeast.com/2021/8/cryptopunk-price-...,https://image-cdn.hypb.st/https%3A%2F%2Fhypebe...


In [53]:
no_nulls = combined_df.dropna(subset=['title', 'content'])
no_nulls['text_concat'] = no_nulls['title'] + ' ' + no_nulls['content']
full_text = ', '.join(no_nulls['text_concat'].tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [54]:
full_text

