<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

* Let’s import the modules that we will need, and initialize our environment:

In [221]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

%matplotlib inline

In [222]:
df = pd.read_csv('medium_posts.csv.zip', sep='\t')

In [223]:
df = df[['published', 'url']].dropna().drop_duplicates()

In [224]:
df.shape

(92180, 2)

Next, we need to convert published to the datetime format because by default pandas treats this field as string-valued.

In [225]:
df['published'] = pd.to_datetime(df['published'], utc = True)

In [226]:
df.sort_values(by=['published']).head(n=3)

Unnamed: 0,published,url
50931,1970-01-01 00:00:00.001000+00:00,https://medium.com/iiot
40243,1970-01-01 00:00:00.001000+00:00,https://medium.com/@ikaella/melon-rebranding-b...
37395,1970-01-18 05:11:46.500000+00:00,http://www.novosti.rs/%D0%B2%D0%B5%D1%81%D1%82...


In [227]:
df = df[(df['published'] > '2012-08-15') & (df['published'] < '2017-06-26')].sort_values(by=['published'])
df.head(n=3)

Unnamed: 0,published,url
24630,2012-08-15 00:25:03.373000+00:00,https://medium.com/launch-day/jean-attempts-to...
24631,2012-08-15 00:25:29.419000+00:00,https://medium.com/launch-day/dan-and-kristin-...
17811,2012-08-15 00:34:59.502000+00:00,https://medium.com/i-m-h-o/the-world-is-social...


In [228]:
df.tail(n=3)

Unnamed: 0,published,url
62122,2017-06-25 23:36:01.171000+00:00,https://medium.com/push-the-pace/the-official-...
72471,2017-06-25 23:41:48.295000+00:00,https://medium.com/parti-xyz-developers/%EA%B4...
83283,2017-06-25 23:51:43+00:00,http://www.johanr.com/blog/people-support-dreams


In [229]:
aggr_df = df.groupby('published')[['url']].count()
aggr_df.columns = ['posts']
aggr_df.head(n=3)

Unnamed: 0_level_0,posts
published,Unnamed: 1_level_1
2012-08-15 00:25:03.373000+00:00,1
2012-08-15 00:25:29.419000+00:00,1
2012-08-15 00:34:59.502000+00:00,1


In [230]:
daily_df = aggr_df.resample('D').apply(sum)
daily_df.head(n=3)

Unnamed: 0_level_0,posts
published,Unnamed: 1_level_1
2012-08-15 00:00:00+00:00,16
2012-08-16 00:00:00+00:00,11
2012-08-17 00:00:00+00:00,4


In [231]:
from plotly.offline import init_notebook_mode, iplot
from plotly import graph_objs as go

# Initialize plotly
init_notebook_mode(connected=True)
def plotly_df(df, title=''):
    """Visualize all the dataframe columns as line plots."""
    common_kw = dict(x=df.index, mode='lines')
    data = [go.Scatter(y=df[c], name=c, **common_kw) for c in df.columns]
    layout = dict(title=title)
    fig = dict(data=data, layout=layout)
    iplot(fig, show_link=False)

In [232]:
plotly_df(daily_df, title='Posts on Medium (daily)')

In [233]:
weekly_df = daily_df.resample('W').apply(sum)

In [234]:
plotly_df(weekly_df, title='Posts on Medium (weekly)')

In [235]:
daily_df = daily_df.loc[daily_df.index >= '2015-01-01']
daily_df.head(n=3)

Unnamed: 0_level_0,posts
published,Unnamed: 1_level_1
2015-01-01 00:00:00+00:00,8
2015-01-02 00:00:00+00:00,11
2015-01-03 00:00:00+00:00,11


In [236]:
from fbprophet import Prophet

import logging
logging.getLogger().setLevel(logging.ERROR)

In [237]:
df = daily_df.reset_index()
df.columns = ['ds', 'y']
df.tail(n=3)

Unnamed: 0,ds,y
904,2017-06-23 00:00:00+00:00,421
905,2017-06-24 00:00:00+00:00,277
906,2017-06-25 00:00:00+00:00,253


In [248]:
prediction_size = 30
train_df = df[:-prediction_size]
train_df.tail(n=3)
train_df['ds'] = pd.to_datetime(train_df['ds'], utc = True)

In [247]:
m = Prophet()
m.fit(train_df)

ValueError: Column ds has timezone specified, which is not supported. Remove timezone.

In [246]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 877 entries, 0 to 876
Data columns (total 2 columns):
ds    877 non-null datetime64[ns, UTC]
y     877 non-null int64
dtypes: datetime64[ns, UTC](1), int64(1)
memory usage: 13.8 KB
