# Table of Contents
* [Import, Overview and Data Preparations](#1)
* [Explore Categorical Features](#2)
* [Explore Numerical Features](#3)
* [Explore Time Features](#4)
* [Wordcloud of Titles](#5)
* [Top 10](#6)
* [Filter Articles by Keyword](#7)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plot
import matplotlib.pyplot as plt

# wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

<a id='1'></a>
# Import, Overview and Data Preparations

In [None]:
# read the Towards Data Science articles CSV and preview the first ten rows
data_path = '../input/towards-data-science-articles-dataset-20102021/tds_data.csv'
df = pd.read_csv(data_path)
df.head(n=10)

In [None]:
# total number of articles (rows) in the data set; reused by later cells
n_rows = len(df)
print('Number of rows:', n_rows)

In [None]:
# print a summary of the data frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# date handling: convert the publication date column to datetime values
df['publish_date'] = pd.to_datetime(df['publish_date'])
# extract year and month and add them to the data frame
df['Year'] = df['publish_date'].dt.year
df['Month'] = df['publish_date'].dt.month
# combined year/month key; the month is zero-padded (e.g. '2019_02') so that
# sorting the keys as strings also orders them chronologically -- with an
# unpadded month, '2019_10' would sort before '2019_2'
df['Year_Month'] = df['publish_date'].dt.strftime('%Y_%m')

<a id='2'></a>
# Explore Categorical Features

In [None]:
# authors: bar chart of the 25 authors with the most articles
top_authors = df['author'].value_counts().iloc[:25]
plt.figure(figsize=(12, 6))
ax = top_authors.plot(kind='bar')
ax.set_title('Authors - Top 25')
ax.grid()
plt.show()

In [None]:
# paid: bar chart of how often each value of the 'paid' column occurs
paid_counts = df['paid'].value_counts()
plt.figure(figsize=(8, 6))
ax = paid_counts.plot(kind='bar')
ax.set_title('Paid')
ax.grid()
plt.show()

<a id='3'></a>
# Explore Numerical Features

In [None]:
# names of the numerical columns analysed in this section
features_num = [
    'claps',
    'responses',
    'reading_time',
]

In [None]:
# basic stats of numerical features
# (summary statistics for the columns listed in features_num)
df[features_num].describe()

### Distributions:

In [None]:
# for every numerical feature draw a histogram (left panel)
# next to a horizontal box plot (right panel)
for f in features_num:
    column = df[f]
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
    ax1, ax2 = axes

    ax1.hist(column, bins=50)
    ax2.boxplot(column, vert=False)

    # both panels share the same grid and title settings
    for panel in axes:
        panel.grid()
        panel.set_title(f)

    plt.show()

### Logarithmic Plots:

In [None]:
# responses: histogram of log10(responses), restricted to articles with at
# least one response (log10 is undefined for zero); the share of articles
# with zero responses is reported in the title instead
n_zero = int((df.responses == 0).sum())
perc_0 = n_zero / n_rows
perc_0 = np.round(100 * perc_0, 2)
tmp_vals = df.responses[df.responses > 0]
title_text = 'log10(Responses|Responses>0),  %(Response=0): ' + str(perc_0) + '%'

plt.figure(figsize=(8, 6))
ax = plt.gca()
ax.hist(np.log10(tmp_vals), bins=100)
ax.set_title(title_text)
ax.grid()
plt.show()

In [None]:
# claps: histogram of log10(claps), restricted to articles with at least
# one clap (log10 is undefined for zero); the share of articles with zero
# claps is reported in the title instead
n_zero = int((df.claps == 0).sum())
perc_0 = n_zero / n_rows
perc_0 = np.round(100 * perc_0, 2)
tmp_vals = df.claps[df.claps > 0]
title_text = 'log10(Claps|Claps>0),  %(Claps=0): ' + str(perc_0) + '%'

plt.figure(figsize=(8, 6))
ax = plt.gca()
ax.hist(np.log10(tmp_vals), bins=100)
ax.set_title(title_text)
ax.grid()
plt.show()

### Scatter Plots:

In [None]:
# scatter plot of claps (y) against responses (x);
# semi-transparent markers keep overlapping points distinguishable
plt.figure(figsize=(8, 6))
ax = plt.gca()
ax.scatter(df.responses, df.claps, alpha=0.25)
ax.set_title('Claps vs Responses')
ax.set_xlabel('Responses')
ax.set_ylabel('Claps')
ax.grid()
plt.show()

<a id='4'></a>
# Explore Time Features

In [None]:
# year: number of articles per publication year, in ascending year order
articles_per_year = df['Year'].value_counts().sort_index()
ax = articles_per_year.plot(kind='bar')
ax.grid()
ax.set_title('Articles by year')
plt.show()

In [None]:
# month: number of articles per calendar month, in ascending month order
articles_per_month = df['Month'].value_counts().sort_index()
ax = articles_per_month.plot(kind='bar')
ax.grid()
ax.set_title('Articles by month')
plt.show()

In [None]:
# combined year+month: number of articles per Year_Month key, sorted by key
articles_per_year_month = df['Year_Month'].value_counts().sort_index()
plt.figure(figsize=(14, 5))
ax = articles_per_year_month.plot(kind='bar')
ax.grid()
ax.set_title('Articles by year/month')
plt.show()

<a id='5'></a>
# Wordcloud of Titles

In [None]:
# concatenate all article titles into one text for the word cloud;
# missing titles (NaN) are dropped first because str.join accepts only strings
text = " ".join(df.title.dropna().astype(str))
# default English stop words shipped with the wordcloud package
stopwords = set(STOPWORDS)

In [None]:
# render a word cloud from the concatenated titles and display it without axes
cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(14, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

<a id='6'></a>
# Top 10

### Most Claps

In [None]:
# the ten articles with the most claps
df_tmp = df.nlargest(n=10, columns='claps')
df_tmp.loc[:, ['title', 'author', 'claps']]

### Most Responses

In [None]:
# the ten articles with the most responses
df_tmp = df.nlargest(n=10, columns='responses')
df_tmp.loc[:, ['title', 'author', 'responses']]

### Highest Reading Time

In [None]:
# the ten articles with the highest reading time
df_tmp = df.nlargest(n=10, columns='reading_time')
df_tmp.loc[:, ['title', 'author', 'reading_time']]

<a id='7'></a>
# Filter Articles by Keyword

In [None]:
# search term that the following cells look for in the article titles
keyword = "quantum"

In [None]:
# keep the articles whose title contains the keyword (case-insensitive).
# na=False counts rows with a missing title as "no match"; without it the
# mask would contain NaN for those rows and boolean indexing would fail.
# NOTE: str.contains interprets the keyword as a regular expression by default.
title_matches = df.title.str.contains(keyword, case=False, na=False)
df_filter = df[title_matches]
print('Number of hits:', df_filter.shape[0])

In [None]:
# preview: display the rows whose title matched the keyword
df_filter

In [None]:
# show all matching titles; non-breaking spaces (U+00A0) and hair spaces
# (U+200A) are replaced by plain spaces so the printed list is easier to read
result_list = [
    title.replace('\xa0', ' ').replace('\u200a', ' ')
    for title in df_filter.title.tolist()
]
result_list

In [None]:
# word cloud built only from the cleaned titles of the filtered articles
text = " ".join(result_list)
stopwords = set(STOPWORDS)

cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=200,
    width=600,
    height=400,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(8, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# write the filtered articles to a CSV file for further processing
export_path = 'df_filter.csv'
df_filter.to_csv(export_path)