In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
# The loop variables (`dirname`, `filenames`, `filename`) keep their last
# values at module level once the walk finishes (if it visited anything).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook-wide matplotlib styling, applied in a single update so the whole
# theme can be read (and tweaked) in one place.
import matplotlib

matplotlib.rcParams.update({
    # Typography and text colours.
    'font.family': "sans-serif",
    'font.weight': "light",
    'font.style': "normal",
    'ytick.color': "#434343",
    'xtick.color': "#434343",
    'text.color': "#434343",
    # Hide all four axis spines.
    'axes.spines.left': False,
    'axes.spines.bottom': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.titleweight': 'semibold',
    # Draw grid lines underneath the plotted data.
    'axes.axisbelow': True,
    # Hide the tick marks themselves.
    'xtick.bottom': False,
    'ytick.left': False,
    'grid.color': '#B5B6B8',
    'grid.alpha': 0.5,
    # Resolution used when figures are written with savefig.
    'savefig.dpi': 300,
})

# Shared colours for the plots below: palette[0] is the accent used for
# highlighted or primary series, palette[1] the colour for everything else.
palette = ['#A45FD6', '#B5B6B8', '#ECECED']

In [None]:
# Locate the dataset by file name under the Kaggle input tree instead of
# depending on a loop variable left over from an earlier directory listing
# (which is only correct if the walk happened to end in the right directory,
# and is undefined if the walk found nothing).
DATA_FILENAME = 'articles_data.csv'
data_path = next(
    (
        os.path.join(root, DATA_FILENAME)
        for root, _dirs, files in os.walk('/kaggle/input')
        if DATA_FILENAME in files
    ),
    None,
)
if data_path is None:
    # Fail with an explicit message rather than an opaque NameError/path error.
    raise FileNotFoundError(DATA_FILENAME + ' not found under /kaggle/input')

df = pd.read_csv(data_path)

In [None]:
# Preview the first few rows of the loaded articles table.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts to spot missing data.
df.info()

In [None]:
# Who publishes the articles: each source's share of all articles, smallest
# share first. Despite its name, the 'percentage' column holds a fraction
# (per-source count divided by the total count), not a value out of 100.
article_counts = df.groupby('source_name').agg(percentage=('source_id', 'count'))
source_dist = (article_counts / df.source_id.count()).sort_values('percentage')

# After the ascending sort the biggest publisher is the last row; give it the
# accent colour and draw every other bar in the neutral colour.
top = source_dist.index[-1]
color = [palette[0] if source == top else palette[1] for source in source_dist.index]
source_dist.percentage.plot(kind='barh', color=color)


In [None]:
# Let's look at the engagement metrics: collect every column whose name
# contains 'engagement', then display the list so the cell actually shows
# what they all are.
engagement_cols = [col for col in df.columns if 'engagement' in col]
engagement_cols

In [None]:
# Keep only the rows that have a value for every engagement metric.
df = df.dropna(subset=engagement_cols)


In [None]:
# Summary statistics for the engagement metrics, transposed so that each
# metric is a row and each statistic (e.g. the '75%' quartile used by later
# cells) is a column.
engagement_summary = df.loc[:, engagement_cols].describe().transpose()
engagement_summary

In [None]:
# Distribution of each engagement KPI, trimming off the upper 25% so the
# extreme values do not flatten the histogram.
for col in engagement_cols:
    fig, ax = plt.subplots()
    upper_bound = engagement_summary.loc[col, '75%']
    # Use `<=` rather than `<`: values tied at the 75th percentile belong to
    # the lower 75%. With a strict `<`, a metric whose 75th percentile equals
    # its minimum would produce an empty histogram. This is also the exact
    # complement of a "top 25%" defined as values strictly above the quartile.
    trimmed = df.loc[df[col] <= upper_bound, col]
    # Draw on the axes created above instead of relying on the implicit
    # "current axes" of the pyplot state machine.
    trimmed.hist(ax=ax, color=palette[0])
    ax.set_title(col.replace('_', ' '), x=0, ha='left')


# Investigating the correlations between the engagement metrics.

In [None]:
# Pairwise Spearman rank correlations between the engagement metrics.
spearman_matrix = df[engagement_cols].corr(method='spearman')

# Reshape the square matrix into long format: one row per pair of metrics,
# with the pair in the 'index' and 'variable' columns and the coefficient in
# 'value', so the pairs can be ordered.
pairwise = spearman_matrix.reset_index().melt(id_vars='index')

# Keeping only rows where 'index' sorts after 'variable' discards both the
# mirrored duplicate of every pair and the self-correlations on the diagonal.
unique_pairs = pairwise.query('index > variable')

# Strongest correlation first, with friendlier column names and a clean
# 0..n-1 index so that row 0 is the most correlated pair.
correlations = (
    unique_pairs
    .sort_values('value', ascending=False)
    .rename(columns={'index': 'col1', 'variable': 'col2'})
    .reset_index(drop=True)
)
correlations

The plugin metric has the weakest correlation with the other engagement metrics. The strongest correlation is between the comment and reaction counts.

In [None]:
# Scatter plot of the most strongly correlated pair of metrics, i.e. the
# first row of `correlations` (which is sorted by descending coefficient).
x_col, y_col = correlations.loc[0, ['col1', 'col2']]
ax = df.plot(kind='scatter', x=x_col, y=y_col, color=palette[0])
# Optional zoom: uncomment to clip both axes at each metric's 75th percentile.
# ax.set_xlim(0, engagement_summary.loc[x_col, '75%'])
# ax.set_ylim(0, engagement_summary.loc[y_col, '75%'])


Reaction count is a reasonable predictor for comment count.

# What are the common features of the articles with high reaction counts?

In [None]:
# Articles whose reaction count lies strictly above the 75th percentile,
# i.e. the "top 25%" by reactions.
reaction_q3 = engagement_summary.loc['engagement_reaction_count', '75%']
top_reaction_articles = df[df['engagement_reaction_count'] > reaction_q3]

# Each source's share of those top-25% articles (a fraction, like
# source_dist's 'percentage' column).
top_share_by_source = (
    top_reaction_articles
    .groupby('source_name')
    .agg(percentage_in_top_25=('source_id', 'count'))
    / top_reaction_articles.source_id.count()
)

# Compare the top-25% share with each source's overall share.
# how='right' keeps every source present in source_dist: a source with no
# article in the top 25% has a share of 0 there (filled below) and must show
# up as under-represented rather than silently vanish from the chart.
# NOTE(review): source_dist is computed before the rows with missing
# engagement metrics are dropped, so the baseline population is slightly
# larger than the one used for the top-25% share - confirm this is intended.
df_top_reactions = (
    top_share_by_source
    .join(source_dist, how='right')
    .fillna({'percentage_in_top_25': 0})
    .assign(difference_in_dist=lambda x: x['percentage_in_top_25'] - x['percentage'])
    .sort_values('difference_in_dist')
)

# Red bars: under-represented in the top 25%; green bars: over-represented.
ax = df_top_reactions.difference_in_dist.plot(
    kind='barh',
    color=['red' if x < 0 else 'green' for x in df_top_reactions.difference_in_dist],
    width=0.85
)
ax.set_title('Difference between source distribution \nin top 25% and the general population', x=0, ha='left')

In this sample, The New York Times has a much higher percentage of articles in the top 25% of reactions than its percentage of articles overall, indicating that The New York Times has a much higher quality of traffic. Reuters and the Irish Times have a much lower percentage of articles in the top 25% of reactions.