# Importing libraries

In [None]:
import pandas as pd
import seaborn as sns
import dateutil.parser
import plotly.graph_objects as go

# Loading dataset

In [None]:
# Location of the articles CSV, relative to the notebook's working directory.
articles_csv = "../input/internet-articles-data-with-users-engagement/articles_data.csv"

# The first CSV column is used as the row index rather than as a data column.
df = pd.read_csv(articles_csv, index_col=0)
df.head()

# Dataset info

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column;
# with default arguments pandas restricts this to the numeric columns.
df.describe()

### Top publishers

In [None]:
# value_counts() orders publishers by descending article count,
# so the first five rows are the five most prolific publishers.
publisher_counts = df["source_name"].value_counts()
top_publishers = publisher_counts.head(5)
top_publishers

Selecting only the top publishers for further analysis

In [None]:
# Names of the top publishers, in descending order of article count.
top_publishers_list = top_publishers.index.tolist()

# Keep only the articles written by one of those publishers.
is_top_publisher = df["source_name"].isin(top_publishers_list)
df_sample = df[is_top_publisher]

### Top articles based on reactions

In [None]:
def top_n(df, n, main_field, grouping_fields, sumi=False):
    """Return the ``n`` rows with the largest ``main_field`` value.

    Args:
        df: Source DataFrame.
        n: Number of rows to return.
        main_field: Name of the column to rank by.
        grouping_fields: List of column names kept next to ``main_field``;
            when ``sumi`` is true they are also the group-by keys.
        sumi: When true, sum ``main_field`` per ``grouping_fields`` group
            before ranking (the grouping fields then become the index).

    Returns:
        A DataFrame sorted by ``main_field`` in descending order and cut to
        ``n`` rows. The ranked column is always renamed to
        ``main_field + '_sum'``, whether or not ``sumi`` is set.
    """
    selected = grouping_fields + [main_field]

    # Rows missing any of the selected columns cannot be ranked or grouped.
    ranked = df.dropna(subset=selected)[selected]

    if sumi:
        ranked = ranked.groupby(grouping_fields).sum()

    ranked = ranked.sort_values(by=main_field, ascending=False)
    ranked = ranked.rename(columns={main_field: main_field + '_sum'})
    return ranked.head(n)

# Ten articles (title + publisher) with the highest reaction count.
top_n(
    df_sample,
    10,
    grouping_fields=['title', 'source_name'],
    main_field='engagement_reaction_count',
)

### Top articles based on shares

In [None]:
# Ten articles (title + publisher) with the highest share count.
top_n(
    df_sample,
    10,
    grouping_fields=['title', 'source_name'],
    main_field='engagement_share_count',
)

### Top authors based on `top_article` attribute

In [None]:
# Ten authors with the largest per-author sum of `top_article`.
top_n(
    df_sample,
    10,
    grouping_fields=['author'],
    main_field='top_article',
    sumi=True,
)

# Visualizing data

For a subset of publishers

### Engagement per publisher

In [None]:
def engagement_per_publisher(df):
    """Sum every engagement metric per publisher.

    Args:
        df: DataFrame with a ``source_name`` column and the three engagement
            count columns listed in ``engagement_columns`` below.

    Returns:
        A ``(publishers, epp)`` tuple. ``publishers`` is the list of
        publisher names in group-by (sorted) order. ``epp`` is a list of
        ``(column_name, totals)`` pairs where ``totals[i]`` is the summed
        metric for ``publishers[i]``.
    """
    engagement_columns = [
        'engagement_reaction_count',
        'engagement_comment_count',
        'engagement_share_count',
    ]
    totals = df.groupby('source_name')[engagement_columns].sum()

    # Take the labels from the grouped index so they line up with the totals;
    # df['source_name'].unique() is in order of first appearance, which does
    # not match the sorted order produced by groupby.
    publishers = totals.index.tolist()
    epp = [(column, totals[column].tolist()) for column in engagement_columns]
    return publishers, epp


publishers, engagements_per_publisher = engagement_per_publisher(df_sample)

# One stacked bar trace per engagement metric, all sharing the publisher axis.
fig = go.Figure()
for eng_name, eng_results in engagements_per_publisher:
    fig.add_trace(go.Bar(x=publishers, y=eng_results, name=eng_name))

fig.update_layout(barmode='stack', xaxis={'categoryorder': 'array'})
fig.show()

### Articles collected every day

In [None]:
def articles_per_publisher(df):
    """Count the articles each publisher released on each day.

    Args:
        df: DataFrame with ``source_name`` and ``published_at`` columns;
            ``published_at`` holds date strings parseable by ``dateutil``.
            The frame is not modified.

    Returns:
        A ``(ldp_chunks, dates_list)`` tuple. ``dates_list`` is the sorted
        list of ISO days (``YYYY-MM-DD``) seen in the data. ``ldp_chunks``
        has one list per publisher, in sorted publisher order, where entry
        ``i`` is that publisher's article count on ``dates_list[i]`` (0 when
        the publisher has no article on that day). Both are empty when no
        row has a publisher and a publication date.
    """
    # Imported here so the function also works when run on its own.
    import dateutil.parser

    def parse_date(date_string):
        return dateutil.parser.parse(date_string).date().isoformat()

    df = df.dropna(subset=['source_name', 'published_at'])
    if df.empty:
        return [], []

    # assign() builds a new frame, so nothing is written back into a slice
    # of the caller's DataFrame.
    df = df.assign(published_at_day=df['published_at'].apply(parse_date))

    # Publisher x day matrix. fill_value=0 keeps every row the same length
    # even if a publisher has no article on some day, so counts stay aligned
    # with dates_list.
    counts = (
        df.groupby(['source_name', 'published_at_day'])
        .size()
        .unstack(fill_value=0)
        .sort_index(axis=1)
    )

    dates_list = counts.columns.tolist()
    ldp_chunks = counts.values.tolist()
    return ldp_chunks, dates_list

ldp_chunks, dates_list = articles_per_publisher(df_sample)

# articles_per_publisher groups by source_name, so its chunks come back in
# sorted publisher order. top_publishers_list is ordered by article count,
# so label the traces with the sorted names to keep each bar series attached
# to the right publisher.
# NOTE(review): assumes every top publisher still has rows after the
# function drops missing dates - confirm against the data.
publishers_sorted = sorted(top_publishers_list)

fig = go.Figure()
for publisher, count in zip(publishers_sorted, ldp_chunks):
    fig.add_trace(go.Bar(x=dates_list, y=count, name=publisher))

fig.update_layout(barmode='stack', xaxis={'categoryorder': 'array', 'categoryarray': dates_list})
fig.show()

### Heatmap

In [None]:
# Numeric columns summed per source. `source_name` is left out: it is a
# string column, so it cannot take part in the min-max arithmetic below.
heatmap_columns = [
    'top_article',
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
    'engagement_comment_plugin_count',
]

# Select the columns with a list; selecting several columns on a GroupBy
# with a bare tuple of names is no longer supported by recent pandas.
df_temp = df_sample.groupby(['source_id'])[heatmap_columns].sum()

# min-max normalization: rescale every column to the [0, 1] range
column_min = df_temp.min()
column_max = df_temp.max()
df_temp = (df_temp - column_min) / (column_max - column_min)

ax = sns.heatmap(df_temp, cmap='RdYlGn_r', robust=True, annot_kws={'size': 14})
ax.tick_params(labelsize=14)
ax.figure.set_size_inches((5, 5))