# Importing relevant libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
import math
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score,f1_score




import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Use seaborn's 'darkgrid' theme for the plots drawn after this call.
sns.set_style('darkgrid')

In [None]:
# Load the articles dataset from the Kaggle input directory and preview it.
data_path = '/kaggle/input/internet-articles-data-with-users-engagement/articles_data.csv'
df = pd.read_csv(data_path)
df.head()

Getting basic info

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

Let's see how many nulls we have

In [None]:
# Number of missing values per column, most incomplete columns first.
(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

We will remove columns that won't be used

In [None]:
# Columns that the rest of the analysis does not use.
cols_to_remove = [
    'Unnamed: 0',
    'source_id',
    'author',
    'url',
    'url_to_image',
    'description',
    'content',
]

# Remove them from the frame in place.
df.drop(columns=cols_to_remove, inplace=True)

In [None]:
# Re-count missing values per column, most incomplete columns first.
(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

We further drop rows with nulls

In [None]:
# Discard every row that contains at least one missing value ...
df = df.dropna(axis=0, how='any')
# ... and show the per-column missing-value counts of the result.
df.isna().sum()

Now that we cleaned our dataset, we can begin exploring.

In [None]:
# Preview the first rows of the frame.
df.head()

How many media organizations did our dataset take articles from?

In [None]:
# Distinct values of `source_name`. This lists the names themselves;
# `df['source_name'].nunique()` would return just how many there are.
df['source_name'].unique()

Let's look at the distribution

In [None]:
# Bar chart of the number of articles per source, most frequent source first.
source_order = df['source_name'].value_counts().index

plt.figure(figsize=(10, 8))
sns.countplot(x=df['source_name'], order=source_order)
plt.xticks(rotation=45)
plt.title('Count of articles per each newspaper')
plt.show()

# Let's look at the distribution of the time when the articles were published.

The earliest article in the dataset was published at:

In [None]:
# Smallest value in `published_at`.
# NOTE(review): the column is only converted with pd.to_datetime in a later
# cell, so this presumably compares raw strings; that matches chronological
# order only if every timestamp shares one sortable format -- confirm.
df['published_at'].min()

The latest article in the dataset was published at:

In [None]:
# Largest value in `published_at`.
# NOTE(review): the column is only converted with pd.to_datetime in a later
# cell, so this presumably compares raw strings; that matches chronological
# order only if every timestamp shares one sortable format -- confirm.
df['published_at'].max()

In [None]:
# Convert the publication timestamps to datetimes (stored back on the frame).
df['published_at'] = pd.to_datetime(df['published_at'])

# Histogram of when the articles were published.
plt.figure(figsize=(10, 7))
df['published_at'].hist()
plt.xticks(rotation=45)
plt.title('Distribution of time the articles were published at')
plt.show()

# Let's look at the continuous features.

In [None]:
# The four engagement counters treated as continuous features.
cont_features = [
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
    'engagement_comment_plugin_count',
]

# Summary statistics, rounded to two decimals, one row per feature.
summary = df[cont_features].describe()
summary.round(2).transpose()

We see that there are pretty extreme outliers here (e.g., the 75th percentile of `engagement_reaction_count` is 43, but the max is 354132). Furthermore, we see that most values in `engagement_comment_plugin_count` are zero. Let's check how many zero values there are.

In [None]:
# Boolean mask of the rows whose plugin comment count is exactly zero.
plugin_is_zero = df['engagement_comment_plugin_count'] == 0

# Count the rows on each side of the mask.
zerov = int(plugin_is_zero.sum())
nonzerov = int((~plugin_is_zero).sum())

print(f'Number of zero values in `engagement_comment_plugin_count`: {zerov}')
print(f'Number of non-zero values in `engagement_comment_plugin_count`: {nonzerov}')

We see that only $0.4\%$ of the values in `engagement_comment_plugin_count` are non-zero. Due to the extremely low variance, we will remove this column.

In [None]:
# Remove the almost-always-zero plugin comment counter from the frame in place.
df.drop(columns=['engagement_comment_plugin_count'], inplace=True)

Let's visualize the distributions of the remaining continuous features using histogram (to make graphs more readable, we will ignore all entries where values exceed 75th percentile)

In [None]:
# Engagement counters that remain after dropping the plugin comment count.
cont_features = [
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
]
WIDTH = 20   # figure width
LENGTH = 7   # figure height of ONE row of subplots

# Lay the histograms out on a grid that is three plots wide and as many rows
# tall as the feature list requires (previously `rows` was computed but the
# grid was hard-coded to a single row).
cols = 3
rows = math.ceil(len(cont_features) / cols)
fig, ax = plt.subplots(rows, cols, figsize=(WIDTH, LENGTH * rows))
ax = ax.flatten()

for i, feature in enumerate(cont_features):
    # Only values strictly below the feature's 75th percentile are drawn, so
    # the extreme right tail does not flatten the rest of the histogram.
    upper = df[feature].quantile(.75)
    ax[i].hist(df.loc[df[feature] < upper, feature], alpha=0.6)
    ax[i].set_title(f'Distribution of a feature `{feature}`')

# Hide grid cells left over when the feature count is not a multiple of `cols`.
for spare_axis in ax[len(cont_features):]:
    spare_axis.set_visible(False)

We see that even after we removed all entries with larger values, the tail still remains.

# Are continuous features correlated?

We would expect them to be, but let's check anyways.

In [None]:
cont_features = [
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
]

# Pairwise correlation matrix of the engagement counters.
df1 = df[cont_features]
corr = df1.corr()

# Annotated heatmap with the feature names on both axes.
axis_labels = df1.columns
plt.figure(figsize=(10, 7))
sns.heatmap(
    corr,
    annot=True,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.title('Correlation matrix of the continuous features')
plt.show()

As expected, we have a very strong positive correlation between features.

# Which newspapers have the most shared articles?

We define "one of the most shared articles" as an article whose share count exceeds the 75th percentile.

In [None]:
# Keep only the articles whose share count is above the 75th percentile.
share_cutoff = df['engagement_share_count'].quantile(.75)
is_most_shared = df['engagement_share_count'] > share_cutoff
df1 = df[is_most_shared]

# Count those articles per source, most frequent source first.
source_order = df1['source_name'].value_counts().index
plt.figure(figsize=(10, 8))
sns.countplot(x=df1['source_name'], order=source_order)
plt.xticks(rotation=45)
plt.title("Count of the most shared articles")
plt.show()

That's an interesting finding: if we consider ALL articles, CNN is in 4th place; yet if we consider only the most shared articles, CNN comes out on top.

# Which newspapers have the highest proportion of the most shared articles?

The proportion will be calculated as:

$$\frac{\text{Count of most shared articles published by }X}{\text{Count of all articles published by } X}$$

In [None]:
# Flag each article whose share count is above the dataset-wide 75th percentile.
df1 = df[['source_name', 'engagement_share_count']].copy()
share_cutoff = df1['engagement_share_count'].quantile(0.75)
df1['> 0.75'] = df1['engagement_share_count'] > share_cutoff

# Count articles per (source, flag) pair.
df1 = df1.groupby(['source_name', '> 0.75']).count()

# Express each count as a percentage of that source's total. The values are
# scaled to 0-100 so they agree with the 'percent' column name and the '(%)'
# axis label (they used to be fractions in [0, 1]). The count column is
# selected explicitly so a Series, not a one-column frame, is assigned.
df1['percent'] = (
    df1.groupby(level=0)['engagement_share_count']
       .transform(lambda x: (100 * x / x.sum()).round())
)
df1.reset_index(inplace=True)

# Keep only the "above the cutoff" rows; a source with no such article has no
# row here and therefore does not appear in the plot.
df1 = df1[df1['> 0.75']]

# Bars sorted from the highest to the lowest percentage.
bar_order = df1.sort_values(by='percent', ascending=False)['source_name']
plt.figure(figsize=(12, 8))
sns.barplot(x=df1['source_name'], y=df1['percent'], order=bar_order)
plt.xticks(rotation=45)
plt.title('Proportion of most shared articles')
plt.ylabel('Proportion (%)')
plt.show()

The result almost agrees with our previous graph, although there are some changes (e.g., BBC moving downwards and WS Journal moving up into the top 4).

# Which newspapers have the articles with the most user activity?

By "user activity" we mean the value in `engagement_reaction_count`

By "most" we mean that value must exceed 75th percentile.

In [None]:
# Keep only the articles whose reaction count is above the 75th percentile.
reaction_cutoff = df['engagement_reaction_count'].quantile(.75)
is_most_reacted = df['engagement_reaction_count'] > reaction_cutoff
df1 = df[is_most_reacted]

# Count those articles per source, most frequent source first.
source_order = df1['source_name'].value_counts().index
plt.figure(figsize=(10, 8))
sns.countplot(x=df1['source_name'], order=source_order)
plt.xticks(rotation=45)
plt.title("Count of the most reacted to articles")
plt.show()

The top 5 remains almost unchanged. Although it is worth noting that when we were considering newspapers with top share count, Reuters was in top 3, but when we consider `engagement_reaction_count`, Reuters drops to the very bottom.

# Are continuous variables correlated with the `source_name`?

We will use one-way ANOVA to test whether each continuous feature is independent of `source_name` (a categorical variable listing all the publishers), i.e. whether the feature's mean differs between publishers.

In [None]:
from scipy.stats import f_oneway

cont_features = [
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
]
label = 'source_name'

# One-way ANOVA per numeric feature: the samples passed to f_oneway are that
# feature's values split by the distinct values of `label`.
group_names = df[label].unique()
records = []
for feature in cont_features:
    samples = [df.loc[df[label] == name, feature].values for name in group_names]
    statistic, pval = f_oneway(*samples)
    records.append((label, feature, pval, pval < 0.05, statistic))

# Turn the per-feature records into the column-oriented result table.
headers = ['Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic']
dic = {header: list(column) for header, column in zip(headers, zip(*records))}

pd.DataFrame(dic)

It seems that `source_name` is indeed significantly associated with all numerical variables (their means differ between publishers), implying that the numerical variables may have significant predictive power (if we are to try to predict `source_name`).

# What are the most frequent words in our articles? (all articles)

In [None]:
from wordcloud import WordCloud, STOPWORDS

stopwords = set(STOPWORDS)

# Build one long lowercase string from every title: each title is cast to
# str, split on whitespace, lowercased token by token and followed by a single
# space. str.join replaces the index-based loop and the repeated `+=`
# concatenation while producing exactly the same text.
comment_words = "".join(
    " ".join(token.lower() for token in str(title).split()) + " "
    for title in df['title']
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Show the word cloud image without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

Word cloud gives us a pretty good idea of what most articles are about: politics. 

Now let's take a look at the word cloud of the MOST SHARED articles.

# What are the most frequent words in our articles? (top 25% most shared articles)

In [None]:
from wordcloud import WordCloud, STOPWORDS

stopwords = set(STOPWORDS)

# Restrict to the articles whose share count is above the 75th percentile.
# No copy is needed because the selection is only read from.
share_cutoff = df['engagement_share_count'].quantile(0.75)
df1 = df[df['engagement_share_count'] > share_cutoff]

# Build one long lowercase string from the selected titles: each title is cast
# to str, split on whitespace, lowercased token by token and followed by a
# single space. str.join replaces the index-based loop and the repeated `+=`
# concatenation while producing exactly the same text.
comment_words = "".join(
    " ".join(token.lower() for token in str(title).split()) + " "
    for title in df1['title']
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Show the word cloud image without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

The pattern is almost unchanged. What is interesting to note, though, is that words like "China", "Hong Kong" and "Brexit" are more frequent in the top 25% of articles (signified by the larger font size).

# Conclusions:

1. Most articles are about politics.
2. Distribution of the number of articles published by each newspaper is quite uneven. For example: the dataset contains over 1k articles published by Reuters, yet there are only $82$ articles published by ESPN.
3. Reuters, BBC news and ABC news have the biggest number of articles in the dataset. Yet the most shared articles are those of CNN, NY times and Reuters. Furthermore, the most reacted to articles are published by NY times, CNN and CBS news.
4. Distributions of the features `engagement_reaction_count`, `engagement_comment_count`,`engagement_share_count` have very long tails to the right, which implies that most articles have fairly low user activity (few comments, few shares etc.), but some articles are **very** popular (with views,shares exceeding tens of thousands).
5. `engagement_reaction_count`, `engagement_comment_count`,`engagement_share_count`  have strong positive correlation between each other, in other words: More comments implies more sharing, more shares implies more reactions (and vice versa).
6. Continuous features `engagement_reaction_count`, `engagement_comment_count`,`engagement_share_count` are not independent from the categorical variable `source_name` (which is just a variable listing the publishers of the articles). That means that if we are to try to predict the publisher of an article, the aforementioned numerical features may have significant predictive power.