In [1]:
## Summarizing Data by Authors


import pandas as pd
import plotly.express as px

# Load the dataset from 'main.csv'; the following cells check it for
# missing values and summarize it by author.
data = pd.read_csv('main.csv')



In [4]:
# Count the null entries in each column of the DataFrame and print the totals
missing_values = data.isna().sum()
print(missing_values)


Unnamed: 0         0
DATE               0
AUTHOR             0
TITLE              0
cleaned            0
sentiment          0
sentiment_score    0
emotion            0
emotion_score      0
ARTICLE            0
dtype: int64


In [3]:
# Preview the first five rows of the DataFrame as plain text
print(data.head(n=5))

   Unnamed: 0        DATE             AUTHOR  \
0      205074  2019-06-29  Field Level Media   
1      205655  2019-06-11  Field Level Media   
2      205885  2019-06-15  Field Level Media   
3      206081  2019-06-15  Field Level Media   
4      206131  2019-06-19  Field Level Media   

                                              TITLE  \
0               Sanchez, Nationals shut down Tigers   
1  Rays top A's as Morton's unbeaten streak hits 21   
2        Jimenez, White Sox crush Sabathia, Yankees   
3           Giants crack three homers, down Brewers   
4          A's crack six homers, obliterate Orioles   

                                        cleaned sentiment  sentiment_score  \
0                Sanchez , Nationals shut tiger   neutral         0.892794   
1             ray Morton unbeaten streak hit 21   neutral         0.626775   
2  Jimenez , White Sox crush Sabathia , Yankees   neutral         0.899590   
3                    giant crack homer , brewer   neutral         0.

Distribution of Publications by Author

In [4]:

# Number of articles per author, sorted from most to fewest
# (value_counts sorts in descending order by default).
author_distribution = data['AUTHOR'].value_counts()

# Reshape the counts into a two-column frame with explicit column names.
# Referring to columns by name means the axis and hover labels no longer
# depend on how plotly auto-names Index / array arguments.
author_counts = author_distribution.rename_axis('Author').reset_index(name='Count')

# Bar chart: one bar per author, bar height = number of articles
fig = px.bar(
    author_counts,
    x='Author',
    y='Count',
    title='Distribution of Publications by Author',
)

fig.show()

Analyzing Temporal Trends in Article Publications

In [5]:
## Analyzing Temporal Trends in Article Publications


import plotly.express as px

# Parse the 'DATE' column into pandas datetime values
data['DATE'] = pd.to_datetime(data['DATE'])

# One row per distinct DATE value, with the number of articles on that date
# stored in a 'counts' column
date_counts = data.groupby('DATE').size().rename('counts').reset_index()

# Line chart of article counts over time
fig = px.line(
    date_counts,
    x='DATE',
    y='counts',
    title='Temporal Trends of Articles',
    labels={'counts': 'Number of Articles'},
)
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Number of Articles',
)
fig.show()



In [None]:
# Print the DataFrame's plain-text representation to inspect its current contents
print(data)

Monthly Publication Trends by Author


In [6]:
## Monthly Publication Trends by Author

# Imports used by this cell
import pandas as pd
import plotly.graph_objects as go

# Re-read the dataset from disk and parse its 'DATE' column as datetimes
data = pd.read_csv('main.csv')
data['DATE'] = pd.to_datetime(data['DATE'])

# Tag every row with the monthly period its date falls in
data['MONTH'] = data['DATE'].dt.to_period('M')

# Pivot to a month x author table of article counts; month/author pairs
# with no articles are filled with 0
author_month_counts = (
    data.groupby(['MONTH', 'AUTHOR'])
    .size()
    .unstack(fill_value=0)
)

# The period index is converted to strings once and reused as the x values
# of every trace
month_labels = author_month_counts.index.astype(str)

# Add one bar trace per author column
fig = go.Figure()
for author, monthly_counts in author_month_counts.items():
    fig.add_trace(go.Bar(x=month_labels, y=monthly_counts, name=author))

fig.update_layout(
    barmode='stack',
    title='Number of Articles by Author (Monthly)',
    xaxis_title='Month',
    yaxis_title='Number of Articles',
    xaxis=dict(type='category'),  # treat the month strings as discrete categories
    legend_title='Author',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.show()

