In [1]:
import pickle
import pandas as pd
import json
from afinn import Afinn
import plotly.express as px

In [2]:
with open('pickled_df.pkl', 'rb') as f:
    news = pickle.load(f)
news = news.reset_index()
news.pop('index')
news = news[['author', 'site_name', 'title']]

In [3]:
news.head()

Unnamed: 0,author,site_name,title
0,"[""Liz Josaitis, Michigan State University Exte...",The Western Journal,"Health Scammers Prey on the Elderly, Here's Ho..."
1,"[""Aja Styles""]",Brisbane Times,'Pack Lego': Perth family caught in hard borde...
2,"[""Jake Johnson""]",Truthout,Congress Passes COVID Relief With Billions in ...
3,"[""Christine Favocci""]",The Western Journal,PA Man Facing Charges of Unlawful Voting After...
4,"[""Igor Derysh""]",Truthout,Fox News Forced to Debunk Its Own False Voting...


In [4]:
data = []
def convert_type(authors, site, title):
    try: 
        temp = [i for i in json.loads(authors) if i]
        if len(temp) > 1: 
            for author in temp[1:]:
                data.append([author, site, title])
        return temp[0] if temp else None
    except:
        return None

In [5]:
news['author'] = news.apply(lambda x: convert_type(x['author'], x['site_name'], x['title']), axis=1)

In [6]:
len(data)

4588

In [7]:
news = news.append(pd.DataFrame(data, columns=['author', 'site_name', 'title']), ignore_index=True)

In [8]:
news

Unnamed: 0,author,site_name,title
0,"Liz Josaitis, Michigan State University Extension",The Western Journal,"Health Scammers Prey on the Elderly, Here's Ho..."
1,Aja Styles,Brisbane Times,'Pack Lego': Perth family caught in hard borde...
2,Jake Johnson,Truthout,Congress Passes COVID Relief With Billions in ...
3,Christine Favocci,The Western Journal,PA Man Facing Charges of Unlawful Voting After...
4,Igor Derysh,Truthout,Fox News Forced to Debunk Its Own False Voting...
...,...,...,...
272293,Caitlin Mahy,WalesOnline,How to teach saving and spending to kids as yo...
272294,Victoria Jones,WalesOnline,How to teach saving and spending to kids as yo...
272295,Victoria Jones,WalesOnline,Space experiment could unlock resources for mi...
272296,Nisha Mal,WalesOnline,Woman's home is in Tier 2 while her garden fal...


In [9]:
news['article_count'] = news.groupby('author')['author'].transform('count')

In [10]:
news.author.value_counts()

Reuters                              11983
Reuters Staff                         7238
Associated                            5159
Vanguard                              3797
ABC News                              3287
                                     ...  
Pedro Fonseca, Rodrigo Viga Gaier        1
Sara Zeff Geber                          1
Martin Fletcher                          1
HAL CONTE, Lebanon Daily News            1
Mike Federle                             1
Name: author, Length: 32290, dtype: int64

In [11]:
news[news.author == 'Victoria Jones']

Unnamed: 0,author,site_name,title,article_count
3466,Victoria Jones,WalesOnline,Mum will give secondhand or homemade gifts wra...,92.0
11544,Victoria Jones,WalesOnline,Isolation could drive young people to extremis...,92.0
11548,Victoria Jones,WalesOnline,How to see Saturn and Jupiter align for first ...,92.0
12235,Victoria Jones,WalesOnline,Mum with incurable ovarian cancer prepares for...,92.0
14030,Victoria Jones,WalesOnline,The reason people get 'the fear' the morning a...,92.0
...,...,...,...,...
272113,Victoria Jones,WalesOnline,Young couple split up to spare her the pain of...,92.0
272146,Victoria Jones,WalesOnline,The full list of priority groups for the new C...,92.0
272294,Victoria Jones,WalesOnline,How to teach saving and spending to kids as yo...,92.0
272295,Victoria Jones,WalesOnline,Space experiment could unlock resources for mi...,92.0


In [12]:
news = news[news.article_count >= 10]

In [13]:
news

Unnamed: 0,author,site_name,title,article_count
1,Aja Styles,Brisbane Times,'Pack Lego': Perth family caught in hard borde...,14.0
2,Jake Johnson,Truthout,Congress Passes COVID Relief With Billions in ...,33.0
3,Christine Favocci,The Western Journal,PA Man Facing Charges of Unlawful Voting After...,19.0
8,William Rivers Pitt,Truthout,What Will Trump Attempt in His Last Days? We M...,14.0
9,Truthout,Truthout,Bernard J. Wolfson | Truthout,18.0
...,...,...,...,...
272292,Simon Bajkowski,Manchester Evening News,How Man City should line up vs Fulham in the P...,87.0
272294,Victoria Jones,WalesOnline,How to teach saving and spending to kids as yo...,92.0
272295,Victoria Jones,WalesOnline,Space experiment could unlock resources for mi...,92.0
272296,Nisha Mal,WalesOnline,Woman's home is in Tier 2 while her garden fal...,66.0


In [14]:
remove = {'', 'Reuters', 'Reuters Staff', 'Associated', 'Vanguard', 'ABC News', 
          'The Washington Times http://www.washingtontimes.com', 'BWW News Desk', 'Press Trust of India', 
          'Press Association', 'PTI', 'Capital Market', 'PA Media', 'The Korea Herald', 'SA Transcripts',
          'Deutsche Welle (www.dw.com)', 'Associated Press', 'Afp', 'India.com News Desk', 'The Associated Press',
          'Australian Associated', 'FE Online', 'www.ETAuto.com', 'www.ETTelecom.com', 'F_488', 'Staff Writer',
          'India.com Sports Desk', 'www.ETEnergyworld.com', 'IANS', 'TV News Desk', 'ANI Press Release', 'AP', 
          'www.ETHealthworld.com', 'WRAL', 'India.com Entertainment Desk', 'ANI', 'HT Auto Desk', 
          'WND News Services', 'Stage Tube', 'Staff report', 'The Economist', 'AFP', 'F_300926', 
          'www.ETBrandEquity.com', 'FE Bureau', 'The Christian Science Monitor', 'Flanders News', 'BS Web Team',
          'India.com Lifestyle Staff', 'VietNamNet News', 'India.com Lifestyle Staff', 'India.com Viral News Desk',
          'Dailymail.com', 'The Moscow Times', 'www.ETCIO.com', 'www.ETCIO.com', 'SI Reporter', 
          'From Staff Reports','BWW', 'Entrepreneur en Español', 'Trefis Team', 'azfamily.com News Staff', 
          'From staff reports', 'Compiled by Democrat-Gazette Staff From Wire Reports', 
          'Classifieds Arizona Daily Star', '', 'Daily Express newspaper', 'www.ETRealty.com', 'Daily', 'Livemint',
          'Kyodo', 'India.com Business Desk', 'The Editorial Board', 'F_300833', 'HNN Staff','F_126', 'F_127',
          'India.com Education Desk', 'F_300824', 'Staff', 'doctor.ndtv.com', 'Daily News Editorial Board',
          'Expert Panel®', 'Staff Report', 'Entrepreneur Store',  'BWW Staff', 'to the editor', 'Dawn.com', 
          'WND Staff', 'Arizona Daily Star',  'F_300832', 'TV Scoop', None, 'STLNEWS', 
          'India.com Hindi News Desk', 'news18', "The Newspaper's Staff Reporter", 
          'Business Standard Editorial Comment', 'Business Standard', 'THE ASSOCIATED PRESS',  'Associated Press staff',
          'THE EDITORIAL STAFF', 'The Value Portfolio', 'Breakingviews columnists', 'StatBot | STLhighschoolSPORTS.com',
          'Team YS', 'F_200745', 'F_300923', 'Special to The Commercial', 'Tarot Astrologers',
          'Voice of the People', 'TNW Deals', 'Guardian community team', 'Baltimore Sun staff', 'News18',
          'Mail', 'AP Reporters', 'Avisol Capital Partners', 'Agencies', 'News staff', 'CNBC.com staff',
          'BroadwayWorld TV', 'The Conversation', 'F_200798', ' birminghammail.co.uk', 'ASSOCIATED PRESS',
          'F_200805', "The Age's View", 'BOOX Research', 'Associated Press Staff', 'Fun Trading', 'The Gazette',
          'American Heart Association News', 'Record Reporter', 'The Investment Doctor', 'Roadshow staff',
          'the Times Union Editorial Board', 'Baltimore Sun Editorial Board', 'The Editors of\xa0GQ', 
          'The Roanoke Times', 'The Spectator', 'Editorial staff', 'Individual Trader', 'Univision', None,
          'Arkansas Democrat-Gazette', 'Mirror.co.uk', 'ASSOCIATED PRESS STAFF', 'DerWesten - derwesten.de',
          'The New York Times', 'Wolf Report', 'The Gazette editorial board', 'Real Vision', 
          'Sun Sentinel Editorial Board', 'Voice of the Mirror', '@adwait', 'Expert Panel', 'The Buy Area Staff',
          'Volatility Surfer', 'Special To The Arizona Daily Star', 'NFL Staff', 'CNN Staff',
          'PJ Brown Special to the Arizona Daily Star', 'Invesco US', 'Bloomberg News', 'Quora', 'CNN Wire',
          'IP Banking Research', 'Daile Cross', 'The Washington Post', 'Bears of Wall Street', 'Teen Vogue Staff',
          'ICICI Securities', 'WIRED Staff', 'This Is', 'Cinemanía', 'STAFF EDITORIAL', 'StackCommerce', 'Axios',
         'Lincoln Journal Star', 'The Sunday Times', 'IE Online', 'Bloomberg', 'Associated Press Reporter' ,'Behance',
         'The Financial Express', 'Firstpost', '20minutos', 'Daily Herald report', 'Staff reports', 'MarketWatch',
         'Times Staff', 'Trapping Value', 'Q.ai - Investing Reimagined', 'CNET staff', 'YEC', 'Catholic Online', 
         'Stone Fox Capital', 'PA Sport Staff', 'Entrepreneur Staff', 'Keystone-SDA/ts', 'Associated Press reporters',
         "The Herald's View", 'Autoblog Staff', 'finanzen.net GmbH', 'Rolling Stone', 'AP | PTI', 'Story by Reuters',
         'PA reporters', 'Sports Hotline', 'Pioneer Press Staff', 'PA', 'Ticked Off!', 'Gazette readers', 'ValueZen',
         'India Knight', 'RMC SPORT', 'Truthout', 'MailOnline', 'Variety Staff', 'News Service of Florida', 'From wire reports',
         'Elephant Analytics', 'Chowhound Staff', 'Quad 7 Capital', 'The Associated Press, Associated Press',
         'undefined undefined', 'Orlando Sentinel Editorial Board', 'Clarín.com', 'Staff and wire reports', 'IndiaToday.in',
         'East Kilbride News', 'Retirement Pot', 'QuandaryFX', 'Perthshire Advertiser', 'The Inquirer Editorial Board',
         'CondÃ© Nast', 'WRITE TO LETTERS@THETIMES.CO.UK', 'Entrepreneur Middle East Staff', 'Editorial', 'WWD Staff',
         'STATE JOURNAL WIRE SERVICES', 'Think Change India', 'Shock Exchange', 'News4Jax staff', 'CNN staff',
         'The A.V. Club', 'Journal Star editorial board', 'Record View', 'Renaissance Capital IPO Research'}
# atleast 10 articles

In [15]:
for i in range(len(news)):
    if not ''.join(filter(lambda character:ord(character) < 0x3000, news.iloc[i].author)):
        remove.add(news.iloc[i].author)

In [17]:
news = news[~news['author'].isin(remove)]

In [20]:
news

Unnamed: 0,author,site_name,title,article_count
1,Aja Styles,Brisbane Times,'Pack Lego': Perth family caught in hard borde...,14.0
2,Jake Johnson,Truthout,Congress Passes COVID Relief With Billions in ...,33.0
3,Christine Favocci,The Western Journal,PA Man Facing Charges of Unlawful Voting After...,19.0
8,William Rivers Pitt,Truthout,What Will Trump Attempt in His Last Days? We M...,14.0
11,Amy Goodman,Truthout,The Insufficient COVID Stimulus Must Not Be Fo...,19.0
...,...,...,...,...
272292,Simon Bajkowski,Manchester Evening News,How Man City should line up vs Fulham in the P...,87.0
272294,Victoria Jones,WalesOnline,How to teach saving and spending to kids as yo...,92.0
272295,Victoria Jones,WalesOnline,Space experiment could unlock resources for mi...,92.0
272296,Nisha Mal,WalesOnline,Woman's home is in Tier 2 while her garden fal...,66.0


In [21]:
afinn = Afinn(language='en')
def get_title_sentiment(text):
    return round(afinn.score(text) / len(text.split()) * 100, 2)

In [27]:
news['title_sentiment'] = news.apply(lambda x: get_title_sentiment(x['title']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
news.title_sentiment.describe()

count    121131.000000
mean         -0.787697
std          17.202567
min        -160.000000
25%          -9.090000
50%           0.000000
75%           6.670000
max         150.000000
Name: title_sentiment, dtype: float64

In [26]:
news.title_sentiment.value_counts()

 0.00      46997
-16.67      1525
-20.00      1519
-14.29      1501
 16.67      1454
           ...  
-24.24         1
 55.00         1
-160.00        1
-125.00        1
-2.94          1
Name: title_sentiment, Length: 411, dtype: int64

In [31]:
def get_top_journalists(source):
    temp = news[news.site_name == source]
    temp = temp.sort_values(by = ['article_count'], ascending = False)
    temp['article_count'] = temp.groupby('author')['author'].transform('count')
    temp = temp.author.value_counts().reset_index()
    temp = temp.rename(columns = {"author": "articles"})
    temp = temp.rename(columns = {"index": "author"})
    return temp

In [35]:
fig = px.bar(get_top_journalists('The Advocate'), x = 'author', y = 'articles')
fig.show()

In [36]:
def source_sentiments(source):
    temp = news[news.site_name == source]
    temp = temp.sort_values(by = ['title_sentiment'], ascending = False)
    temp = temp.groupby('author')['title_sentiment'].mean()
    temp = temp.reset_index()
    temp = temp.sort_values(by = ['title_sentiment'])
    return temp

In [38]:
fig = px.bar(source_sentiments('The Advocate'), x = 'author', y = 'title_sentiment')
fig.show()