In [1]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import collections
import itertools
import numpy as np
import datetime
import json
from dateutil import parser

tqdm.pandas()

In [2]:
def fix_date(date):
    json_acceptable_string = date.replace("'", "\"")
    d = json.loads(json_acceptable_string)
    return(parser.parse(d['$date']))

In [3]:
def person_in_headline(headline, persons):
    if headline:
        headline_dict = {}
        for i in persons:
            if i in headline:
                if i in headline_dict:
                    headline_dict[i] += 1
                else:
                    headline_dict[i] = 1
        return headline_dict
    else:
        return {}

In [4]:
def person_in_text(text, persons):
    text_dict = {}
    for person in persons:
        text_dict[person] = text.count(person)
    return text_dict

## Dhaka Tribune

In [3]:
df = pd.read_pickle('Data/Processed Data/Networks/DT-network.pkl')

In [4]:
df['entity0'] = df['entity0'].apply(lambda x: "Sheikh Hasina" if x == "Sheikh Hasina," else x)
df['entity1'] = df['entity1'].apply(lambda x: "Sheikh Hasina" if x == "Sheikh Hasina," else x)

In [5]:
G = nx.Graph()
network_type = 'person'
count = 0
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    if row['type0'] in network_type and row['type1'] in network_type:
        # print(row['entity0'], row['entity1'])
        count += 1
        G.add_edge(row['entity0'], row['entity1'])

print(count)

100%|██████████| 247369/247369 [00:16<00:00, 15179.76it/s]

24925





In [6]:
print(len(G.nodes))
print(len(G.edges))
print(nx.density(G))

10132
16867
0.00032863994112966957


In [7]:
degree_centrality_G = nx.degree_centrality(G)
sorted(degree_centrality_G.items(), key = lambda x:x[1], reverse= True)[:20]

[('Sheikh Hasina', 0.042740104629355444),
 ('Khaleda Zia', 0.03405389398874741),
 ('Ershad', 0.017471128220313888),
 ('Hossain', 0.011252591057151318),
 ('Rahman', 0.010759056361662226),
 ('Monirul', 0.010166814727075313),
 ('Tarique Rahman', 0.009475866153390584),
 ('Khokon', 0.009475866153390584),
 ('Nur Hossain', 0.009278452275194946),
 ('Babul', 0.008192675945118941),
 ('Ziaur Rahman', 0.007896555127825486),
 ('Kamal', 0.007797848188727668),
 ('Shamim', 0.007008192675945119),
 ('Latif', 0.007008192675945119),
 ('Mizanur Rahman', 0.0069094857368473),
 ('Masud', 0.0069094857368473),
 ('Hannan', 0.006613364919553845),
 ('Bangabandhu Sheikh Mujibur Rahman', 0.006514657980456026),
 ('Mamun', 0.006218537163162571),
 ('Kabir', 0.0060211232849669335)]

In [22]:
pr_G = nx.pagerank(G)
sorted(pr_G.items(), key = lambda x:x[1], reverse= True)[:20]

[('Sheikh Hasina', 0.00869905366606153),
 ('Khaleda Zia', 0.006527888517198587),
 ('Ershad', 0.0033630388467075292),
 ('Hossain', 0.002355549827711224),
 ('Rahman', 0.002235240033979961),
 ('Monirul', 0.002080998864662062),
 ('Khokon', 0.0020030257658740025),
 ('Nur Hossain', 0.001861842417190834),
 ('Babul', 0.0018555560844793634),
 ('Tarique Rahman', 0.0017629092408332318),
 ('Ziaur Rahman', 0.0015527961442470318),
 ('Kamal', 0.001528006434979456),
 ('Mizanur Rahman', 0.0014902985115982292),
 ('Hannan', 0.0013829568272596573),
 ('Masud', 0.0013667093545102417),
 ('Shamim', 0.0013531028789846643),
 ('Mamun', 0.001333563126841153),
 ('Latif', 0.0013118195046079358),
 ('Kabir', 0.0012747360163115657),
 ('Bangabandhu Sheikh Mujibur Rahman', 0.0012723706363395382)]

In [8]:
G.remove_edges_from(G.selfloop_edges())
sorted(nx.core_number(G).items(), key= lambda x:x[1], reverse=True)[:20]

[('Hasanul Haque', 9),
 ('Mahmudul Haque Munshi Gonojagoron Moncho', 9),
 ('Sangeeta Imam', 9),
 ('Sarker', 9),
 ('Muhammad Zafar Iqbal', 9),
 ('Ramendu Majumdar', 9),
 ('Syed Anwar Hossain', 9),
 ('Arefin Siddique', 9),
 ('Shahriar Kabir', 9),
 ('Nasiruddin Yusuf', 9),
 ('Kamal Hossain', 8),
 ('Tarique Rahman', 8),
 ('Shahidul', 8),
 ('Nasim', 8),
 ('Hossain', 8),
 ('Ziaur Rahman', 8),
 ('Matia Chowdhury', 8),
 ('Sajeeb Wazed Joy', 8),
 ('Nazrul Islam', 8),
 ('Shamim', 8)]

In [9]:
df = pd.read_pickle('Data/DT/DT.pkl')

In [10]:
count = collections.Counter([item for sublist in df['persons_unique'] for item in sublist])

In [11]:
count.most_common()[:20]

[('Sheikh Hasina', 4543),
 ('Khaleda Zia', 2616),
 ('Bangabandhu Sheikh Mujibur Rahman', 739),
 ('Abdul Hamid', 653),
 ('Mirza Fakhrul Islam Alamgir', 639),
 ('Mizanur Rahman', 551),
 ('Tarique Rahman', 477),
 ('Obaidul Quader', 433),
 ('Motiur Rahman Nizami', 395),
 ('Abul Kalam', 393),
 ('Ershad', 353),
 ('Abul Kalam Azad', 341),
 ('Ziaur Rahman', 334),
 ('Abul Maal Abdul Muhith', 334),
 ('Anwar Hossain', 304),
 ('Mahbubey Alam', 299),
 ('Habibur Rahman', 299),
 ('Nurul Islam Nahid', 288),
 ('Asaduzzaman Khan Kamal', 276),
 ('Shahidul Haque', 264)]

In [12]:
df = pd.read_pickle('Data/DT/DT.pkl')

In [16]:
df['person_in_headline'] = np.vectorize(person_in_headline)(df['news_headline'], df['persons_unique'])

In [17]:
df['person_in_text'] = np.vectorize(person_in_text)(df['news_text'], df['persons_unique'])

In [18]:
count_person_in_text = sum(map(collections.Counter, tqdm(df['person_in_text'].values.tolist())), collections.Counter())

100%|██████████| 49055/49055 [12:10<00:00, 67.11it/s]


In [19]:
count_person_in_headline = sum(map(collections.Counter, tqdm(df['person_in_headline'].values.tolist())), collections.Counter())

100%|██████████| 49055/49055 [00:22<00:00, 2134.80it/s]


In [20]:
df_text = pd.DataFrame.from_dict(count_person_in_text, orient='index')
df_headline = pd.DataFrame.from_dict(count_person_in_headline, orient='index')

In [23]:
pagerank = pd.DataFrame.from_dict(pr_G, orient='index')
pagerank = pagerank.reset_index()
pagerank = pagerank.sort_values(by='index')
pagerank.columns = ['name', 'pagerank']

degree_centrality = pd.DataFrame.from_dict(degree_centrality_G, orient='index')
degree_centrality = degree_centrality.reset_index()
degree_centrality = degree_centrality.sort_values(by='index')
degree_centrality.columns = ['name', 'degree_centrality']

G.remove_edges_from(G.selfloop_edges())
core = nx.core_number(G)
core_periphery = pd.DataFrame.from_dict(core, orient='index')
core_periphery = core_periphery.reset_index()
core_periphery = core_periphery.sort_values(by='index')
core_periphery.columns = ['name', 'core_periphery']

count = dict(count)
person_count = pd.DataFrame.from_dict(count, orient='index')
person_count = person_count.reset_index()
person_count = person_count.sort_values(by='index')
person_count.columns = ['name', 'count']

df_headline = df_headline.reset_index()
df_headline = df_headline.sort_values(by='index')
df_headline.columns = ['name', 'headline_count']

df_text = df_text.reset_index()
df_text = df_text.sort_values(by='index')
df_text.columns = ['name', 'text_count']

In [24]:
ml = pd.DataFrame()

ml = pd.merge(pagerank, core_periphery, on="name")
ml = pd.merge(ml, degree_centrality, on="name")
ml = pd.merge(ml, person_count, on="name", how="left")
ml = pd.merge(ml, df_headline, on="name", how="left")
ml = pd.merge(ml, df_text, on="name", how="left")
ml.fillna(0, inplace=True)

In [25]:
ml.drop(ml.index[0:9], inplace=True)
ml.reset_index(drop=True, inplace=True)
ml.drop(ml.index[-7:], inplace=True)

In [26]:
# ml.to_pickle('Data/Processed Data/ML/DT-ml.pkl')

## Daily Star

In [27]:
df = pd.read_pickle('Data/Processed Data/Networks/DS-network.pkl')

In [28]:
G = nx.Graph()

In [29]:
network_type = 'person'
count = 0
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    if row['type0'] in network_type and row['type1'] in network_type:
        # print(row['entity0'], row['entity1'])
        count += 1
        G.add_edge(row['entity0'], row['entity1'])
        
print(count)

100%|██████████| 914208/914208 [00:59<00:00, 15437.90it/s]

170661





In [30]:
print(len(G.nodes))
print(len(G.edges))

45684
123931


In [31]:
degree_centrality_G = nx.degree_centrality(G)
sorted(degree_centrality_G.items(), key = lambda x:x[1], reverse= True)[:20]

[('Khan', 0.013133988573429942),
 ('Barack Obama', 0.012980758706739925),
 ('Rahman', 0.012192719392334128),
 ('Ali', 0.009872381411028173),
 ('Ahmed', 0.009872381411028173),
 ('Clinton', 0.009390801830002408),
 ('Nazrul', 0.008953002210888076),
 ('Sheikh Hasina', 0.00886544228706521),
 ('Rab', 0.008734102401330912),
 ('Hossain', 0.008405752686995163),
 ('Bush', 0.00746448350589935),
 ('Singh', 0.007136133791563601),
 ('Gaddafi', 0.006829674058183569),
 ('Khaleda Zia', 0.006216754591423505),
 ('Hillary Clinton', 0.0061510846485563556),
 ('Musharraf', 0.005778954972309174),
 ('Assad', 0.005275485410327693),
 ('Chowdhury', 0.005209815467460543),
 ('Abbas', 0.005056585600770528),
 ('Manmohan Singh', 0.004947135695991945)]

In [32]:
pr_G = nx.pagerank(G)
sorted(pr_G.items(), key = lambda x:x[1], reverse= True)[:20]

[('Khan', 0.0020221291101969676),
 ('Barack Obama', 0.0018603863764148833),
 ('Rahman', 0.0016470698180094314),
 ('Ali', 0.00143925353838546),
 ('Ahmed', 0.0013892710589818492),
 ('Clinton', 0.001359444219986478),
 ('Hossain', 0.001194355049643284),
 ('Rab', 0.0011939810349725446),
 ('Gaddafi', 0.0011799060070162564),
 ('Sheikh Hasina', 0.0011410663011670868),
 ('Nazrul', 0.0010968293976068345),
 ('Bush', 0.0010825096553345776),
 ('Singh', 0.0010689226742943228),
 ('Musharraf', 0.0008674203692742141),
 ('Assad', 0.0008148937602715171),
 ('Hillary Clinton', 0.0008097963993054698),
 ('Khaleda Zia', 0.0007314108114951344),
 ('Chowdhury', 0.0007280814787891742),
 ('Abbas', 0.0007247929345799016),
 ('Kim', 0.0006925116346644017)]

In [33]:
G.remove_edges_from(G.selfloop_edges())
sorted(nx.core_number(G).items(), key= lambda x:x[1], reverse=True)[:20]

[('Shakti Chattapadhaya', 39),
 ('Shaheed Subedar Mailkiat Singh', 39),
 ('Mrinmoyee Bose', 39),
 ('Shukharanjan Sengupta', 39),
 ('Samar Ranjan Sen', 39),
 ('Maj Gen Lachhman Singh Lehl', 39),
 ('Waheeda', 39),
 ('Waheeda Rehman', 39),
 ('Ashok Ray', 39),
 ('Zainal Abedin', 39),
 ('Shalil Ghosh', 39),
 ('Rabindranath Chowdhury', 39),
 ('Swaroop Krishna Kaul', 39),
 ('Pranabranjan Ray', 39),
 ('Kishore Parekh', 39),
 ('Biswajit R Chatterjee', 39),
 ('Subhash Mukhopadhaya', 39),
 ('Maulana Syed Asad Madni', 39),
 ('Asghar Ali', 39),
 ('Aniruddha Ray', 39)]

In [34]:
df = pd.read_pickle('Data/DS/DS.pkl')

In [35]:
count = collections.Counter([item for sublist in df['ner_unique_person'] for item in sublist])

In [36]:
count.most_common()[:20]

[('Barack Obama', 4951),
 ('Sheikh Hasina', 3881),
 ('Rabindranath Tagore', 2764),
 ('Khaleda Zia', 2283),
 ('Rahman', 1949),
 ('Bangabandhu Sheikh Mujibur Rahman', 1933),
 ('Rab', 1456),
 ('Manmohan Singh', 1375),
 ('Hossain', 1316),
 ('Nazrul', 1310),
 ('Hillary Clinton', 1257),
 ('Ahmed', 1179),
 ('Ali', 1110),
 ('Mizanur Rahman', 979),
 ('Bashar al-Assad', 943),
 ('Narendra Modi', 942),
 ('Abul Kalam Azad', 909),
 ('Rafiqul Islam', 899),
 ('Khan', 885),
 ('Hamid Karzai', 858)]

In [37]:
df['person_in_headline'] = np.vectorize(person_in_headline)(df['title'], df['ner_unique_person'])
df['person_in_text'] = np.vectorize(person_in_text)(df['content'], df['ner_unique_person'])

In [38]:
count_person_in_headline = sum(map(collections.Counter, tqdm(df['person_in_headline'].values.tolist())), collections.Counter())

100%|██████████| 165236/165236 [08:55<00:00, 308.65it/s]


In [39]:
df_headline = pd.DataFrame.from_dict(count_person_in_headline, orient='index')

In [45]:
# count_person_in_text = sum(map(collections.Counter, tqdm(df['person_in_text'].values.tolist())), collections.Counter())
# df_text = pd.DataFrame.from_dict(count_person_in_text, orient='index')

In [40]:
df_text = pd.read_pickle('Data/Processed Data/Misc/DS-Text-Count.pkl')

In [41]:
pagerank = pd.DataFrame.from_dict(pr_G, orient='index')
pagerank = pagerank.reset_index()
pagerank = pagerank.sort_values(by='index')
pagerank.columns = ['name', 'pagerank']

degree_centrality = pd.DataFrame.from_dict(degree_centrality_G, orient='index')
degree_centrality = degree_centrality.reset_index()
degree_centrality = degree_centrality.sort_values(by='index')
degree_centrality.columns = ['name', 'degree_centrality']

G.remove_edges_from(G.selfloop_edges())
core = nx.core_number(G)

core_periphery = pd.DataFrame.from_dict(core, orient='index')
core_periphery = core_periphery.reset_index()
core_periphery = core_periphery.sort_values(by='index')
core_periphery.columns = ['name', 'core_periphery']

count = dict(count)
person_count = pd.DataFrame.from_dict(count, orient='index')
person_count = person_count.reset_index()
person_count = person_count.sort_values(by='index')
person_count.columns = ['name', 'count']

df_headline = df_headline.reset_index()
df_headline = df_headline.sort_values(by='index')
df_headline.columns = ['name', 'headline_count']

df_text = df_text.reset_index()
df_text = df_text.sort_values(by='index')
df_text.columns = ['name', 'text_count']

In [42]:
ml = pd.DataFrame()
ml = pd.merge(pagerank, core_periphery, on="name")
ml = pd.merge(ml, degree_centrality, on="name")
ml = pd.merge(ml, person_count, on="name", how="left")
ml = pd.merge(ml, df_headline, on="name", how="left")
ml = pd.merge(ml, df_text, on="name", how="left")
ml.fillna(0, inplace=True)

In [43]:
ml.drop(ml.index[0:86], inplace=True)
ml.reset_index(drop=True, inplace=True)
ml.drop(ml.index[-153:], inplace=True)

In [44]:
# ml.to_pickle('Data/Processed Data/ML/DS-ml.pkl')

## NA

In [46]:
df = pd.read_pickle('Data/Processed Data/Networks/NA-network.pkl')

In [47]:
G = nx.Graph()

In [48]:
network_type = 'person'
exclusions = ['Rakhine', 'Test', 'Dhaka', 'Bangla', 'Bangladesh', 'Narayanganj', 'AL', 'Gulshan', 'Dhanmondi', 'Bogra', 'Rongpur', 'Barisal', 'Rangpur', 'Comilla', 'Chelsea', 'Rajshahi', 'Mymensingh', 'Sadar', 'Sylhet', 'Cox', 'Twitter', 'Barca', 'Kolkata', 'Teknaf', 'Ukhiya', 'Bandarban', 'Tripura', 'Abahani']
count = 0
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    if row['type0'] in network_type and row['type1'] in network_type:
        if row['entity0'] not in exclusions and row['entity1'] not in exclusions:
            # print(row['entity0'], row['entity1'])
            count += 1
            G.add_edge(row['entity0'], row['entity1'])
        
print(count)

100%|██████████| 820506/820506 [00:50<00:00, 16353.14it/s]

79369





In [49]:
print(len(G.nodes))
print(len(G.edges))

23874
64591


In [50]:
degree_centrality_G = nx.degree_centrality(G)
sorted(degree_centrality_G.items(), key = lambda x:x[1], reverse= True)[:20]

[('Sheikh Hasina', 0.02530054873706698),
 ('Donald Trump', 0.02127926946759938),
 ('Khaleda Zia', 0.00959242659070917),
 ('Putin', 0.00908976668202572),
 ('Clinton', 0.00908976668202572),
 ('Rahman', 0.007875005236040716),
 ('Khan', 0.007707451933146233),
 ('Ali', 0.00758178695597537),
 ('Mustafizur Rahman', 0.006618355464332091),
 ('Quader', 0.006325137184266745),
 ('Tofail Ahmed', 0.006241360532819503),
 ('Suu Kyi', 0.006157583881372262),
 ('Abdul Hamid', 0.00607380722992502),
 ('Nasir', 0.005906253927030537),
 ('Barack Obama', 0.005696812298412433),
 ('Abdur Razzak', 0.0056549239726888115),
 ('Bush', 0.005529258995517949),
 ('Mujib', 0.005403594018347087),
 ('Mourinho', 0.0052779290411762245),
 ('Aung San Suu Kyi', 0.005194152389728983)]

In [51]:
pr_G = nx.pagerank(G)
sorted(pr_G.items(), key = lambda x:x[1], reverse= True)[:20]

[('Sheikh Hasina', 0.00370014149136523),
 ('Donald Trump', 0.003154837393952081),
 ('Clinton', 0.0013093035154407673),
 ('Putin', 0.0012997899963753153),
 ('Khaleda Zia', 0.0011680900178250376),
 ('Rahman', 0.0010655812015762437),
 ('Suu Kyi', 0.0010376351665081166),
 ('Khan', 0.0010131492923911812),
 ('Ali', 0.000987753025942132),
 ('Quader', 0.0008565130683169668),
 ('Barack Obama', 0.0008293584893295223),
 ('Aung San Suu Kyi', 0.0008197711494204998),
 ('Tofail Ahmed', 0.000715494818308283),
 ('Bush', 0.0007082834379158772),
 ('Kim', 0.0007014877986340839),
 ('Abdul Hamid', 0.0006913300533997925),
 ('Jamaat', 0.0006796141348578668),
 ('Md', 0.0006772518321823963),
 ('Vladimir Putin', 0.0006442993711699444),
 ('Oscar', 0.0006373263175412666)]

In [52]:
G.remove_edges_from(G.selfloop_edges())
sorted(nx.core_number(G).items(), key= lambda x:x[1], reverse=True)[:20]

[('Sarwar Jahan', 35),
 ('Uma Khan', 35),
 ('Mesbah Uddin Ahmed', 35),
 ('Mohiuddin Ahmed', 35),
 ('Gitosree Chowdhury', 35),
 ('Shafiur Rahman Dulu', 35),
 ('Abu Bakar Siddique', 35),
 ('Anu Islam', 35),
 ('Chittaranjan Bhuiyan', 35),
 ('Jahirul Haque', 35),
 ('Milon Bhattacharjee', 35),
 ('Firoj Chowdhury', 35),
 ('Sabuj Chakrabarty', 35),
 ('Sujit Roy', 35),
 ('Niranjan Adhikari', 35),
 ('Sajeda Khatun', 35),
 ('Md Nazrul Islam', 35),
 ('Mohammed Niaz Uddin', 35),
 ('M Mamun', 35),
 ('Manjusri Niyogi', 35)]

In [53]:
df = pd.read_pickle('Data/New Age/NewAge_ent.pkl')

In [54]:
count = collections.Counter([item for sublist in df['person_entities'] for item in sublist])

In [55]:
count.most_common()[:20]

[('Bangladesh', 3010),
 ('Sheikh Hasina', 2513),
 ('Donald Trump', 1871),
 ('Dhaka', 1516),
 ('Khaleda Zia', 995),
 ('Test', 982),
 ('Barack Obama', 668),
 ('Abul Maal Abdul Muhith', 646),
 ('Obaidul Quader', 622),
 ('Rakhine', 611),
 ('Bangabandhu Sheikh Mujibur Rahman', 570),
 ('Sylhet', 561),
 ('Abdul Hamid', 552),
 ('Mirza Fakhrul Islam Alamgir', 537),
 ('Narayanganj', 509),
 ('Narendra Modi', 495),
 ('AL', 433),
 ('Gulshan', 421),
 ('Sakib al Hasan', 413),
 ('Dhanmondi', 382)]

In [56]:
df['person_in_headline'] = np.vectorize(person_in_headline)(df['title'], df['person_entities'])
df['person_in_text'] = np.vectorize(person_in_text)(df['news_content'], df['person_entities'])

In [57]:
# count_person_in_text = sum(map(collections.Counter, tqdm(df['person_in_text'].values.tolist())), collections.Counter())
# df_text = pd.DataFrame.from_dict(count_person_in_text, orient='index')
df_text = pd.read_pickle('Data/Processed Data/Misc/NA-Text-Count.pkl')

In [58]:
count_person_in_headline = sum(map(collections.Counter, tqdm(df['person_in_headline'].values.tolist())), collections.Counter())
df_headline = pd.DataFrame.from_dict(count_person_in_headline, orient='index')

100%|██████████| 32893/32893 [00:31<00:00, 1031.75it/s]


In [59]:
pagerank = pd.DataFrame.from_dict(pr_G, orient='index')
pagerank = pagerank.reset_index()
pagerank = pagerank.sort_values(by='index')
pagerank.columns = ['name', 'pagerank']

degree_centrality = pd.DataFrame.from_dict(degree_centrality_G, orient='index')
degree_centrality = degree_centrality.reset_index()
degree_centrality = degree_centrality.sort_values(by='index')
degree_centrality.columns = ['name', 'degree_centrality']

G.remove_edges_from(G.selfloop_edges())
core = nx.core_number(G)

core_periphery = pd.DataFrame.from_dict(core, orient='index')
core_periphery = core_periphery.reset_index()
core_periphery = core_periphery.sort_values(by='index')
core_periphery.columns = ['name', 'core_periphery']

count = dict(count)
person_count = pd.DataFrame.from_dict(count, orient='index')
person_count = person_count.reset_index()
person_count = person_count.sort_values(by='index')
person_count.columns = ['name', 'count']

df_headline = df_headline.reset_index()
df_headline = df_headline.sort_values(by='index')
df_headline.columns = ['name', 'headline_count']

df_text = df_text.reset_index()
df_text = df_text.sort_values(by='index')
df_text.columns = ['name', 'text_count']

In [60]:
ml = pd.DataFrame()
ml = pd.merge(pagerank, core_periphery, on="name")
ml = pd.merge(ml, degree_centrality, on="name")
ml = pd.merge(ml, person_count, on="name", how="left")
ml = pd.merge(ml, df_headline, on="name", how="left")
ml = pd.merge(ml, df_text, on="name", how="left")
ml.fillna(0, inplace=True)

In [61]:
ml.drop(ml.index[0:59], inplace=True)
ml.reset_index(drop=True, inplace=True)
ml.drop(ml.index[-151:], inplace=True)

In [62]:
# ml.to_pickle('Data/Processed Data/ML/NA-ml.pkl')

# Making ML ready datasets in time intervals

## DT

In [4]:
df = pd.read_pickle('Data/Processed Data/Networks/DT-network.pkl')

df['timestamp'] = df['timestamp'].apply(lambda x: x['$date'])
df['timestamp'] = pd.to_datetime(df['timestamp'])

### 6, 3, 1 month interval (replaced cells using month because notebook was getting too long)

In [5]:
i = df['timestamp'].min()
max_date_counter = 0
while i < df['timestamp'].max() + datetime.timedelta(weeks=24):
    max_date_counter += 1
    i = i + datetime.timedelta(weeks=24)

print(max_date_counter)
time = []
time.append(df['timestamp'].min())
i = time[0]
for i in range(1,max_date_counter):
    time.append(time[i-1] + datetime.timedelta(weeks=24))

8


In [6]:
Graph_list = []

network_type = 'person'
count = 0
for i in range(len(time)-1):
    G = nx.Graph()
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        if row['timestamp'] > time[i] and row['timestamp'] <= time[i+1]:
            if row['type0'] in network_type and row['type1'] in network_type:
                count += 1
                G.add_edge(row['entity0'], row['entity1'])
    Graph_list.append(G)

print(count)

100%|██████████| 247369/247369 [00:17<00:00, 14542.14it/s]
100%|██████████| 247369/247369 [00:16<00:00, 14773.22it/s]
100%|██████████| 247369/247369 [00:16<00:00, 14690.48it/s]
100%|██████████| 247369/247369 [00:16<00:00, 15248.82it/s]
100%|██████████| 247369/247369 [00:16<00:00, 15353.58it/s]
100%|██████████| 247369/247369 [00:15<00:00, 16028.52it/s]
100%|██████████| 247369/247369 [00:15<00:00, 15973.25it/s]

24925





In [7]:
degree_centrality_list = ()
pagerank_list = ()
core_list = ()

for G in Graph_list:
    degree_centrality_G = nx.degree_centrality(G)
    pr_G = nx.pagerank(G)
    G.remove_edges_from(G.selfloop_edges())
    core = nx.core_number(G)
    
    degree_centrality_list += (degree_centrality_G,)
    pagerank_list += (pr_G,)
    core_list += (core,)

In [8]:
dt = pd.read_pickle('Data/DT/DT.pkl')
dt['news_publish_date'] = dt['news_publish_date'].apply(lambda x: x['$date'])
dt['news_publish_date'] = pd.to_datetime(dt['news_publish_date'])
dt = dt.set_index(dt['news_publish_date'])
dt = dt.sort_index()

In [9]:
dt_list = []
for i in range(len(time)-1):
    temp = dt[time[i]:time[i+1]]
    dt_list.append(temp)

In [10]:
count_list = [[], [], []] # List structure is: count, count_in_text, count_in_headline

for dt in dt_list:
    dt['person_in_headline'] = np.vectorize(person_in_headline)(dt['news_headline'], dt['persons_unique'])
    dt['person_in_text'] = np.vectorize(person_in_text)(dt['news_text'], dt['persons_unique'])
    count_person_in_text = sum(map(collections.Counter, tqdm(dt['person_in_text'].values.tolist())), collections.Counter())
    count_person_in_headline = sum(map(collections.Counter, tqdm(dt['person_in_headline'].values.tolist())), collections.Counter())
    count = collections.Counter([item for sublist in dt['persons_unique'] for item in sublist])
    
    count_list[0].append(count)
    count_list[1].append(count_person_in_headline)
    count_list[2].append(count_person_in_text)

100%|██████████| 5379/5379 [00:11<00:00, 485.61it/s]
100%|██████████| 5379/5379 [00:00<00:00, 11580.47it/s]
100%|██████████| 8855/8855 [00:25<00:00, 350.45it/s]
100%|██████████| 8855/8855 [00:00<00:00, 9158.09it/s]
100%|██████████| 8675/8675 [00:26<00:00, 332.33it/s]
100%|██████████| 8675/8675 [00:00<00:00, 9995.88it/s] 
100%|██████████| 8187/8187 [00:25<00:00, 317.48it/s]
100%|██████████| 8187/8187 [00:01<00:00, 7183.28it/s]
100%|██████████| 6137/6137 [00:14<00:00, 430.39it/s]
100%|██████████| 6137/6137 [00:00<00:00, 11486.43it/s]
100%|██████████| 5900/5900 [00:13<00:00, 446.12it/s]
100%|██████████| 5900/5900 [00:00<00:00, 9134.47it/s]
100%|██████████| 5922/5922 [00:13<00:00, 454.81it/s]
100%|██████████| 5922/5922 [00:00<00:00, 11054.63it/s]


In [11]:
for i in tqdm(range(len(dt_list))):
    pagerank = pd.DataFrame.from_dict(pagerank_list[i], orient='index')
    pagerank = pagerank.reset_index()
    pagerank = pagerank.sort_values(by='index')
    pagerank.columns = ['name', 'pagerank']
    
    degree_centrality = pd.DataFrame.from_dict(degree_centrality_list[i], orient='index')
    degree_centrality = degree_centrality.reset_index()
    degree_centrality = degree_centrality.sort_values(by='index')
    degree_centrality.columns = ['name', 'degree_centrality']
    
    core_periphery = pd.DataFrame.from_dict(core_list[i], orient='index')
    core_periphery = core_periphery.reset_index()
    core_periphery = core_periphery.sort_values(by='index')
    core_periphery.columns = ['name', 'core_periphery']
    
    count = dict(count_list[0][i])
    person_count = pd.DataFrame.from_dict(count, orient='index')
    person_count = person_count.reset_index()
    person_count = person_count.sort_values(by='index')
    person_count.columns = ['name', 'count']
    
    df_text = pd.DataFrame.from_dict(count_list[1][i], orient='index')
    df_text = df_text.reset_index()
    df_text = df_text.sort_values(by='index')
    df_text.columns = ['name', 'text_count']
    
    df_headline = pd.DataFrame.from_dict(count_list[2][i], orient='index')
    df_headline = df_headline.reset_index()
    df_headline = df_headline.sort_values(by='index')
    df_headline.columns = ['name', 'headline_count']
    
    ml = pd.DataFrame()
    ml = pd.merge(pagerank, core_periphery, on="name")
    ml = pd.merge(ml, degree_centrality, on="name")
    ml = pd.merge(ml, person_count, on="name", how="left")
    ml = pd.merge(ml, df_headline, on="name", how="left")
    ml = pd.merge(ml, df_text, on="name", how="left")
    
    # ml.to_pickle('Data/Processed Data/Intervals/DT/6 month/ml_'+str(i+1)+'.pkl')

100%|██████████| 7/7 [00:00<00:00, 19.77it/s]


## DS

In [5]:
ds = pd.read_pickle('Data/Processed Data/Networks/DS-network.pkl')
ds['timestamp'] = ds['timestamp'].progress_apply(fix_date)

100%|██████████| 914208/914208 [01:40<00:00, 9141.41it/s]


### 6, 3, 1 month interval (replaced cells using month because notebook was getting too long)

In [6]:
i = ds['timestamp'].min()
max_date_counter = 0
while i < ds['timestamp'].max() + datetime.timedelta(weeks=24):
    max_date_counter += 1
    i = i + datetime.timedelta(weeks=24)

print(max_date_counter)
time = []
time.append(ds['timestamp'].min())
i = time[0]
for i in range(1,max_date_counter):
    time.append(time[i-1] + datetime.timedelta(weeks=24))

23


In [7]:
ds = ds.set_index(ds['timestamp'])
ds = ds.sort_index()

In [8]:
ds_list = []
for i in range(len(time)-1):
    temp = ds[time[i]:time[i+1]]
    ds_list.append(temp)

In [9]:
Graph_list = []

network_type = 'person'
count = 0

for df in tqdm(ds_list):
    G = nx.Graph()
    for _, row in df.iterrows():
        if row['type0'] in network_type and row['type1'] in network_type:
            count += 1
            G.add_edge(row['entity0'], row['entity1'])
    Graph_list.append(G)

print(count)

100%|██████████| 22/22 [08:31<00:00, 23.25s/it]

171470





In [10]:
degree_centrality_list = ()
pagerank_list = ()
core_list = ()

for G in Graph_list:
    degree_centrality_G = nx.degree_centrality(G)
    pr_G = nx.pagerank(G)
    G.remove_edges_from(G.selfloop_edges())
    core = nx.core_number(G)
    
    degree_centrality_list += (degree_centrality_G,)
    pagerank_list += (pr_G,)
    core_list += (core,)

In [11]:
ds = pd.read_pickle('Data/DS/DS.pkl')
ds['date_published'] = ds['date_published'].apply(lambda x: x['$date'])
ds['date_published'] = pd.to_datetime(ds['date_published'])
ds = ds.set_index(ds['date_published'])
ds = ds.sort_index()

In [12]:
ds_list = []
for i in range(len(time)-1):
    temp = ds[time[i]:time[i+1]]
    ds_list.append(temp)

In [13]:
count_list = [[], [], []] # List structure is: count, count_in_text, count_in_headline

for ds in tqdm(ds_list):
    ds['person_in_headline'] = np.vectorize(person_in_headline)(ds['title'], ds['ner_unique_person'])
    count_person_in_text = sum(map(collections.Counter, ds['ner_person'].values.tolist()), collections.Counter())
    count_person_in_headline = sum(map(collections.Counter, ds['person_in_headline'].values.tolist()), collections.Counter())
    count = collections.Counter([item for sublist in ds['ner_unique_person'] for item in sublist])
    
    count_list[0].append(count)
    count_list[1].append(count_person_in_headline)
    count_list[2].append(count_person_in_text)

100%|██████████| 22/22 [21:30<00:00, 58.65s/it]


In [14]:
for i in tqdm(range(len(ds_list))):
    pagerank = pd.DataFrame.from_dict(pagerank_list[i], orient='index')
    pagerank = pagerank.reset_index()
    pagerank = pagerank.sort_values(by='index')
    pagerank.columns = ['name', 'pagerank']
    
    degree_centrality = pd.DataFrame.from_dict(degree_centrality_list[i], orient='index')
    degree_centrality = degree_centrality.reset_index()
    degree_centrality = degree_centrality.sort_values(by='index')
    degree_centrality.columns = ['name', 'degree_centrality']
    
    core_periphery = pd.DataFrame.from_dict(core_list[i], orient='index')
    core_periphery = core_periphery.reset_index()
    core_periphery = core_periphery.sort_values(by='index')
    core_periphery.columns = ['name', 'core_periphery']
    
    count = dict(count_list[0][i])
    person_count = pd.DataFrame.from_dict(count, orient='index')
    person_count = person_count.reset_index()
    person_count = person_count.sort_values(by='index')
    person_count.columns = ['name', 'count']
    
    df_text = pd.DataFrame.from_dict(count_list[1][i], orient='index')
    df_text = df_text.reset_index()
    df_text = df_text.sort_values(by='index')
    df_text.columns = ['name', 'text_count']
    
    df_headline = pd.DataFrame.from_dict(count_list[2][i], orient='index')
    df_headline = df_headline.reset_index()
    df_headline = df_headline.sort_values(by='index')
    df_headline.columns = ['name', 'headline_count']
    
    ml = pd.DataFrame()
    ml = pd.merge(pagerank, core_periphery, on="name")
    ml = pd.merge(ml, degree_centrality, on="name")
    ml = pd.merge(ml, person_count, on="name", how="left")
    ml = pd.merge(ml, df_headline, on="name", how="left")
    ml = pd.merge(ml, df_text, on="name", how="left")
    
    # ml.to_pickle('Data/Processed Data/Intervals/DS/6 month/ml_'+str(i+1)+'.pkl')

100%|██████████| 22/22 [00:02<00:00,  9.04it/s]


## NA

In [15]:
na = pd.read_pickle('Data/Processed Data/Networks/NA-network.pkl')

In [16]:
na['timestamp'] = pd.to_datetime(na['timestamp'])

### 6, 3, 1 month interval (replaced cells using month because notebook was getting too long)

In [17]:
i = na['timestamp'].min()
max_date_counter = 0
while i < na['timestamp'].max() + datetime.timedelta(weeks=24):
    max_date_counter += 1
    i = i + datetime.timedelta(weeks=24)

print(max_date_counter)
time = []
time.append(na['timestamp'].min())
i = time[0]
for i in range(1,max_date_counter):
    time.append(time[i-1] + datetime.timedelta(weeks=24))

5


In [18]:
na = na.set_index(na['timestamp'])
na = na.sort_index()

In [19]:
na_list = []
for i in range(len(time)-1):
    temp = na[time[i]:time[i+1]]
    na_list.append(temp)

In [20]:
Graph_list = []

network_type = 'person'
count = 0

for df in tqdm(na_list):
    G = nx.Graph()
    for _, row in df.iterrows():
        if row['type0'] in network_type and row['type1'] in network_type:
            count += 1
            G.add_edge(row['entity0'], row['entity1'])
    Graph_list.append(G)

print(count)

100%|██████████| 4/4 [00:51<00:00, 12.93s/it]

86532





In [21]:
degree_centrality_list = ()
pagerank_list = ()
core_list = ()

for G in Graph_list:
    degree_centrality_G = nx.degree_centrality(G)
    pr_G = nx.pagerank(G)
    G.remove_edges_from(G.selfloop_edges())
    core = nx.core_number(G)
    
    degree_centrality_list += (degree_centrality_G,)
    pagerank_list += (pr_G,)
    core_list += (core,)

In [22]:
na = pd.read_pickle('Data/New Age/NewAge_ent.pkl')
na['date_published'] = pd.to_datetime(na['date_published'])
na = na.set_index(na['date_published'])
na = na.sort_index()

In [23]:
na_list = []
for i in range(len(time)-1):
    temp = na[time[i]:time[i+1]]
    na_list.append(temp)

In [24]:
count_list = [[], [], []] # List structure is: count, count_in_text, count_in_headline

for na in tqdm(na_list):
    na['person_in_headline'] = np.vectorize(person_in_headline)(na['title'], na['person_entities'])
    na['person_in_text'] = np.vectorize(person_in_text)(na['news_content'], na['person_entities'])
    count_person_in_text = sum(map(collections.Counter, na['person_in_text'].values.tolist()), collections.Counter())
    count_person_in_headline = sum(map(collections.Counter, na['person_in_headline'].values.tolist()), collections.Counter())
    count = collections.Counter([item for sublist in na['person_entities'] for item in sublist])
    
    count_list[0].append(count)
    count_list[1].append(count_person_in_headline)
    count_list[2].append(count_person_in_text)

100%|██████████| 4/4 [05:34<00:00, 83.72s/it] 


In [26]:
for i in tqdm(range(len(na_list))):
    pagerank = pd.DataFrame.from_dict(pagerank_list[i], orient='index')
    pagerank = pagerank.reset_index()
    pagerank = pagerank.sort_values(by='index')
    pagerank.columns = ['name', 'pagerank']
    
    degree_centrality = pd.DataFrame.from_dict(degree_centrality_list[i], orient='index')
    degree_centrality = degree_centrality.reset_index()
    degree_centrality = degree_centrality.sort_values(by='index')
    degree_centrality.columns = ['name', 'degree_centrality']
    
    core_periphery = pd.DataFrame.from_dict(core_list[i], orient='index')
    core_periphery = core_periphery.reset_index()
    core_periphery = core_periphery.sort_values(by='index')
    core_periphery.columns = ['name', 'core_periphery']
    
    count = dict(count_list[0][i])
    person_count = pd.DataFrame.from_dict(count, orient='index')
    person_count = person_count.reset_index()
    person_count = person_count.sort_values(by='index')
    person_count.columns = ['name', 'count']
    
    df_text = pd.DataFrame.from_dict(count_list[1][i], orient='index')
    df_text = df_text.reset_index()
    df_text = df_text.sort_values(by='index')
    df_text.columns = ['name', 'text_count']
    
    df_headline = pd.DataFrame.from_dict(count_list[2][i], orient='index')
    df_headline = df_headline.reset_index()
    df_headline = df_headline.sort_values(by='index')
    df_headline.columns = ['name', 'headline_count']
    
    ml = pd.DataFrame()
    ml = pd.merge(pagerank, core_periphery, on="name")
    ml = pd.merge(ml, degree_centrality, on="name")
    ml = pd.merge(ml, person_count, on="name", how="left")
    ml = pd.merge(ml, df_headline, on="name", how="left")
    ml = pd.merge(ml, df_text, on="name", how="left")
    
    # ml.to_pickle('Data/Processed Data/Intervals/NA/6 month/ml_'+str(i+1)+'.pkl')

100%|██████████| 4/4 [00:00<00:00,  6.79it/s]
