In [1]:
import pandas as pd
from tqdm import tqdm
import re
import spacy
from tqdm import tqdm
from unidecode import unidecode
from dateutil import parser
from datetime import datetime
import collections
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load('en')

In [2]:
def name_disambiguation(name):
    if (name == "Khaleda" or name == "Zia" or name == "Begum Khaleda Zia"):
        return "Khaleda Zia"
    if (name == "Hasina" or name == "Sheikh"):
        return "Sheikh Hasina"
    if (name == "Fakhrul"):
        return "Mirza Fakhrul Islam Alamgir"
    if (name == "Muhith" or name == "AMA Muhith" or name == "MA Muhith"):
        return "Abul Maal Abdul Muhith"
    if (name == "Nizami" or name == "Motiur Rahman"):
        return "Motiur Rahman Nizami"
    if (name == "Modi"):
        return "Narendra Modi"
    if (name == "Bangabandhu" or name == "Sheikh Mujib" or name == "Sheikh Mujib" or name == "Bangabandhu Sheikh Mujibur" or name == "Sheikh Mujibur Rahman"):
        return "Bangabandhu Sheikh Mujibur Rahman"
    if (name == "Tarique"):
        return "Tarique Rahman"
    if (name == "Avijit"):
        return "Avijit Roy"
    if (name == "Mozena"):
        return "Dan Mozena"
    if (name == "Yunus" or name == "Mohammad Yunus"):
        return "Muhammad Yunus"

### Date limit: 2013-07-08 to 2016-06-14 on DT, New Age and Daily Sun

In [3]:
dt = pd.read_json('Data/DT/bd_news_dt.json')
dstar = pd.read_json('Data/DS/news_db.json', lines=True)
dsun = pd.read_pickle('Data/Daily Sun/DailySun_ent_1.pkl')

In [4]:
def conv_to_datetime(date_dict):
    new_datetime = parser.parse(list(date_dict.items())[0][1], ignoretz=True)
    new_datetime = new_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
    return new_datetime

In [5]:
dt['news_publish_date'] = dt['news_publish_date'].apply(conv_to_datetime)

In [6]:
dstar['date_published'] = dstar['date_published'].apply(conv_to_datetime)

In [7]:
dsun['date_published'] = pd.to_datetime(dsun['date_published'])

In [8]:
start_date = datetime(2013, 7, 8)
end_date = datetime(2016, 6, 14)

In [9]:
dt_new = dt.loc[(dt['news_publish_date']>=start_date) & (dt['news_publish_date']<=end_date)]

In [10]:
dstar_new = dstar.loc[(dstar['date_published']>=start_date) & (dstar['date_published']<=end_date)]

In [11]:
dsun_new = dsun.loc[(dsun['date_published']>=start_date) & (dsun['date_published']<=end_date)]

In [12]:
print("dt: {}, dstar: {}, dsun{}".format(dt_new.shape, dstar.shape, dt.shape))

dt: (46611, 17), dstar: (165236, 34), dsun(49055, 17)


In [13]:
dt_new = dt_new.reset_index(drop=True)
dstar_new = dstar_new.reset_index(drop=True)
dsun_new = dsun_new.reset_index(drop=True)

## Count

In [14]:
dsun_counts = dsun_new.groupby('date_published').sum().agg({
    'location_entities': collections.Counter, 
    'organization_entities': collections.Counter,
    'person_entities': collections.Counter
})

In [15]:
# Sorts Counter object

dsun_counts['location_entities'] = dsun_counts['location_entities'].apply(lambda x: x.most_common())
dsun_counts['organization_entities'] = dsun_counts['organization_entities'].apply(lambda x: x.most_common())
dsun_counts['person_entities'] = dsun_counts['person_entities'].apply(lambda x: x.most_common())

In [16]:
# Daily Star and DT doesn't work the same way for some absurd reason.
# Trying on a small dataset

# df = pd.read_json('Data/DS/test.json', lines=True)
# df['date_published'] = df['date_published'].apply(conv_to_datetime)
# df_counts = df.groupby('date_published').sum().agg({
#     'ner_unique_location': collections.Counter, 
#     'ner_unique_organization': collections.Counter,
#     'ner_unique_person': collections.Counter
# })

In [17]:
dstar_counts = pd.DataFrame()

In [18]:
dstar_counts['location_entities'] = dstar_new.groupby('date_published')['ner_unique_location'].sum().apply(collections.Counter, 1)
dstar_counts['organization_entities'] = dstar_new.groupby('date_published')['ner_unique_organization'].sum().apply(collections.Counter, 1)
dstar_counts['person_entities'] = dstar_new.groupby('date_published')['ner_unique_person'].sum().apply(collections.Counter, 1)

In [19]:
# Sorts Counter object


dstar_counts['location_entities'] = dstar_counts['location_entities'].apply(lambda x: x.most_common())
dstar_counts['organization_entities'] = dstar_counts['organization_entities'].apply(lambda x: x.most_common())
dstar_counts['person_entities'] = dstar_counts['person_entities'].apply(lambda x: x.most_common())

In [20]:
# Splits the news_ner_tags column, which contains dictionaries, into different columns based on dictionary keys
dt_new_modified = pd.concat([dt_new.drop(['news_ner_tags'], axis=1), dt_new['news_ner_tags'].apply(pd.Series)], axis=1)

In [21]:
dt_counts = pd.DataFrame()

In [22]:
dt_counts['location_entities'] = dt_new_modified.groupby('news_publish_date')['locations_unique'].sum().apply(collections.Counter, 1)
dt_counts['organization_entities'] = dt_new_modified.groupby('news_publish_date')['organizations_unique'].sum().apply(collections.Counter, 1)
dt_counts['person_entities'] = dt_new_modified.groupby('news_publish_date')['persons_unique'].sum().apply(collections.Counter, 1)

In [23]:
# Sorts Counter object

dt_counts['location_entities'] = dt_counts['location_entities'].apply(lambda x: x.most_common())
dt_counts['organization_entities'] = dt_counts['organization_entities'].apply(lambda x: x.most_common())
dt_counts['person_entities'] = dt_counts['person_entities'].apply(lambda x: x.most_common())

## TF-IDF

In [24]:
tfidf = TfidfVectorizer()

In [30]:
dsun_tfidf = pd.DataFrame()
tqdm.pandas()

In [31]:
dsun_new['location_entities'] =  dsun_new['location_entities'].progress_apply(lambda x: ' '.join(str(e) for e in x))

100%|██████████| 162060/162060 [00:00<00:00, 382815.38it/s]


In [32]:
dsun_new.head()

Unnamed: 0,_id,date_published,image,news_content,news_id,newspaper,reporter,section,url,location_entities,organization_entities,person_entities
0,{'$oid': '5af4592318ce422f881bcaf4'},2013-07-08,"{'src': None, 'caption': None}",ATN BANGLA 19:00 22:00 23:00 Channel i 7:00 ...,123864,dailysun,,/culturetainment,http://www.daily-sun.com/arcprint/details/1238...,,[],[]
1,{'$oid': '5af4592318ce422f881bcaf5'},2013-07-08,{'src': 'http://www.daily-sun.com/assets/news_...,Solo Art Exhibition By: Biren Shome At: Dhak...,123862,dailysun,,/culturetainment,http://www.daily-sun.com/arcprint/details/1238...,,[Dhaka Art Centre ],[Dhanmondi]
2,{'$oid': '5af4592418ce422f881bcaf7'},2013-07-08,{'src': 'http://www.daily-sun.com/assets/news_...,NTV 08:45 Bengali Film: Benam Badsha 13:30 ...,123861,dailysun,,/culturetainment,http://www.daily-sun.com/arcprint/details/1238...,Chaabi 14:00 Saraswatichandra Caribbean The ...,"[Van Helsing, The Three Musketeers, 13:05, NTV...","[21:30 Damages, Veera, 11:30 Veera, Yeh Rishta..."
3,{'$oid': '5af4592418ce422f881bcaf8'},2013-07-08,{'src': 'http://www.daily-sun.com/assets/news_...,RADIO TODAY Islamic Program 12:30 News 1:4...,123863,dailysun,,/culturetainment,http://www.daily-sun.com/arcprint/details/1238...,,"[ Islamic Program, VOA & RTD]",[Rupali Gaan]
4,{'$oid': '5af4592518ce422f881bcafa'},2013-07-08,{'src': 'http://www.daily-sun.com/assets/news_...,Victoria Beckham has revealed she struggles to...,123874,dailysun,,/culturetainment,http://www.daily-sun.com/arcprint/details/1238...,China Vogue,[],"[it\, Spice Girl - who, David Beckham -, Victo..."


In [33]:
dsun_tfidf['location_entities'] = dsun_new.groupby('date_published').progress_apply(tfidf.fit_transform(dsun_new['location_entities']))

  0%|          | 2/1074 [00:00<01:01, 17.44it/s]

TypeError: 'csr_matrix' object is not callable

In [34]:
df = pd.DataFrame({'Date': ['2018-09-01','2018-09-01 ','2018-09-02'],
               'documents': [['cats say, meow', 'dogs say woof', 'dogs chase cats'],['cats say meow', 'horses say neigh'],['lions go roar']]})


df['documents'] = df['documents'].apply(lambda x: ' '.join(str(e) for e in x))


In [35]:
v = TfidfVectorizer()

df['documents'] = df.groupby('Date').apply(v.fit_transform(df['documents']))
# x = v.fit_transform(df['documents'])
# df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
# res = pd.concat([df, df1], axis=1)

TypeError: unhashable type: 'csr_matrix'