In [1]:
import pandas as pd
from tqdm import tqdm
import re
import spacy
from tqdm import tqdm
from unidecode import unidecode
from dateutil import parser
from datetime import datetime
import collections
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load('en')

In [2]:
def name_disambiguation(name):
    if (name == "Khaleda" or name == "Zia" or name == "Begum Khaleda Zia"):
        return "Khaleda Zia"
    if (name == "Hasina" or name == "Sheikh"):
        return "Sheikh Hasina"
    if (name == "Fakhrul"):
        return "Mirza Fakhrul Islam Alamgir"
    if (name == "Muhith" or name == "AMA Muhith" or name == "MA Muhith"):
        return "Abul Maal Abdul Muhith"
    if (name == "Nizami" or name == "Motiur Rahman"):
        return "Motiur Rahman Nizami"
    if (name == "Modi"):
        return "Narendra Modi"
    if (name == "Bangabandhu" or name == "Sheikh Mujib" or name == "Sheikh Mujib" or name == "Bangabandhu Sheikh Mujibur" or name == "Sheikh Mujibur Rahman"):
        return "Bangabandhu Sheikh Mujibur Rahman"
    if (name == "Tarique"):
        return "Tarique Rahman"
    if (name == "Avijit"):
        return "Avijit Roy"
    if (name == "Mozena"):
        return "Dan Mozena"
    if (name == "Yunus" or name == "Mohammad Yunus"):
        return "Muhammad Yunus"

### Date limit: 2013-07-08 to 2016-06-14 on DT, Daily Star and Daily Sun

In [3]:
dt = pd.read_json('Data/DT/bd_news_dt.json')
dstar = pd.read_json('Data/DS/news_db.json', lines=True)
dsun = pd.read_pickle('Data/Daily Sun/DailySun_ent_1.pkl')

In [4]:
def conv_to_datetime(date_dict):
    new_datetime = parser.parse(list(date_dict.items())[0][1], ignoretz=True)
    new_datetime = new_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
    return new_datetime

In [5]:
dt_tags = list(dt['news_original_tags'])
dt_tags = [item for sublist in dt_tags for item in sublist]
print(set(dt_tags))

{'law and rights', 'agriculture', 'labour', 'crime', 'science', 'bangladesh', 'safety', 'development', 'foreign affairs', 'environment', 'education', 'politics'}


In [6]:
dsun_tags = set(list(dsun['section']))
print(dsun_tags)

{'/our-faith', '/world-print', '/business-print', '/back-page', '/my-districts', '/asia-print', '/news-link', '/culturetainment', '/editorial', '/front-page', '/winner', '/metropolis'}


In [7]:
dstar_tags = set(list(dstar['section']))
print(dstar_tags)

{'Arts & Entertainment', 'Strategic Issues', 'Opinion', 'City', 'Shift', 'Showbiz', 'Shout', 'The Star', 'Sports', 'Next Step', 'Country', 'Metropolitan', 'In Focus', 'Star Chittagong', 'World', 'Back Page', 'Lifestyle', 'National', 'Business', 'Editorial', 'Literature', 'Front Page', 'Book Reviews', 'Wide Angle', 'Health', 'Star Weekend', 'Bytes', 'Star City', 'Star People', 'Letters', 'Law & Our Rights'}


In [8]:
dt['news_publish_date'] = dt['news_publish_date'].apply(conv_to_datetime)

In [9]:
dstar['date_published'] = dstar['date_published'].apply(conv_to_datetime)

In [10]:
dsun['date_published'] = pd.to_datetime(dsun['date_published'])

In [11]:
start_date = datetime(2013, 7, 8)
end_date = datetime(2016, 6, 14)

In [12]:
dt_new = dt.loc[(dt['news_publish_date']>=start_date) & (dt['news_publish_date']<=end_date)]

In [13]:
dstar_new = dstar.loc[(dstar['date_published']>=start_date) & (dstar['date_published']<=end_date)]

In [14]:
dsun_new = dsun.loc[(dsun['date_published']>=start_date) & (dsun['date_published']<=end_date)]

In [15]:
print("dt: {}, dstar: {}, dsun{}".format(dt_new.shape, dstar.shape, dt.shape))

dt: (46611, 17), dstar: (165236, 34), dsun(49055, 17)


In [16]:
dt_new = dt_new.reset_index(drop=True)
dstar_new = dstar_new.reset_index(drop=True)
dsun_new = dsun_new.reset_index(drop=True)

## Count

In [17]:
dsun_counts = dsun_new.groupby('date_published').sum().agg({
    'location_entities': collections.Counter, 
    'organization_entities': collections.Counter,
    'person_entities': collections.Counter
})

In [18]:
# Sorts Counter object

dsun_counts['location_entities'] = dsun_counts['location_entities'].apply(lambda x: x.most_common())
dsun_counts['organization_entities'] = dsun_counts['organization_entities'].apply(lambda x: x.most_common())
dsun_counts['person_entities'] = dsun_counts['person_entities'].apply(lambda x: x.most_common())

In [19]:
# Daily Star and DT doesn't work the same way for some absurd reason.
# Trying on a small dataset

# df = pd.read_json('Data/DS/test.json', lines=True)
# df['date_published'] = df['date_published'].apply(conv_to_datetime)
# df_counts = df.groupby('date_published').sum().agg({
#     'ner_unique_location': collections.Counter, 
#     'ner_unique_organization': collections.Counter,
#     'ner_unique_person': collections.Counter
# })

In [20]:
dstar_counts = pd.DataFrame()

In [21]:
dstar_counts['location_entities'] = dstar_new.groupby('date_published')['ner_unique_location'].sum().apply(collections.Counter, 1)
dstar_counts['organization_entities'] = dstar_new.groupby('date_published')['ner_unique_organization'].sum().apply(collections.Counter, 1)
dstar_counts['person_entities'] = dstar_new.groupby('date_published')['ner_unique_person'].sum().apply(collections.Counter, 1)

In [22]:
# Sorts Counter object


dstar_counts['location_entities'] = dstar_counts['location_entities'].apply(lambda x: x.most_common())
dstar_counts['organization_entities'] = dstar_counts['organization_entities'].apply(lambda x: x.most_common())
dstar_counts['person_entities'] = dstar_counts['person_entities'].apply(lambda x: x.most_common())

In [23]:
# Splits the news_ner_tags column, which contains dictionaries, into different columns based on dictionary keys
dt_new_modified = pd.concat([dt_new.drop(['news_ner_tags'], axis=1), dt_new['news_ner_tags'].apply(pd.Series)], axis=1)

In [24]:
dt_counts = pd.DataFrame()

In [25]:
dt_counts['location_entities'] = dt_new_modified.groupby('news_publish_date')['locations_unique'].sum().apply(collections.Counter, 1)
dt_counts['organization_entities'] = dt_new_modified.groupby('news_publish_date')['organizations_unique'].sum().apply(collections.Counter, 1)
dt_counts['person_entities'] = dt_new_modified.groupby('news_publish_date')['persons_unique'].sum().apply(collections.Counter, 1)

In [26]:
# Sorts Counter object

dt_counts['location_entities'] = dt_counts['location_entities'].apply(lambda x: x.most_common())
dt_counts['organization_entities'] = dt_counts['organization_entities'].apply(lambda x: x.most_common())
dt_counts['person_entities'] = dt_counts['person_entities'].apply(lambda x: x.most_common())

In [27]:
dt_counts

Unnamed: 0_level_0,location_entities,organization_entities,person_entities
news_publish_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-07-08,"[(Bangladesh, 5), (Dhaka, 4), (Gazipur, 4), (G...","[(Awami League, 6), (BNP, 4), (GCC, 3), (MA Ma...","[(Ershad, 2), (Mannan, 2), (Azmat Ullah Khan, ..."
2013-07-09,"[(Dhaka, 12), (Bangladesh, 10), (Gazipur, 3), ...","[(Awami League, 5), (BNP, 4), (Gazipur City Co...","[(Ershad, 3), (Mizanur Rahman, 2), (Iftekharuz..."
2013-07-10,"[(Dhaka, 7), (Bangladesh, 6), (Gazipur, 3), (C...","[(Awami League, 5), (BNP, 3), (Dhaka Medical C...","[(Sheikh Hasina, 3), (Kalam, 2), (Habibur Rahm..."
2013-07-11,"[(Dhaka, 16), (Bangladesh, 10), (Chittagong, 5...","[(Dhaka Tribune, 10), (Awami League, 5), (FBCC...","[(Azad, 3), (Amir Hossain, 2), (Sheikh Hasina,..."
2013-07-12,"[(Dhaka, 9), (Bangladesh, 6), (Ramna, 2), (Gaz...","[(BNP, 4), (Dhaka University, 4), (Awami Leagu...","[(Shafi, 2), (Ershad, 2), (Ruhul Kabir, 2), (A..."
2013-07-13,"[(Dhaka, 15), (Bangladesh, 11), (Gazipur, 3), ...","[(BNP, 6), (Awami League, 5), (Awami League De...","[(Sheikh Hasina, 4), (Nuh-ul-Alam Lenin, 2), (..."
2013-07-14,"[(Dhaka, 5), (Bangladesh, 5), (Chittagong, 4),...","[(BNP, 3), (PSC, 3), (Bank Company Act-1991, 2...","[(Shafi, 2), (Akash Malik, 2), (Sheikh Hasina,..."
2013-07-15,"[(Dhaka, 11), (Bangladesh, 10), (Chittagong, 4...","[(Jamaat, 9), (BNP, 6), (ICT, 6), (Internation...","[(Ghulam Azam, 13), (Ghulam, 3), (Ganajagaran ..."
2013-07-16,"[(Bangladesh, 9), (Dhaka, 8), (Pakistan, 5), (...","[(Jamaat, 12), (ICT, 5), (Awami League, 4), (I...","[(Ghulam Azam, 13), (Fazle Kabir, 3), (Azam, 3..."
2013-07-17,"[(Bangladesh, 10), (Dhaka, 9), (Pakistan, 7), ...","[(Jamaat, 13), (BNP, 7), (Awami League, 6), (M...","[(Ghulam Azam, 6), (Ali Ahsan Mohammad Mojahee..."


## TF-IDF

In [28]:
tfidf = TfidfVectorizer()

In [29]:
dsun_tfidf = pd.DataFrame()
tqdm.pandas()

In [30]:
# dsun_new['location_entities'] =  dsun_new['location_entities'].progress_apply(lambda x: ' '.join(str(e) for e in x))

In [31]:
# dsun_tfidf['location_entities'] = dsun_new.groupby('date_published').progress_apply(tfidf.fit_transform(dsun_new['location_entities']))

In [32]:
df = pd.DataFrame({'Date': ['2018-09-01','2018-09-01 ','2018-09-02'],
               'documents': [['Dhaka', 'Chittagong', 'Sylhet'],['Dhaka', 'India'],['Dhaka']]})


df['documents'] = df['documents'].apply(lambda x: ' '.join(str(e) for e in x))

In [33]:
df['documents']

0    Dhaka Chittagong Sylhet
1                Dhaka India
2                      Dhaka
Name: documents, dtype: object

In [34]:
v = TfidfVectorizer()

df['documents'] = df.groupby('Date').apply(v.fit_transform(df['documents']))
# x = v.fit_transform(df['documents'])
# df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
# res = pd.concat([df, df1], axis=1)

TypeError: unhashable type: 'csr_matrix'

In [None]:
df.head()