In [1]:
import pandas as pd
from tqdm import tqdm
import re
import spacy
from tqdm import tqdm
from unidecode import unidecode
from dateutil import parser
from datetime import datetime
import collections
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load('en')

In [2]:
def name_disambiguation(name):
    if (name == "Khaleda" or name == "Zia" or name == "Begum Khaleda Zia"):
        return "Khaleda Zia"
    if (name == "Hasina" or name == "Sheikh"):
        return "Sheikh Hasina"
    if (name == "Fakhrul"):
        return "Mirza Fakhrul Islam Alamgir"
    if (name == "Muhith" or name == "AMA Muhith" or name == "MA Muhith"):
        return "Abul Maal Abdul Muhith"
    if (name == "Nizami" or name == "Motiur Rahman"):
        return "Motiur Rahman Nizami"
    if (name == "Modi"):
        return "Narendra Modi"
    if (name == "Bangabandhu" or name == "Sheikh Mujib" or name == "Sheikh Mujib" or name == "Bangabandhu Sheikh Mujibur" or name == "Sheikh Mujibur Rahman"):
        return "Bangabandhu Sheikh Mujibur Rahman"
    if (name == "Tarique"):
        return "Tarique Rahman"
    if (name == "Avijit"):
        return "Avijit Roy"
    if (name == "Mozena"):
        return "Dan Mozena"
    if (name == "Yunus" or name == "Mohammad Yunus"):
        return "Muhammad Yunus"

### Date limit: 2013-07-08 to 2016-06-14 on DT, Daily Star and Daily Sun

In [3]:
dt = pd.read_json('Data/DT/bd_news_dt.json')
dstar = pd.read_json('Data/DS/news_db.json', lines=True)
dsun = pd.read_pickle('Data/Daily Sun/DailySun_ent_1.pkl')

In [4]:
def conv_to_datetime(date_dict):
    new_datetime = parser.parse(list(date_dict.items())[0][1], ignoretz=True)
    new_datetime = new_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
    return new_datetime

In [5]:
dt_tags = list(dt['news_original_tags'])
dt_tags = [item for sublist in dt_tags for item in sublist]
print(set(dt_tags))

{'bangladesh', 'politics', 'law and rights', 'safety', 'labour', 'development', 'education', 'environment', 'foreign affairs', 'crime', 'agriculture', 'science'}


In [6]:
dsun_tags = set(list(dsun['section']))
print(dsun_tags)

{'/back-page', '/my-districts', '/winner', '/culturetainment', '/metropolis', '/our-faith', '/news-link', '/asia-print', '/editorial', '/business-print', '/world-print', '/front-page'}


In [7]:
dstar_tags = set(list(dstar['section']))
print(dstar_tags)

{'Star Chittagong', 'National', 'World', 'Opinion', 'Showbiz', 'Letters', 'The Star', 'Lifestyle', 'Health', 'Sports', 'Bytes', 'Star Weekend', 'Next Step', 'In Focus', 'Wide Angle', 'Editorial', 'Law & Our Rights', 'Arts & Entertainment', 'Metropolitan', 'Shout', 'Shift', 'Star City', 'Back Page', 'Front Page', 'Strategic Issues', 'Country', 'Star People', 'Business', 'City', 'Literature', 'Book Reviews'}


In [8]:
dt['news_publish_date'] = dt['news_publish_date'].apply(conv_to_datetime)

In [9]:
dstar['date_published'] = dstar['date_published'].apply(conv_to_datetime)

In [10]:
dsun['date_published'] = pd.to_datetime(dsun['date_published'])

In [11]:
start_date = datetime(2013, 7, 8)
end_date = datetime(2016, 6, 14)

In [12]:
dt_new = dt.loc[(dt['news_publish_date']>=start_date) & (dt['news_publish_date']<=end_date)]

In [13]:
dstar_new = dstar.loc[(dstar['date_published']>=start_date) & (dstar['date_published']<=end_date)]

In [14]:
dsun_new = dsun.loc[(dsun['date_published']>=start_date) & (dsun['date_published']<=end_date)]

In [15]:
print("dt: {}, dstar: {}, dsun{}".format(dt_new.shape, dstar.shape, dt.shape))

dt: (46611, 17), dstar: (165236, 34), dsun(49055, 17)


In [16]:
dt_new = dt_new.reset_index(drop=True)
dstar_new = dstar_new.reset_index(drop=True)
dsun_new = dsun_new.reset_index(drop=True)

## Count

In [27]:
dsun_counts = dsun_new.groupby('date_published').sum().agg({
    'location_entities': collections.Counter, 
    'organization_entities': collections.Counter,
    'person_entities': collections.Counter
})

In [28]:
# Sorts Counter object

dsun_counts['location_entities'] = dsun_counts['location_entities'].apply(lambda x: x.most_common())
dsun_counts['organization_entities'] = dsun_counts['organization_entities'].apply(lambda x: x.most_common())
dsun_counts['person_entities'] = dsun_counts['person_entities'].apply(lambda x: x.most_common())

In [29]:
# Daily Star and DT doesn't work the same way for some absurd reason.
# Trying on a small dataset

# df = pd.read_json('Data/DS/test.json', lines=True)
# df['date_published'] = df['date_published'].apply(conv_to_datetime)
# df_counts = df.groupby('date_published').sum().agg({
#     'ner_unique_location': collections.Counter, 
#     'ner_unique_organization': collections.Counter,
#     'ner_unique_person': collections.Counter
# })

In [30]:
dstar_counts = pd.DataFrame()

In [31]:
dstar_counts['location_entities'] = dstar_new.groupby('date_published')['ner_unique_location'].sum().apply(collections.Counter, 1)
dstar_counts['organization_entities'] = dstar_new.groupby('date_published')['ner_unique_organization'].sum().apply(collections.Counter, 1)
dstar_counts['person_entities'] = dstar_new.groupby('date_published')['ner_unique_person'].sum().apply(collections.Counter, 1)

In [32]:
# Sorts Counter object


dstar_counts['location_entities'] = dstar_counts['location_entities'].apply(lambda x: x.most_common())
dstar_counts['organization_entities'] = dstar_counts['organization_entities'].apply(lambda x: x.most_common())
dstar_counts['person_entities'] = dstar_counts['person_entities'].apply(lambda x: x.most_common())

In [33]:
# Splits the news_ner_tags column, which contains dictionaries, into different columns based on dictionary keys
dt_new_modified = pd.concat([dt_new.drop(['news_ner_tags'], axis=1), dt_new['news_ner_tags'].apply(pd.Series)], axis=1)

In [34]:
dt_counts = pd.DataFrame()

In [35]:
dt_counts['location_entities'] = dt_new_modified.groupby('news_publish_date')['locations_unique'].sum().apply(collections.Counter, 1)
dt_counts['organization_entities'] = dt_new_modified.groupby('news_publish_date')['organizations_unique'].sum().apply(collections.Counter, 1)
dt_counts['person_entities'] = dt_new_modified.groupby('news_publish_date')['persons_unique'].sum().apply(collections.Counter, 1)

In [26]:
# Sorts Counter object

# dt_counts['location_entities'] = dt_counts['location_entities'].apply(lambda x: x.most_common())
# dt_counts['organization_entities'] = dt_counts['organization_entities'].apply(lambda x: x.most_common())
# dt_counts['person_entities'] = dt_counts['person_entities'].apply(lambda x: x.most_common())

In [39]:
# dt_counts.to_json('Data/DT/dt_counts.json')
# dstar_counts.to_json('Data/DS/dstar_counts.json')
# dsun_counts.to_json('Data/Daily Sun/dsun_counts.json')

## TF-IDF

In [28]:
tfidf = TfidfVectorizer()

In [29]:
dsun_tfidf = pd.DataFrame()
tqdm.pandas()

In [30]:
# dsun_new['location_entities'] =  dsun_new['location_entities'].progress_apply(lambda x: ' '.join(str(e) for e in x))

In [31]:
# dsun_tfidf['location_entities'] = dsun_new.groupby('date_published').progress_apply(tfidf.fit_transform(dsun_new['location_entities']))

In [32]:
df = pd.DataFrame({'Date': ['2018-09-01','2018-09-01 ','2018-09-02'],
               'documents': [['Dhaka', 'Chittagong', 'Sylhet'],['Dhaka', 'India'],['Dhaka']]})


df['documents'] = df['documents'].apply(lambda x: ' '.join(str(e) for e in x))

In [33]:
df['documents']

0    Dhaka Chittagong Sylhet
1                Dhaka India
2                      Dhaka
Name: documents, dtype: object

In [34]:
v = TfidfVectorizer()

df['documents'] = df.groupby('Date').apply(v.fit_transform(df['documents']))
# x = v.fit_transform(df['documents'])
# df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
# res = pd.concat([df, df1], axis=1)

TypeError: unhashable type: 'csr_matrix'

In [None]:
df.head()