In [60]:
import dotenv
from datasets import load_dataset,Dataset,ClassLabel
from datasets.features import Features,Value
import os
import pandas as pd
import numpy as np

In [2]:
dotenv.load_dotenv(os.path.join(os.path.dirname(os.curdir), '.env'))
data_path = os.getenv("NEWS_DATA_PATH")
columns = ["idx","article_idx","date","year","month","day","author","title","article","url","section","publication"]
features_dict = Features({i: Value("int16")  if i in ["idx","article_idx"] else Value("string") for i in columns})

# All News 2.7M Dataset

In [13]:
df = load_dataset("csv",data_files=[data_path],skiprows=[0,2_324_812],column_names=columns,features=features_dict)

Using custom data configuration default-d341552d477e22b4


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-d341552d477e22b4/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d341552d477e22b4/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
df.push_to_hub("rjac/all-the-news-2-1-Component-one",max_shard_size="250MB",private=False,token=os.getenv("AUTH_TOKEN"))

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/36 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/27 [00:00<?, ?it/s]

# Kaggle 

In [26]:
id2label = {
    0:'O',
    1:'B-PER', 
    2:'I-PER',
    3:'B-ORG',
    4:'I-ORG',
    5:'B-GEO',
    6:'I-GEO',
    7:'B-GPE',
    8:'I-GPE',
    9:'B-TIM',
    10:'I-TIM',
    11:'B-ART',
    12:'I-ART',
    13:'B-EVE',
    14:'I-EVE',
    15:'B-NAT'
    ,16:'I-NAT'
}

label2id = {i:k for k,i in id2label.items()}

In [58]:
labels_name = ['O','B-PER','I-PER','B-ORG','I-ORG','B-GEO','I-GEO','B-GPE','I-GPE','B-TIM','I-TIM','B-ART','I-ART','B-EVE','I-EVE','B-NAT','I-NAT']

In [47]:
data_path = '../data/'
ner = pd.read_csv(data_path + 'ner_dataset.zip', encoding="latin1")

In [48]:
ner["sentence_id"] = ner["Sentence #"].ffill().str.split(":").apply(lambda v: v[-1])
ner["tag"] = ner.Tag.str.upper()
ner["ner_tags"] = ner.tag.map(label2id)
ner = ner.rename({"Word":"tokens"},axis=1)

In [74]:
ner.shape

(1048575, 7)

In [50]:
aggregator = lambda x: list(x)

In [53]:
grouped_df = ner.groupby(["sentence_id"]).agg({"tokens":aggregator,"ner_tags":aggregator}).reset_index()

In [54]:
grouped_df.head(5)

Unnamed: 0,sentence_id,tokens,ner_tags
0,1,"[Thousands, of, demonstrators, have, marched, ...","[0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 5, 0, 0, ..."
1,10,"[Iranian, officials, say, they, expect, to, ge...","[7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,100,"[Helicopter, gunships, Saturday, pounded, mili...","[0, 0, 9, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 3, ..."
3,1000,"[They, left, after, a, tense, hour-long, stand...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[5, 0, 0, 1, 2, 0, 9, 0, 5, 0, 7, 0, 7, 0, 0, ..."


In [55]:
ner_dataset = Dataset.from_pandas(grouped_df)

In [62]:
ner_dataset.features["ner_tags"].feature = ClassLabel(num_classes=17, names=labels_name, id=None)

In [69]:
ner_feature = ner_dataset.features["ner_tags"]

In [70]:
ner_feature

Sequence(feature=ClassLabel(num_classes=17, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-GEO', 'I-GEO', 'B-GPE', 'I-GPE', 'B-TIM', 'I-TIM', 'B-ART', 'I-ART', 'B-EVE', 'I-EVE', 'B-NAT', 'I-NAT'], id=None), length=-1, id=None)

In [72]:
label_names = ner_feature.feature.names
#label_names

In [75]:
ner_dataset.push_to_hub("rjac/kaggle-entity-annotated-corpus-ner-dataset",max_shard_size="5MB",private=False,token=os.getenv("AUTH_TOKEN"))

Pushing dataset shards to the dataset hub:   0%|          | 0/4 [00:00<?, ?it/s]



In [79]:
grouped_df.apply(lambda x: len(x["tokens"]) - len(x["ner_tags"]),axis=1).sum()

0