In [1]:
import os
import csv
import json_lines
from collections import Counter

In [2]:
import re
import spacy
import random
import json_lines
import json

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
# Select the "ggplot" style sheet for matplotlib plots.
plt.style.use("ggplot")

In [4]:
# Location of the raw sentence-classification data: the corpus directory first,
# then the full path of the annotation file inside it.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
input_data_dir = '/home/aorus/workspaces/simge/corpus/Sentence/Newindianexpress/'
all_data_path = os.path.join(
    input_data_dir,
    '20181001-newindianexpress_sentence_classification_adjudicated_20181218.json',
)

In [7]:
# Read every record of the JSON Lines file into a list, keeping file order.
all_data = []
with json_lines.open(all_data_path) as records:
    all_data.extend(records)

print('Number of total sentences:', len(all_data))

Number of total sentences: 8337


In [6]:
# Group the cleaned sentences by document:
#   index_dict[doc_num][sent_num] -> {'url', 'sent_num', 'sentence', 'label'}
index_dict = {}
for row in all_data:

    # row['sent_num'] is split on '-' into exactly two parts: document number
    # and sentence number within that document.
    doc_num, sent_num = row['sent_num'].split('-')

    # Strip URLs from the sentence text. The patterns are raw strings so that
    # \S and \s reach the regex engine without Python's invalid-escape-sequence
    # warning; the patterns themselves are unchanged.
    # 1) an 'http...' token together with the single whitespace character after it
    sent_text = re.sub(r"http\S*\s", "", row['sentence'])
    # 2) any remaining 'http...' token, including optional 'url :' / ':' prefixes
    sent_text = re.sub(r"((url)*\s*:\s*)*http\S*", "", sent_text)

    # Sentences that are empty once the URLs are gone are dropped entirely.
    if sent_text != "":

        sent = {'url': row['url'],
                'sent_num': row['sent_num'],
                'sentence': sent_text,
                'label': int(row['label'])}

        # Create the per-document dict on first use, then store the sentence.
        index_dict.setdefault(doc_num, {})[sent_num] = sent

print('Number of articles:', len(index_dict.keys()))

Number of articles: 587


In [7]:
# Turn the per-document index into a list of article records:
#   {'id': 'a.<doc_num>', 'url': ..., 'sentences': [{'sent_num', 'sentence', 'label'}, ...]}
articles = []

for doc_num, doc_sentences in index_dict.items():

    # The article url is taken from sentence '1' as before. That sentence can be
    # missing (e.g. it was dropped because it was empty after URL removal), so
    # fall back to the first stored sentence instead of raising KeyError.
    # Every entry of index_dict holds at least one sentence, so next() is safe.
    if '1' in doc_sentences:
        url_source = doc_sentences['1']
    else:
        url_source = next(iter(doc_sentences.values()))

    article = {'id': 'a.' + doc_num,
               'url': url_source['url'],
               # Keep the sentences in stored order, without the per-sentence url.
               'sentences': [{'sent_num': entry['sent_num'],
                              'sentence': entry['sentence'],
                              'label': entry['label']}
                             for entry in doc_sentences.values()]}

    articles.append(article)

print('Number of articles:', len(articles))

Number of articles: 587


In [8]:
# Attach to every article the distinct sentence labels it contains.
for art in articles:
    labels_seen = set()
    for sentence in art['sentences']:
        labels_seen.add(sentence['label'])
    art['label_set'] = list(labels_seen)

In [9]:
# Count the articles that contain label 2 but no label 1.
i = sum(1 for art in articles
        if 2 in art['label_set'] and 1 not in art['label_set'])
print('Articles with only {0,2}:', i)

Articles with only {0,2}: 43


In [10]:
# i = number of articles WITHOUT label 1; the complement is what gets reported.
i = sum(1 for art in articles if 1 not in art['label_set'])
print('Articles with at least 1 in label set:', len(articles) - i)

Articles with at least 1 in label set: 315


In [11]:
# Keep only the articles that contain at least one sentence labelled 1.
# This rebinds `articles`; the unfiltered list is not kept.
articles = list(filter(lambda art: 1 in art['label_set'], articles))
print('Number of articles:', len(articles))

Number of articles: 315


In [35]:
# Shuffle the article order in place; the train/valid/test slices are cut from
# this order, so the shuffle decides which article lands in which split.
# shuffle_seed = None keeps the previous behaviour (a different order on every
# run); set it to an int to make the split reproducible.
shuffle_seed = None
if shuffle_seed is not None:
    random.seed(shuffle_seed)
random.shuffle(articles)

In [36]:
# Split rates
test = 32
valid = 32
train = len(articles) - test - valid
print('Train\t:', train,
      '\nValid\t:', valid,
      '\nTest\t:', test)

Train	: 251 
Valid	: 32 
Test	: 32


In [37]:
# Cut the article list into three consecutive slices: train, valid, test.
thesis_train_data, thesis_valid_data, thesis_test_data = (
    articles[:train],
    articles[train:train + valid],
    articles[train + valid:],
)

print('Number of articles;',
      '\nTrain\t:', len(thesis_train_data),
      '\nValid\t:', len(thesis_valid_data),
      '\nTest\t:', len(thesis_test_data))

Number of articles; 
Train	: 251 
Valid	: 32 
Test	: 32


In [38]:
# Flatten each split into one list of sentence labels (article order, then
# sentence order within the article).
thesis_train_label_list, thesis_valid_label_list, thesis_test_label_list = [], [], []
for labels, split in ((thesis_train_label_list, thesis_train_data),
                      (thesis_valid_label_list, thesis_valid_data),
                      (thesis_test_label_list, thesis_test_data)):
    for art in split:
        labels.extend(sentence['label'] for sentence in art['sentences'])

print('Number of sentences;',
      '\nTrain\t:', len(thesis_train_label_list),
      '\nValid\t:', len(thesis_valid_label_list),
      '\nTest\t:', len(thesis_test_label_list))

Number of sentences; 
Train	: 3582 
Valid	: 399 
Test	: 441


In [39]:
# Per-split label frequencies (train, valid, test).
tr_c, va_c, te_c = map(Counter, (thesis_train_label_list,
                                 thesis_valid_label_list,
                                 thesis_test_label_list))

In [40]:
# Report, for each label value, how many sentences carry it in every split.
print('Sentence label distributions')

for label in (0, 1, 2):
    n_train, n_valid, n_test = tr_c[label], va_c[label], te_c[label]
    print('\n-- Label: ', label, '--',
          '\nTrain\t:', n_train,
          '\nValid\t:', n_valid,
          '\nTest\t:', n_test)

Sentence label distributions

-- Label:  0 -- 
Train	: 2472 
Valid	: 268 
Test	: 320

-- Label:  1 -- 
Train	: 1053 
Valid	: 130 
Test	: 116

-- Label:  2 -- 
Train	: 57 
Valid	: 1 
Test	: 5


In [48]:
# Bundle the three splits together with descriptive metadata.
# NOTE: this rebinds `all_data`, which until here held the list of raw records.
all_data = {
    'metadata': {
        'raw_data_path': all_data_path,
        # NOTE(review): hard-coded percentages, not derived from the split sizes.
        'split_ratio': {'train': 80, 'valid': 10, 'test': 10},
        'article_nums': {
            'train': len(thesis_train_data),
            'valid': len(thesis_valid_data),
            'test': len(thesis_test_data),
        },
        'sentence_nums': {
            'train': len(thesis_train_label_list),
            'valid': len(thesis_valid_label_list),
            'test': len(thesis_test_label_list),
        },
        # Sentence counts per label value (0, 1, 2) in each split.
        'label_nums': {
            'train': {label: tr_c[label] for label in (0, 1, 2)},
            'valid': {label: va_c[label] for label in (0, 1, 2)},
            'test': {label: te_c[label] for label in (0, 1, 2)},
        },
    },
    'data': {
        'train': thesis_train_data,
        'valid': thesis_valid_data,
        'test': thesis_test_data,
    },
}

In [49]:
# Display only the metadata part of the bundle; the 'data' part holds every article.
all_data['metadata']

{'raw_data_path': '/home/aorus/workspaces/simge/corpus/Sentence/Newindianexpress/20181001-newindianexpress_sentence_classification_adjudicated_20181218.json',
 'split_ratio': {'train': 80, 'valid': 10, 'test': 10},
 'article_nums': {'train': 251, 'valid': 32, 'test': 32},
 'sentence_nums': {'train': 3582, 'valid': 399, 'test': 441},
 'label_nums': {'train': {0: 2472, 1: 1053, 2: 57},
  'valid': {0: 268, 1: 130, 2: 1},
  'test': {0: 320, 1: 116, 2: 5}}}

### SAVE
###### (!!! Change the run number (`run_num`) in the cell below)

In [46]:
# Identifier of this shuffle/split run.
# NOTE(review): presumably used to name the saved output; confirm against the save code.
run_num = 'shuffle-1'