# Jupyter Notebook for Test / Training Data Preparation for Keras Neural Network

In [1]:
import pandas as pd
import numpy as np
from nlp import preprocessing as pp
from nlp import cleaning as cl

In [2]:
# Load the corpus as a sequence of document records via the project's cleaning
# helper; the records are unpacked into (docs, levels, id) columns further down.
# NOTE(review): document_id=True presumably adds the id field — confirm in nlp.cleaning.
docs = cl.get_docs_one_stop(document_id=True)

In [91]:
# Report how many document records were loaded.
print('Total number of documents in One-Stop-English-Corpus:', len(docs))

Total number of documents in One-Stop-English-Corpus: 567


### Note: each row of `docs` holds one whole reading, stored as a list

In [4]:
# One row per document record: the reading itself, its level label and its id.
df = pd.DataFrame.from_records(docs, columns=['docs', 'levels', 'id'])
# Preview the first rows.
df.head()

Unnamed: 0,docs,levels,id
0,[SeaWorld’s profits fell by 84% and customers ...,Ele,0
1,[Imagine that you read a headline 'Fit in four...,Ele,1
2,[Robert Mysłajek stops. Between two paw prints...,Ele,2
3,[The Taliban sent a gunman to shoot Malala You...,Ele,3
4,[Governments in Europe dream of finding a magi...,Ele,4


In [70]:
# Segment-level frame built by the project's cleaning helper: one row per
# segment with its text, word/sentence statistics, level and source doc_id
# (columns as shown in the preview below).
one_df = cl.get_one_stop_dataframe()

### Note: in `one_df`, the `documents` column holds each reading split into segments of roughly equal size (see the stats below)

In [102]:
# Preview the first rows of the segment-level frame.
one_df.head()

Unnamed: 0,documents,doc_list,avg_num_words,total_num_sents,total_num_words,words_per_sents,level,doc_id
0,SeaWorld's profits fell by 84% and customers a...,[SeaWorld’s profits fell by 84% and customers ...,25.0,1,25,[25],Ele,0
1,The company teaches dolphins and killer whales...,[The company teaches dolphins and killer whale...,17.0,1,17,[17],Ele,0
2,It says fewer people are going to its parks an...,[It says fewer people are going to its parks a...,13.0,1,13,[13],Ele,0
3,SeaWorld has been in the news since the 2013 d...,[SeaWorld has been in the news since the 2013 ...,33.0,1,33,[33],Ele,0
4,Animal rights organizations say that orcas kep...,[Animal rights organizations say that orcas ke...,14.0,2,28,"[17, 11]",Ele,0


In [309]:
# Each row of one_df is one segment, so its length is the corpus-wide segment count.
print('Total Number of Segments in One-Stop-English-Corpus:', len(one_df))

Total Number of Segments in One-Stop-Englih-Corpus: 7395


In [112]:
# Summary statistics per reading level; doc_id is an identifier, not a
# measurement, so it is dropped before describing.
(
    one_df
    .drop(columns='doc_id')
    .groupby('level')
    .describe()
)

Unnamed: 0_level_0,avg_num_words,avg_num_words,avg_num_words,avg_num_words,avg_num_words,avg_num_words,avg_num_words,avg_num_words,total_num_sents,total_num_sents,total_num_sents,total_num_sents,total_num_sents,total_num_words,total_num_words,total_num_words,total_num_words,total_num_words,total_num_words,total_num_words,total_num_words
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Adv,2650.0,24.385211,10.467477,1.0,17.5175,22.67,30.0,131.0,2650.0,2.691698,...,3.0,13.0,2650.0,58.829057,30.04025,1.0,38.0,54.0,76.0,316.0
Ele,2150.0,18.172786,6.830247,1.0,14.0,17.0,21.67,64.0,2150.0,2.782791,...,4.0,12.0,2150.0,47.002791,24.375017,1.0,29.0,44.0,61.0,173.0
Int,2595.0,19.941148,9.233174,1.0,14.75,19.5,25.0,64.0,2595.0,2.633526,...,3.0,13.0,2595.0,49.37341,26.588567,1.0,32.0,47.0,67.0,219.0


In [310]:
def quantiles(col, qs=(0.90, 0.95, 0.98)):
    """Return the requested quantiles of a column as a tuple.

    Parameters
    ----------
    col : pandas.Series (or any object exposing ``.quantile(q)``)
        The values to summarise.
    qs : iterable of float, optional
        Quantile levels in [0, 1]. Defaults to the 90th, 95th and 98th
        percentiles, which is what the report cell below prints.

    Returns
    -------
    tuple
        One value per entry of ``qs``, in the same order.
    """
    return tuple(col.quantile(q) for q in qs)

# Upper-tail quantiles of two per-segment statistics. The first group is
# computed from total_num_words, the second from avg_num_words.
print(
    'Quantiles for Average Number of Words per Segment',
    ' 90:   95:   98:',
    quantiles(one_df.total_num_words),
    sep='\n',
)
print('\n')
print(
    'Quantiles for Average Number of Words per Sentence',
    ' 90:   95:   98:',
    quantiles(one_df.avg_num_words),
    sep='\n',
)

Quantiles for Average Number of Words per Segment
 90:   95:   98:
(88.0, 101.29999999999927, 120.0)


Quantiles for Average Number of Words per Sentence
 90:   95:   98:
(33.0, 38.0, 45.0)


In [312]:
# one_df.drop('doc_id', axis=1).groupby('level').agg(['count', 'mean', 'quantile'])
# numeric_only=True restricts the mean to the numeric statistics. Without it,
# pandas >= 2.0 raises on the text/list columns (documents, doc_list,
# words_per_sents) instead of silently dropping them as older versions did.
one_df.drop('doc_id', axis=1).groupby('level').mean(numeric_only=True)

Unnamed: 0_level_0,avg_num_words,total_num_sents,total_num_words
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adv,24.385211,2.691698,58.829057
Ele,18.172786,2.782791,47.002791
Int,19.941148,2.633526,49.37341


### Below is the code for generating roughly proportional (stratified) samples of documents, whose segments form the test/train sets

In [249]:
from numpy.random import default_rng

# Generate random document ids to proportionally stratify samples for test/train.
# Ids are treated as three consecutive blocks of 189 (3 * 189 = 567 documents),
# and the same kind of draw is made inside every block.
# NOTE(review): presumably each block is one reading level and ids i, i + 189,
# i + 378 are versions of the same article; if so, independent draws per block
# can place versions of one article in both train and test — confirm.
DOCS_PER_BLOCK = 189
# Number of test documents drawn from each block, in block order.
# (25, 20, 20) was also used to even out the test set.
TEST_DOCS_PER_BLOCK = (20, 20, 20)

# Unseeded: every run yields a different candidate. The chosen candidate is
# pinned as a literal list in the next cell.
rng = default_rng()

sample = []

for i, n_test_docs in enumerate(TEST_DOCS_PER_BLOCK):
    # Draw distinct within-block positions 0..188 ...
    numbers = rng.choice(DOCS_PER_BLOCK, size=n_test_docs, replace=False)
    # ... and shift them into this block's id range. Plain ints keep the list
    # readable when printed and easy to paste back as a literal.
    sample.extend(int(num) + DOCS_PER_BLOCK * i for num in numbers)

### The final sample list below is the candidate labelled `second_best`; see the candidate lists printed further below

In [313]:
# Pinned test-set document ids, grouped by the id block they were drawn from.
sample = [
    # ids 0-188 (25 documents)
    95, 36, 38, 149, 158, 31, 42, 55, 91, 143,
    65, 46, 79, 39, 172, 11, 165, 147, 68, 128,
    104, 45, 157, 84, 6,
    # ids 189-377 (20 documents)
    261, 335, 323, 309, 274, 216, 234, 256, 322, 342,
    313, 206, 239, 280, 371, 220, 310, 282, 272, 305,
    # ids 378-566 (20 documents)
    421, 510, 420, 474, 378, 527, 495, 433, 469, 545,
    399, 522, 562, 491, 439, 514, 564, 408, 499, 400,
]
print(len(sample))

65


In [299]:
# Split by document, not by row: every segment whose doc_id is in `sample`
# goes to the test set and all remaining segments go to the training set.
# A single boolean mask keeps the two sets exact complements even if the
# frame's index labels are not unique.
is_test_segment = one_df.doc_id.isin(sample)
test = one_df[is_test_segment]
train = one_df[~is_test_segment]

In [300]:
# Display the test split (pandas elides the middle rows of a long frame).
test

Unnamed: 0,documents,doc_list,avg_num_words,total_num_sents,total_num_words,words_per_sents,level,doc_id
69,Police and intelligence agencies around the wo...,[Police and intelligence agencies around the w...,24.00,1,24,[24],Ele,6
70,"But, now, researchers in Britain and the Nethe...","[But, now, researchers in Britain and the Neth...",18.75,4,75,"[23, 14, 20, 18]",Ele,6
71,The polygraph is often used in the US in crimi...,[The polygraph is often used in the US in crim...,16.50,2,33,"[24, 9]",Ele,6
72,The basic idea behind the new method is that l...,[The basic idea behind the new method is that ...,32.00,1,32,[32],Ele,6
73,The new method is over 70% reliable – the poly...,[The new method is over 70% reliable – the pol...,14.00,2,28,"[14, 14]",Ele,6
...,...,...,...,...,...,...,...,...
7367,"After everything was said and done, Hamblin re...","[After everything was said and done, Hamblin r...",25.00,2,50,"[28, 22]",Adv,564
7368,Reducing the frequency of showers (and the num...,[Reducing the frequency of showers (and the nu...,19.00,3,57,"[19, 12, 26]",Adv,564
7369,The vital importance of clean water is becomin...,[The vital importance of clean water is becomi...,23.67,3,71,"[20, 23, 28]",Adv,564
7370,If this whole thing is giving you the heebie-j...,[If this whole thing is giving you the heebie-...,45.50,2,91,"[41, 50]",Adv,564


In [301]:
# Number of test segments per level (count() tallies non-null values per column).
test.groupby('level').count()

Unnamed: 0_level_0,documents,doc_list,avg_num_words,total_num_sents,total_num_words,words_per_sents,doc_id
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adv,271,271,271,271,271,271,271
Ele,288,288,288,288,288,288,288
Int,286,286,286,286,286,286,286


In [302]:
# Number of training segments per level (count() tallies non-null values per column).
train.groupby('level').count()

Unnamed: 0_level_0,documents,doc_list,avg_num_words,total_num_sents,total_num_words,words_per_sents,doc_id
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adv,2379,2379,2379,2379,2379,2379,2379
Ele,1862,1862,1862,1862,1862,1862,1862
Int,2309,2309,2309,2309,2309,2309,2309


In [303]:
# Segment totals for each split, obtained by adding up the per-level counts
# of the 'documents' column.
total_train = int(train.groupby('level')['documents'].count().sum())
total_test = int(test.groupby('level')['documents'].count().sum())
total = total_train + total_test

# Share of all segments that landed in each split.
print('Train Proportion:', total_train/total)
print('Test Proportion:', total_test/total)

Train Proportion: 0.8857336037863421
Test Proportion: 0.11426639621365788


In [304]:
print('Training Set')
# Count segments per level once and look each level up by its label. Integer
# keys such as [0] on a label-indexed Series are deprecated in recent pandas
# and only worked because the groups happen to sort as Adv, Ele, Int.
train_level_counts = train.groupby('level')['documents'].count()
for level in ('Adv', 'Ele', 'Int'):
    print(f'Proportion of {level} level:', train_level_counts.loc[level] / total_train)

Training Set
Proportion of Adv level: 0.36320610687022903
Proportion of Ele level: 0.28427480916030534
Proportion of Int level: 0.35251908396946563


In [305]:
print('Testing Set')
# Count segments per level once and look each level up by its label. Integer
# keys such as [0] on a label-indexed Series are deprecated in recent pandas
# and only worked because the groups happen to sort as Adv, Ele, Int.
test_level_counts = test.groupby('level')['documents'].count()
for level in ('Adv', 'Ele', 'Int'):
    print(f'Proportion of {level} level:', test_level_counts.loc[level] / total_test)

Testing Set
Proportion of Adv level: 0.3207100591715976
Proportion of Ele level: 0.3408284023668639
Proportion of Int level: 0.3384615384615385


In [319]:
# Persist both splits. reset_index() moves the original row label into a
# regular column (named 'index' when the index is unnamed), and index=False
# keeps the fresh positional index out of the file.
for split_frame, csv_path in (
    (test, 'data/one_stop_test.csv'),
    (train, 'data/one_stop_train.csv'),
):
    split_frame.reset_index().to_csv(csv_path, index=False)

#### Candidate sample lists from which I chose the one with the best proportions

In [289]:
# best_sample = sample
# second_best = best_sample
# new_best = sample
# new = sample

In [247]:
# Good samples
# These four names are only bound by the (commented-out) assignments in the
# previous cell, so they exist only if those were run interactively.
for candidate in (best_sample, second_best, new_best, new):
    print(candidate)

[182, 45, 54, 87, 146, 137, 26, 40, 55, 62, 163, 47, 131, 64, 101, 148, 73, 49, 135, 177, 103, 102, 110, 124, 39, 235, 224, 345, 295, 327, 249, 284, 257, 318, 374, 293, 366, 317, 335, 301, 210, 262, 313, 195, 266, 528, 459, 485, 458, 524, 394, 561, 460, 411, 517, 452, 494, 420, 544, 392, 562, 412, 525, 500, 557]
[95, 36, 38, 149, 158, 31, 42, 55, 91, 143, 65, 46, 79, 39, 172, 11, 165, 147, 68, 128, 104, 45, 157, 84, 6, 261, 335, 323, 309, 274, 216, 234, 256, 322, 342, 313, 206, 239, 280, 371, 220, 310, 282, 272, 305, 421, 510, 420, 474, 378, 527, 495, 433, 469, 545, 399, 522, 562, 491, 439, 514, 564, 408, 499, 400]
[79, 154, 146, 113, 11, 182, 12, 172, 98, 129, 60, 109, 180, 36, 156, 131, 59, 61, 49, 144, 287, 216, 307, 359, 243, 196, 229, 227, 331, 377, 212, 207, 202, 274, 218, 366, 365, 315, 220, 309, 540, 462, 430, 469, 424, 505, 380, 405, 516, 485, 436, 534, 435, 524, 389, 493, 471, 492, 556, 563]
[33, 128, 144, 146, 19, 88, 95, 185, 183, 153, 59, 154, 75, 132, 53, 79, 107, 160, 15

In [248]:
# Sample sizes
# One length per candidate list, in the same order as they were printed above.
for candidate in (best_sample, second_best, new_best, new):
    print(len(candidate))

65
65
60
60
