# Imports

In [31]:
import glob
import numpy as np
import pandas as pd
from importlib import reload
import random
import logging as logger
logger.basicConfig(format="%(asctime)s %(message)s", level=logger.DEBUG,filename="/Users/emrecalisir/Documents/logs/out.log")

# Functions generating the training and prediction files that will be used as input to the BERT Sentence Comparison

In [32]:
# Helper used while the training input file of BERT is being produced.
def create_groundtruths_from_different_segments(couple1, couple2):
    """Pair every item of ``couple1`` with every item of ``couple2``.

    Returns a list of ``(first, second, False)`` tuples. The items of
    ``couple1`` vary slowest, and the constant ``False`` labels the two items
    as belonging to different segments.
    """
    return [(first, second, False) for first in couple1 for second in couple2]

In [33]:
# Builds a randomly sampled, class-balanced set of sentence pairs in the tabular layout used to train the BERT model.
# The input files must contain the segment boundary line "==========", otherwise segments cannot be told apart.
def func_convert_files_to_bert_input_df_for_train(path,sample_size,outfile):
    """Create a balanced sentence-pair training set and pickle it to ``outfile``.

    Every file matched by ``path`` is read, its last line is dropped, and the
    files are concatenated into one list of lines. Lines equal to
    "==========" delimit segments. Pairs of line indexes are labelled ``True``
    when both lines lie in the same segment and ``False`` when they come from
    two different segments. Both classes are then down-sampled to the same
    number of rows, shuffled together and written with ``DataFrame.to_pickle``.

    Parameters
    ----------
    path : str
        Glob pattern selecting the input text files.
    sample_size : int or None
        Upper bound on the number of rows kept *per class*. The rows kept per
        class are ``min(sample_size, true pairs, false pairs)``; ``None``
        disables the cap. (This argument used to be accepted but ignored.)
    outfile : str
        Path the balanced DataFrame is pickled to.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``path`` matches no file.
    """
    boundary = "=========="
    # A segment is only paired with segments at least this many positions
    # after it in the shuffled order ...
    min_segment_gap = 20
    # ... and with at most this many of them. The cap keeps the number of
    # False pairs from exploding. NOTE(review): the original comment said
    # "40 records", but its `cnt>40` check admitted 41; 41 is kept here.
    max_partner_segments = 41

    files = []
    for filename in glob.glob(path):
        with open(filename) as f:
            file_lines = [line.rstrip() for line in f]
        # The last line of every file is discarded.
        # NOTE(review): presumably a closing boundary line — confirm against the data.
        files.append(file_lines[:-1])

    if not files:
        raise ValueError("No input files matched the pattern: " + str(path))

    # Flatten into one global list; pair ids below are indexes into this list.
    lines = [line for file_lines in files for line in file_lines]
    df = pd.DataFrame(lines, columns=["lines"])
    logger.info("DataFrame ready from the given files")
    print("DataFrame ready from the given files")
    print(df.shape)
    logger.info(df.shape)

    segment_index_list = [idx for idx, text in enumerate(lines) if text == boundary]

    # Each pair of consecutive boundaries delimits one segment; the segment's
    # members are the line indexes strictly between the two boundaries.
    list_segment_filled_numerics = [
        list(range(start + 1, end))
        for start, end in zip(segment_index_list, segment_index_list[1:])
    ]

    # Shuffle the segment order once so that the "different segment" partners
    # chosen below are random rather than textual neighbours.
    random.shuffle(list_segment_filled_numerics)

    couples = []
    segment_count = len(list_segment_filled_numerics)
    for i, segment in enumerate(list_segment_filled_numerics):
        log_progress = (i % 100 == 0)
        if log_progress:
            logger.info("create_groundtruths_from_same_segments started for " + str(i) + "th record out of " + str(segment_count))

        # Positive examples: every unordered pair of lines inside the segment.
        for k in range(len(segment)):
            for m in range(k + 1, len(segment)):
                couples.append((segment[k], segment[m], True))

        if log_progress:
            logger.info("create_groundtruths_from_same_segments completed for " + str(i) + "th record out of " + str(segment_count))
            logger.info("create_groundtruths_from_different_segments started for " + str(i) + "th record out of " + str(segment_count))

        # Negative examples: cross product with a bounded window of later segments.
        first_partner = i + min_segment_gap
        last_partner = min(first_partner + max_partner_segments, segment_count)
        for j in range(first_partner, last_partner):
            # extend() appends in place; rebuilding the list with `+` on every
            # iteration made this loop quadratic in the number of pairs.
            couples.extend(
                create_groundtruths_from_different_segments(segment, list_segment_filled_numerics[j])
            )

        if log_progress:
            logger.info("create_groundtruths_from_different_segments completed for " + str(i) + "th record out of " + str(segment_count))

    # Resolve the line indexes to their text: (id1, id2, text1, text2, label).
    data_tuples = [
        (sen1, sen2, lines[sen1], lines[sen2], is_same)
        for sen1, sen2, is_same in couples
    ]

    # Log every 100th pair as a spot check of the generated data.
    for row in data_tuples[::100]:
        logger.info(str(row[0])+","+str(row[1])+","+str(row[2])+","+str(row[3])+","+str(row[4]))

    df = pd.DataFrame(data_tuples, columns=["first_sentence_id","second_sentence_id","first_sentence","second_sentence","two_sentences_in_same_segment"])

    df_true_all = df[df["two_sentences_in_same_segment"]==True]
    df_false_all = df[df["two_sentences_in_same_segment"]==False]

    df_true_size = len(df_true_all.index)
    print("df_true_size: " + str(df_true_size))
    logger.info("df_true_size: " + str(df_true_size))

    df_false_size = len(df_false_all.index)
    print("df_false_size: " + str(df_false_size))
    logger.info("df_false_size: " + str(df_false_size))

    # Balance the classes on the size of the smaller one.
    df_filter_size = min(df_true_size, df_false_size)
    print("df_filter_size: " + str(df_filter_size))
    logger.info("df_filter_size: " + str(df_filter_size))

    # Honour the caller's per-class cap.
    rows_per_class = df_filter_size
    if sample_size is not None:
        rows_per_class = min(df_filter_size, int(sample_size))
    logger.info("rows sampled per class: " + str(rows_per_class))

    df_true = df_true_all.sample(n=rows_per_class)
    df_false = df_false_all.sample(n=rows_per_class)

    # Concatenate both classes, then shuffle the rows and renumber the index.
    df_balanced = pd.concat([df_true,df_false])
    df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

    df_balanced.to_pickle(outfile)
        

In [28]:
# Build the balanced training pickle from every file matched by the glob pattern.
# NOTE(review): absolute, machine-specific paths — adjust them before running elsewhere.
func_convert_files_to_bert_input_df_for_train("/Users/emrecalisir/Documents/data/wiki_italian/dev_train_test/*",sample_size=200000,outfile="/Users/emrecalisir/Documents/data/wiki_italian/out.pickle")

DataFrame ready from the given files
(20986, 1)
df_true_size: 39108
df_false_size: 1810262
df_filter_size: 39108


In [110]:
# Prepares the DataFrame of consecutive sentence pairs that is used as input to the BERT model's prediction function.
def func_convert_files_to_bert_input_df_for_pred(path, out_file):
    """Build consecutive-sentence pairs from the files matched by ``path``.

    Every matched file is read and its last line is dropped. Each remaining
    line gets the id ``"<file name>_<line number>"``. The lines of all files
    are then walked as one continuous stream, so the last sentence of one file
    is paired with the first sentence of the next. Every sentence is paired
    with the sentence that follows it. Lines equal to "==========" are not
    sentences themselves: a pair is labelled ``False`` when such a line lies
    between its two sentences and ``True`` otherwise. Boundary lines that
    occur before the first sentence are ignored.

    Parameters
    ----------
    path : str
        Glob pattern selecting the input text files.
    out_file : str
        Path the resulting DataFrame is pickled to.

    Returns
    -------
    pandas.DataFrame
        Columns ``first_sentence_id``, ``second_sentence_id``,
        ``first_sentence``, ``second_sentence`` and
        ``two_sentences_in_same_segment``. The label counts are also printed.

    Raises
    ------
    ValueError
        If ``path`` matches no file.
    """
    import os  # local import: only needed here for the portable file-name lookup

    boundary = "=========="

    matched_files = glob.glob(path)
    if not matched_files:
        raise ValueError("No input files matched the pattern: " + str(path))

    # Plain Python lists are grown in place; re-concatenating arrays per file
    # would copy everything read so far on every iteration.
    sentence_ids = []
    sentences = []
    for filename in matched_files:
        with open(filename) as f:
            # The last line of every file is discarded.
            # NOTE(review): presumably a closing boundary line — confirm against the data.
            lines = [line.rstrip() for line in f][:-1]
        file_name = os.path.basename(filename)
        sentence_ids.extend(file_name + "_" + str(line_no) for line_no in range(len(lines)))
        sentences.extend(lines)

    rows = []
    prev_id = None
    prev_sentence = None
    new_segment_started = False
    for current_id, current_sentence in zip(sentence_ids, sentences):
        if current_sentence == boundary:
            # A boundary only matters once there is a previous sentence to
            # separate from the next one.
            if prev_sentence is not None:
                new_segment_started = True
            continue

        if prev_sentence is not None:
            rows.append((prev_id, current_id, prev_sentence, current_sentence, not new_segment_started))
            new_segment_started = False

        prev_id = current_id
        prev_sentence = current_sentence

    df = pd.DataFrame(rows, columns=["first_sentence_id","second_sentence_id","first_sentence","second_sentence","two_sentences_in_same_segment"])
    df.to_pickle(out_file)
    print(df.two_sentences_in_same_segment.value_counts())
    return df
        


In [111]:
# Build the sentence-pair frame from every file matched by the glob pattern and pickle it to the second path.
# NOTE(review): absolute, machine-specific paths — adjust them before running elsewhere.
df_real_test = func_convert_files_to_bert_input_df_for_pred("/Users/emrecalisir/Documents/data/wiki_italian/real_test/*","/Users/emrecalisir/Documents/data/italian_wiki_real_test.pickle")

True     2390
False    1241
Name: two_sentences_in_same_segment, dtype: int64


In [115]:
df_real_test.shape

(3631, 5)

In [113]:
df_real_test[0:30]

Unnamed: 0,first_sentence_id,second_sentence_id,first_sentence,second_sentence,two_sentences_in_same_segment
0,3320.txt_1,3320.txt_3,È passato alla storia per aver soppresso l'ord...,"Nativo del Bazadais, in Guascogna, era figlio ...",False
1,3320.txt_3,3320.txt_4,"Nativo del Bazadais, in Guascogna, era figlio ...",A Lione Bertrand fu vicario generale del frate...,True
2,3320.txt_4,3320.txt_6,A Lione Bertrand fu vicario generale del frate...,Da arcivescovo venne eletto papa il 5 giugno 1...,False
3,3320.txt_6,3320.txt_8,Da arcivescovo venne eletto papa il 5 giugno 1...,"Invece di ritornare a Roma, che allora era dil...",False
4,3320.txt_8,3320.txt_9,"Invece di ritornare a Roma, che allora era dil...","Bertrand non era né italiano né cardinale, e l...",True
5,3320.txt_9,3320.txt_11,"Bertrand non era né italiano né cardinale, e l...","All'inizio del 1306, Clemente abrogò di fatto ...",False
6,3320.txt_11,3320.txt_12,"All'inizio del 1306, Clemente abrogò di fatto ...",Il 13 ottobre 1307 fu ordinato l'arresto di tu...,True
7,3320.txt_12,3320.txt_13,Il 13 ottobre 1307 fu ordinato l'arresto di tu...,"L'8 settembre 1308, con la Bolla papale ""Super...",True
8,3320.txt_13,3320.txt_14,"L'8 settembre 1308, con la Bolla papale ""Super...",Approfittando della sua influenza sul pontefic...,True
9,3320.txt_14,3320.txt_15,Approfittando della sua influenza sul pontefic...,"Nel perseguimento dei desideri del re, Clement...",True


# Jupyter Notebook Cells to walk through the code, cell by cell

In [2]:
## check that filehandler is well set, otherwise, run the removeHandler function existing in the next cell
logger.getLoggerClass().root.handlers

[<FileHandler /Users/emrecalisir/git/bert/out.log (NOTSET)>]

In [55]:
# Detach every handler currently attached to the root logger.
# The loop iterates over a copy ([:]) because removeHandler mutates the handler list.
for handler in logger.root.handlers[:]:
    logger.root.removeHandler(handler)

In [5]:
logger.info("test from world")

In [4]:
# Quick look at the raw data: print the beginning of every file matched by the pattern.
path = "/Users/emrecalisir/git/data/wiki/wiki_test_*.ref"
for filename in glob.glob(path):
    with open(filename) as f:
        cnt = 0
        for line in f:
            cnt += 1
            # The break happens before the 10th line is printed, so at most 9 lines per file are shown.
            if(cnt==10):
                break
            # `line` still carries its own newline, so print() adds a second one (blank lines in the output).
            print(line)


Pádraig Ó Caoimh

Pádraig Ó Caoimh (Patrick "Paddy" O'Keeffe) was an Irish soldier and long-time administrator of the Gaelic Athletic Association (GAA).

Páirc Uí Chaoimh, the home of the Cork GAA, is named after him.


Ó Caoimh was born in Roscommon in 1898.

He moved to Cork City at an early age; the 1911 census records him living at No.

13 East View Terrace on the Quaker Road.

He lived with his father (an RIC Pensioner and Draper), his brother and his three sisters.


Alexander Vinokourov

Alexander Nikolaevich Vinokourov (; born 16 September 1973) is a Kazakh former professional road bicycle racer and current general manager of UCI ProTeam .

As a competitor, his achievements include two bronze medals at the World Championships, four stage wins in the Tour de France, four in the Vuelta a España plus the overall title in 2006, two Liège–Bastogne–Liège monuments, one Amstel Gold Race, and most recently, the gold medal at the 2012 London Olympics Men's Road Race.

Vinokourov is a p


Mito Domain

In the han system, Mito was a political and economic abstraction based on periodic cadastral surveys and projected agricultural yields.

In other words, the domain was defined in terms of "kokudaka", not land area.

This was different from the feudalism of the West.


The domain's capital was the city of Mito.

Beginning with the appointment of Tokugawa Yorifusa by his father, Shogun Tokugawa Ieyasu, in 1608, the Mito branch of the Tokugawa clan controlled the domain until the abolition of the han system in 1871.

During the Edo period, Mito represented the center of nativism largely as a result of the Mitogaku, an influential school of Japanese thought, which advanced the political philosophy of sonnō jōi ("revere the emperor, expel the barbarians") that had become a popular sentiment after 1854.


Woodlands, Marburg

Woodlands is a heritage-listed mansion at Seminary Road, Marburg, City of Ipswich, Queensland, Australia.

It was designed by George Brockwell Gill and bui

In [5]:
# Read every file matched by the pattern into `files`, one list of right-stripped lines per file.
files = []

import glob
path = "/Users/emrecalisir/git/data/wiki/wiki_test_*.ref"
cnt = 0  # number of files read so far
for filename in glob.glob(path):
    with open(filename) as f: 
        lines = [line.rstrip() for line in f]
        # Drop the last line of the file.
        # NOTE(review): presumably a closing boundary line — confirm against the data.
        lines = lines[0:len(lines)-1]
        #lines.append("#######")
        files.append(lines)
        logger.info(filename)
        cnt+=1 

logger.info("Completed reading " + str(cnt) + " files")

In [6]:
len(files)

300

In [7]:
files[0:2]

  'Pádraig Ó Caoimh',
  'Pádraig Ó Caoimh (Patrick "Paddy" O\'Keeffe) was an Irish soldier and long-time administrator of the Gaelic Athletic Association (GAA).',
  'Páirc Uí Chaoimh, the home of the Cork GAA, is named after him.',
  'Ó Caoimh was born in Roscommon in 1898.',
  'He moved to Cork City at an early age; the 1911 census records him living at No.',
  '13 East View Terrace on the Quaker Road.',
  'He lived with his father (an RIC Pensioner and Draper), his brother and his three sisters.',
  'One of these siblings, or some other relatives, would later live in Ballynoe, near Castlelyons, in County Cork.',
  'He was educated by the Christian Brothers in Cork and was an active member of the Gaelic Athletic Association.',
  "After leaving school he trained as a Secondary School teacher at Saint Mary's College in London, returning to Cork to teach at Presentation Brothers College.",
  'In 1916, when he was 18, he joined the Irish Volunteers.',
  'In 1919, at the age of 21, Ó Caoim

In [8]:
arr = np.hstack(files)
arr.shape

(60843,)

In [9]:
lines = arr.tolist()

In [10]:
df = pd.DataFrame(lines, columns=["lines"])
df.head()

Unnamed: 0,lines
0,==========
1,Pádraig Ó Caoimh
2,"Pádraig Ó Caoimh (Patrick ""Paddy"" O'Keeffe) wa..."
3,"Páirc Uí Chaoimh, the home of the Cork GAA, is..."
4,==========


In [11]:
df[0:100]

Unnamed: 0,lines
0,==========
1,Pádraig Ó Caoimh
2,"Pádraig Ó Caoimh (Patrick ""Paddy"" O'Keeffe) wa..."
3,"Páirc Uí Chaoimh, the home of the Cork GAA, is..."
4,==========
...,...
95,"An obituary in ""The Irish Times' stated that ""..."
96,"During his term of office, its membership grew..."
97,Without his ability to co-ordinate plan and in...
98,==========


In [29]:
lines = df.lines.tolist()
lines[0:15]

NameError: name 'df' is not defined

In [13]:
print(len(lines))
lines[97:101]

60843


['Without his ability to co-ordinate plan and inspire, only a fraction of this extraordinary progress could have been achieved."',
 'Alexander Vinokourov',
 'Alexander Nikolaevich Vinokourov (; born 16 September 1973) is a Kazakh former professional road bicycle racer and current general manager of UCI ProTeam .']

## Ground-truth Segment Boundaries

In [14]:
# Row positions of the lines that are exactly the boundary marker "==========".
# np.where returns a tuple of arrays; element 0 holds the positions along the only axis.
segment_start_indexes = np.where(df["lines"]=="==========")
segment_index_list = segment_start_indexes[0].tolist()
segment_index_list

[0,
 4,
 14,
 66,
 98,
 111,
 121,
 132,
 149,
 241,
 272,
 284,
 336,
 370,
 386,
 388,
 404,
 429,
 457,
 484,
 516,
 548,
 572,
 597,
 604,
 618,
 719,
 725,
 736,
 741,
 756,
 814,
 822,
 869,
 874,
 935,
 959,
 967,
 972,
 985,
 988,
 1019,
 1078,
 1131,
 1202,
 1255,
 1298,
 1361,
 1390,
 1403,
 1441,
 1452,
 1464,
 1481,
 1492,
 1522,
 1564,
 1596,
 1614,
 1646,
 1671,
 1715,
 1759,
 1804,
 1842,
 1847,
 1875,
 1907,
 1925,
 1946,
 1981,
 1993,
 2020,
 2037,
 2067,
 2076,
 2086,
 2094,
 2124,
 2174,
 2185,
 2203,
 2257,
 2289,
 2302,
 2341,
 2368,
 2391,
 2413,
 2417,
 2430,
 2437,
 2499,
 2502,
 2529,
 2543,
 2562,
 2575,
 2657,
 2670,
 2683,
 2718,
 2735,
 2755,
 2771,
 2859,
 2866,
 2909,
 2914,
 2950,
 2969,
 2976,
 3024,
 3041,
 3056,
 3083,
 3102,
 3129,
 3148,
 3180,
 3218,
 3292,
 3325,
 3342,
 3349,
 3353,
 3356,
 3371,
 3378,
 3383,
 3388,
 3391,
 3408,
 3438,
 3463,
 3502,
 3532,
 3563,
 3599,
 3634,
 3672,
 3706,
 3737,
 3779,
 3781,
 3787,
 3792,
 3803,
 3874,
 3886

## Intermediate step to Generate Couples

In [15]:
# Pair every boundary index with the boundary that follows it:
# each (start, end) tuple delimits one segment.
list_segment_start_end_indexes = list(zip(segment_index_list, segment_index_list[1:]))

In [16]:
list_segment_start_end_indexes

[(0, 4),
 (4, 14),
 (14, 66),
 (66, 98),
 (98, 111),
 (111, 121),
 (121, 132),
 (132, 149),
 (149, 241),
 (241, 272),
 (272, 284),
 (284, 336),
 (336, 370),
 (370, 386),
 (386, 388),
 (388, 404),
 (404, 429),
 (429, 457),
 (457, 484),
 (484, 516),
 (516, 548),
 (548, 572),
 (572, 597),
 (597, 604),
 (604, 618),
 (618, 719),
 (719, 725),
 (725, 736),
 (736, 741),
 (741, 756),
 (756, 814),
 (814, 822),
 (822, 869),
 (869, 874),
 (874, 935),
 (935, 959),
 (959, 967),
 (967, 972),
 (972, 985),
 (985, 988),
 (988, 1019),
 (1019, 1078),
 (1078, 1131),
 (1131, 1202),
 (1202, 1255),
 (1255, 1298),
 (1298, 1361),
 (1361, 1390),
 (1390, 1403),
 (1403, 1441),
 (1441, 1452),
 (1452, 1464),
 (1464, 1481),
 (1481, 1492),
 (1492, 1522),
 (1522, 1564),
 (1564, 1596),
 (1596, 1614),
 (1614, 1646),
 (1646, 1671),
 (1671, 1715),
 (1715, 1759),
 (1759, 1804),
 (1804, 1842),
 (1842, 1847),
 (1847, 1875),
 (1875, 1907),
 (1907, 1925),
 (1925, 1946),
 (1946, 1981),
 (1981, 1993),
 (1993, 2020),
 (2020, 2037)

In [49]:
# Expand every (start, end) boundary pair into the line indexes strictly between the two boundaries.
list_segment_filled_numerics = [
    list(range(x + 1, y)) for (x, y) in list_segment_start_end_indexes
]
list_segment_filled_numerics[0:2]


[[1, 2, 3], [5, 6, 7, 8, 9, 10, 11, 12, 13]]

In [20]:
s=list(range(5))
print(s)
random.shuffle(s) # << shuffle before print or assignment
print(s)

[0, 1, 2, 3, 4]
[3, 2, 0, 4, 1]


In [50]:
random.shuffle(list_segment_filled_numerics)

In [51]:
list_segment_filled_numerics[0:2]

[[24915,
  24916,
  24917,
  24918,
  24919,
  24920,
  24921,
  24922,
  24923,
  24924,
  24925,
  24926,
  24927,
  24928,
  24929,
  24930,
  24931],
 [8238,
  8239,
  8240,
  8241,
  8242,
  8243,
  8244,
  8245,
  8246,
  8247,
  8248,
  8249,
  8250,
  8251,
  8252,
  8253,
  8254,
  8255,
  8256,
  8257,
  8258,
  8259,
  8260,
  8261,
  8262,
  8263,
  8264,
  8265,
  8266,
  8267,
  8268,
  8269,
  8270,
  8271,
  8272,
  8273,
  8274,
  8275,
  8276,
  8277,
  8278,
  8279,
  8280,
  8281]]

In [44]:
# Generate the labelled index pairs: True for two lines of the same segment,
# False for two lines taken from different segments.
couples=[]
segment_count = len(list_segment_filled_numerics)
for i in range(0, segment_count):
    if(i%100==0):
        logger.info("create_groundtruths_from_same_segments started for " + str(i) + "th record out of " + str(segment_count))
    couple1 = list_segment_filled_numerics[i]

    # Positive examples: every unordered pair of lines inside the segment.
    for k in range(0,len(couple1)):
        for m in range(k + 1, len(couple1)):
            couples.append((couple1[k],couple1[m],True))
    if(i%100==0):
        logger.info("create_groundtruths_from_same_segments completed for " + str(i) + "th record out of " + str(segment_count))
        logger.info("create_groundtruths_from_different_segments started for " + str(i) + "th record out of " + str(segment_count))

    ## Negative examples: a segment is only combined with segments at least 20
    ## positions after it, and with at most 41 of them (i+20 .. i+60). Without
    ## the cap there would be millions of False pairs.
    for j in range(i + 20, min(i + 61, segment_count)):
        res = create_groundtruths_from_different_segments(couple1, list_segment_filled_numerics[j])
        # extend() appends in place; `couples = couples + res` copied the whole
        # list on every iteration, which made this cell quadratic.
        couples.extend(res)
    if(i%100==0):
        logger.info("create_groundtruths_from_different_segments completed for " + str(i) + "th record out of " + str(segment_count))


#couples

KeyboardInterrupt: 

In [29]:
# Split the (id1, id2, label) triples into columns and look up the text of each line id.
col_first_sentence_id = [sen1 for sen1, _sen2, _is_same in couples]
col_second_sentence_id = [sen2 for _sen1, sen2, _is_same in couples]
col_first_sentence = [lines[sen1] for sen1 in col_first_sentence_id]
col_second_sentence = [lines[sen2] for sen2 in col_second_sentence_id]
col_two_sentences_in_same_segment = [is_same for _sen1, _sen2, is_same in couples]


In [30]:
# Spot check: write every 100th generated pair to the log.
for i in range(0, len(col_first_sentence_id), 100):
    logger.info(str(col_first_sentence_id[i])+","+str(col_second_sentence_id[i])+","+str(col_first_sentence[i])+","+str(col_second_sentence[i])+","+str(col_two_sentences_in_same_segment[i]))

In [31]:
data_tuples = list(zip(col_first_sentence_id,col_second_sentence_id,col_first_sentence,col_second_sentence,col_two_sentences_in_same_segment))

In [32]:
df = pd.DataFrame(data_tuples, columns=["first_sentence_id","second_sentence_id","first_sentence","second_sentence","two_sentences_in_same_segment"])

In [33]:
df.shape

(2122185, 5)

In [34]:
df.head()

Unnamed: 0,first_sentence_id,second_sentence_id,first_sentence,second_sentence,two_sentences_in_same_segment
0,6873,6874,Episode 18: Lord Brahma's four saintly sons vi...,The saints curse Jaya and Vijaya to be born th...,True
1,6873,6875,Episode 18: Lord Brahma's four saintly sons vi...,Jaya and Vijaya are born as Hiranyaksha and Hi...,True
2,6873,6876,Episode 18: Lord Brahma's four saintly sons vi...,Hiranyaksha steals the earth and hides it.,True
3,6873,6877,Episode 18: Lord Brahma's four saintly sons vi...,Lord Shri Vishnu incarnates as Varaha and kill...,True
4,6873,6878,Episode 18: Lord Brahma's four saintly sons vi...,Hiranyakashipu vows revenge.,True


In [35]:
df.two_sentences_in_same_segment.value_counts()

True     1466151
False     656034
Name: two_sentences_in_same_segment, dtype: int64

In [36]:
df_true = df[df["two_sentences_in_same_segment"]==True].sample(n=200000) 
df_true.shape

(200000, 5)

In [37]:
df_false = df[df["two_sentences_in_same_segment"]==False].sample(n=200000) 
df_false.shape

(200000, 5)

In [38]:
df_balanced = pd.concat([df_true,df_false])
df_balanced.shape

(400000, 5)

In [39]:
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)
df_balanced.shape

(400000, 5)

In [40]:
df_balanced.head()

Unnamed: 0,first_sentence_id,second_sentence_id,first_sentence,second_sentence,two_sentences_in_same_segment
0,8804,8842,"At the end of the 19th century, the vines of W...",The two to three decades after the war are kno...,True
1,9014,30408,He had hoped to be accepted for pilot training...,The young princess was rumored to be expecting...,False
2,33984,34012,Say it.,"I promise you"" (29–30).",True
3,54494,54542,The opening up of different ways in experienci...,"Opposed to this imaginary was ""Reaction"", a ve...",True
4,59038,59170,"'Fathers and Sons"" –",'Federalist No.,True


## Store the Balanced Dataset to use later in a BERT classification task

In [92]:
df_balanced.to_pickle("/Users/emrecalisir/git/data/wiki/wiki_test_2.bert")

In [30]:
df_balanced = pd.read_pickle("/Users/emrecalisir/git/data/wiki/wiki_test_2.bert")

In [1]:
df_balanced.head()

NameError: name 'df_balanced' is not defined