In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import uuid
import os

In [2]:
fincausal_task1_trial_path = Path('/media/sarthak/HDD/data_science/fnp_resources/data/TRIAL/fnp2020-fincausal-task1.csv')
fincausal_task1_practice_path = Path('/media/sarthak/HDD/data_science/fnp_resources/data/PRACTICE/fnp2020-fincausal2-task1.csv')

In [3]:
fincausal_task1_trial = pd.read_csv(fincausal_task1_trial_path, sep=';')
fincausal_task1_practice = pd.read_csv(fincausal_task1_practice_path, sep=';')

In [4]:
fincausal_task1_trial.columns = [col.strip() for col in fincausal_task1_trial.columns]
fincausal_task1_practice.columns = [col.strip() for col in fincausal_task1_practice.columns]

In [5]:
fincausal_task1_trial.head()

Unnamed: 0,Index,Text,Gold
0,1.00001,Third Democratic presidential debate Septemb...,0
1,1.00002,"On the policy front, Bernie Sanders claimed h...",0
2,1.00003,Joe Biden misrepresented recent history when ...,0
3,1.00004,Here's a look at some of the assertions in th...,0
4,1.00005,"It killed 22 people, and injured many more, w...",0


In [6]:
fincausal_task1_practice.head()

Unnamed: 0,Index,Text,Gold
0,1.00001,Florida raking in billions as Americans aband...,0
1,1.00002,"Recently, changes to the U.S. tax code have e...",0
2,1.00003,"MORE FROM FOXBUSINESS.COM... As it turns out,...",0
3,1.00004,"According to a new study from LendingTree, wh...",0
4,1.00005,The Sunshine State drew in a net influx of ab...,1


There were whitexpaces in column name when using sep=';', which is now removed. We must save them with whitespaces removed to avoid coding the logic in package code

In [7]:
# NAs practice
fincausal_task1_practice.isna().sum()

Index    0
Text     0
Gold     0
dtype: int64

In [8]:
# NAs trial
fincausal_task1_trial.isna().sum()

Index    0
Text     0
Gold     0
dtype: int64

There are no NAs in the dataset

In [9]:
# empty sentences practice?
empty_sections_practice = fincausal_task1_practice.loc[(fincausal_task1_practice.Text == " ") | (fincausal_task1_practice.Text == "")]
print('how many? {}'.format(len(empty_sections_practice)))
print('label distribution for them: {}'.format(empty_sections_practice.Gold.value_counts()))

how many? 3
label distribution for them: 0    3
Name: Gold, dtype: int64


In [10]:
# empty sentences trial?
empty_sections_trial = fincausal_task1_trial.loc[(fincausal_task1_trial.Text == " ") | (fincausal_task1_trial.Text == "")]
print('how many? {}'.format(len(empty_sections_trial)))
print('label distribution for them: {}'.format(empty_sections_trial.Gold.value_counts()))

how many? 1
label distribution for them: 0    1
Name: Gold, dtype: int64


There is 1 empty text in trial data and 3 in practice data. All of them have 0 labels. We keep them becaue they are probable instances during the inference time. However, we remove them from training the model as they can disturb the training process, instead just return 0 when they arrive during inference time

In [11]:
# label distributions in both the sets? - both have roughly same distribution
fincausal_task1_trial_label_distribution = fincausal_task1_trial.Gold.value_counts()
fincausal_task1_practice_label_distribution = fincausal_task1_practice.Gold.value_counts()

print('label distribution of trial: {}'.format(fincausal_task1_trial_label_distribution))
print('label distribution of practice: {}'.format(fincausal_task1_practice_label_distribution))

label distribution of trial: 0    8058
1     570
Name: Gold, dtype: int64
label distribution of practice: 0    12520
1     1027
Name: Gold, dtype: int64


Trial has 7.07% positives, and practice has 8.20% positives. We must use that for class weighting, as well as maintain the distribution while splitting

In [13]:
# do they have any EXACTLY same texts?
len(list(set(fincausal_task1_trial.Text).intersection(set(fincausal_task1_practice.Text))))

204

There are 204 exactly same texts between practice and trial, and we keep them

In [14]:
# are the labels for the equal texts same?
merged_inner = pd.merge(left=fincausal_task1_trial, 
                        right=fincausal_task1_practice, 
                        left_on='Text', 
                        right_on='Text',
                        suffixes=('_trial', '_practice'))
merged_inner.head()

Unnamed: 0,Index_trial,Text,Gold_trial,Index_practice,Gold_practice
0,1.00033,All rights reserved.,0,116.00027,0
1,351.00075,All rights reserved.,0,116.00027,0
2,2.00007,"The current ratio, also known as the working ...",0,316.00076,0
3,2.00008,The ratio is simply calculated by dividing cu...,0,316.00077,0
4,2.00009,"Typically, the higher the current ratio the b...",0,316.00078,0


In [15]:
merged_inner.Gold_trial.equals(merged_inner.Gold_practice)

False

In [16]:
# what rows are there which have different labels for same text?
merged_inner.loc[merged_inner.Gold_trial != merged_inner.Gold_practice]

Unnamed: 0,Index_trial,Text,Gold_trial,Index_practice,Gold_practice
329,318.00019,The earnings component of an annuity withdraw...,0,363.00019,1
340,320.00003,"Richard Moriarty, chief executive of the UK C...",0,438.00006,1
344,325.00006,"So if the Market Price RAD is $400,000 and yo...",0,353.00008,1
345,325.00007,If you pay the full RAD there is no interest ...,0,353.00009,1
347,325.00009,"Under the new rate, the equivalent daily paym...",0,353.00011,1
349,325.00015,"If she moved in today, the MPIR would give he...",0,353.00016,1
359,348.00004,As these reserves are made up of retained pro...,0,73.00004,1
360,348.00004,As these reserves are made up of retained pro...,0,531.00005,1


Above are the 8 rows which have same texts but different golds, we must remove them.

In [17]:
# do trial and practice dfs have common indexes?
common_indexes = list(set(fincausal_task1_practice.Index).intersection(fincausal_task1_trial.Index))

In [18]:
len(common_indexes)

4693

In [19]:
# is the text also equal?
fincausal_task1_practice.loc[fincausal_task1_practice.Index.isin(common_indexes)].Text.equals(fincausal_task1_trial.loc[fincausal_task1_trial.Index.isin(common_indexes)])

False

There are 4693 common index values between trial and practice, and might be a problem after concatenating to have unique values. Therefore, create your own unique ID after concatenating

# Splitting

In [20]:
# 1. remove the indexes with empty text
fincausal_task1_trial_nonemptytext = fincausal_task1_trial.loc[~fincausal_task1_trial.Index.isin(empty_sections_trial)]
fincausal_task1_practice_nonemptytext = fincausal_task1_practice.loc[~fincausal_task1_practice.Index.isin(empty_sections_practice)]

# 2. remove the indexes from trial and practice (with conflicting labels) before combining
fincausal_task1_trial_nonemptytext_noconflictgold = fincausal_task1_trial_nonemptytext.loc[~fincausal_task1_trial_nonemptytext.Index.isin(merged_inner.Index_trial)]
fincausal_task1_practice_nonemptytext_noconflictgold = fincausal_task1_practice_nonemptytext.loc[~fincausal_task1_practice_nonemptytext.Index.isin(merged_inner.Index_practice)]

# 3. trial and practice data can be combined
fincausal_task1_combined_nonemptytext_noconflictgold = pd.concat([fincausal_task1_trial_nonemptytext_noconflictgold, fincausal_task1_practice_nonemptytext_noconflictgold])

# 4. Create a unique ID column
fincausal_task1_combined_nonemptytext_noconflictgold['unique_id'] = [str(uuid.uuid4()) for _ in range(len(fincausal_task1_combined_nonemptytext_noconflictgold))]

In [21]:
# 5. shuffle the dataset
fincausal_task1_combined_nonemptytext_noconflictgold = fincausal_task1_combined_nonemptytext_noconflictgold.sample(frac=1).reset_index(drop=True)

In [22]:
fincausal_task1_combined_nonemptytext_noconflictgold.head()

Unnamed: 0,Index,Text,Gold,unique_id
0,476.00009,Valuation and Earnings This table compares B...,0,5e626c67-0b64-40da-be24-00d77645531e
1,116.00016,To qualify for the 10% CGT rate under 'invest...,1,bd4a1b84-9d03-438e-a212-1ce409238173
2,289.00011,"Regarding GST, he said that collection of the...",0,737475b8-acb1-4260-aa09-a81e0e165d1d
3,452.00033,San Francisco zoned the 12 properties for a m...,1,7e858720-423f-4196-b847-940d6a9f3e33
4,352.00018,I think that open line of communication kept ...,0,46d3776e-80b7-4141-929c-72226a913d2f


In [84]:
fincausal_task1_combined_nonemptytext_noconflictgold.shape

(21633, 4)

In [23]:
fincausal_task1_combined_nonemptytext_noconflictgold.Gold.value_counts() / len(fincausal_task1_combined_nonemptytext_noconflictgold)

0    0.927888
1    0.072112
Name: Gold, dtype: float64

In [96]:
# 6. divide the data into k-folds with each fold having representative distribution
from sklearn.model_selection import StratifiedKFold, train_test_split

In [93]:
skf = StratifiedKFold(n_splits=5)

In [94]:
train_dfs = []
test_dfs = []

X = fincausal_task1_combined_nonemptytext_noconflictgold
y = fincausal_task1_combined_nonemptytext_noconflictgold.Gold

for train_index, test_index in skf.split(X, y):
    train = fincausal_task1_combined_nonemptytext_noconflictgold.iloc[train_index]
    test = fincausal_task1_combined_nonemptytext_noconflictgold.iloc[test_index]
    
    train_dfs.append(train)
    test_dfs.append(test)

In [103]:
# 7. for each train_df, create a val_df, and save train, val and test dfs as csv
data_root_dir = Path('/media/sarthak/HDD/data_science/fnp_resources/data/task1')
for i, (train_df, test_df) in enumerate(zip(train_dfs, test_dfs)):
    train_df, val_df = train_test_split(train_df, 
                                        test_size=0.1, 
                                        stratify=train_df.Gold,
                                        random_state=42)
    iteration_dir = data_root_dir / str('iteration_{}').format(i+1)
    if not os.path.exists(iteration_dir):
        os.mkdir(iteration_dir)
        train_df.to_csv(Path(iteration_dir / 'train.csv'), index=False)
        val_df.to_csv(Path(iteration_dir / 'val.csv'), index=False)
        test_df.to_csv(Path(iteration_dir / 'test.csv'), index=False)
    else:
        raise Exception('Iteration dir already exists. Delete the directory first.')