In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import uuid
import os

In [2]:
# Locations of the FinCausal task 1 (v2) trial and practice CSV files.
fincausal_task1_trial_path, fincausal_task1_practice_path = (
    Path(csv_location)
    for csv_location in (
        '/media/sarthak/HDD/data_science/fnp_resources/data/task1/v2/trial.csv',
        '/media/sarthak/HDD/data_science/fnp_resources/data/task1/v2/practice.csv',
    )
)

In [3]:
# The files separate fields with the two-character sequence '; '. pandas' C
# parser only handles single-character separators, so it silently falls back to
# the Python engine and emits a ParserWarning on every read. Requesting the
# Python engine explicitly gives the same parse without the warning.
fincausal_task1_trial = pd.read_csv(fincausal_task1_trial_path, sep='; ', engine='python')
fincausal_task1_practice = pd.read_csv(fincausal_task1_practice_path, sep='; ', engine='python')

  """Entry point for launching an IPython kernel.
  


In [4]:
# Preview the first five rows of the trial split to sanity-check the parse.
fincausal_task1_trial.head(n=5)

Unnamed: 0,Index,Text,Gold
0,1.00001,Third Democratic presidential debate Septembe...,0
1,1.00002,"On the policy front, Bernie Sanders claimed hi...",0
2,1.00003,Joe Biden misrepresented recent history when h...,0
3,1.00004,Here's a look at some of the assertions in the...,0
4,1.00005,"It killed 22 people, and injured many more, we...",0


In [5]:
# Preview the first five rows of the practice split to sanity-check the parse.
fincausal_task1_practice.head(n=5)

Unnamed: 0,Index,Text,Gold
0,1.00001,Florida raking in billions as Americans abando...,0
1,1.00002,"Recently, changes to the U.S. tax code have en...",0
2,1.00003,"MORE FROM FOXBUSINESS.COM... As it turns out, ...",0
3,1.00004,"According to a new study from LendingTree, whi...",0
4,1.00005,The Sunshine State drew in a net influx of abo...,1


In [32]:
# (rows, columns) of each split: practice first, trial second.
practice_shape = fincausal_task1_practice.shape
trial_shape = fincausal_task1_trial.shape
print(practice_shape, trial_shape)

(13478, 3) (8580, 3)


There were whitespaces in the column names when using sep=';', which are now removed. We must save the data with the whitespaces removed to avoid coding that logic in the package code.

In [26]:
# Missing values per column in the practice split.
fincausal_task1_practice.isnull().sum()

Index    0
Text     3
Gold     0
dtype: int64

In [27]:
# Missing values per column in the trial split.
fincausal_task1_trial.isnull().sum()

Index    0
Text     1
Gold     0
dtype: int64

There are some NAs in the dataset

In [8]:
# Does the practice split contain blank texts, i.e. an empty string or a
# single space? Missing (NaN) texts do not match and are counted separately.
practice_blank_text_mask = fincausal_task1_practice.Text.isin([" ", ""])
empty_sections_practice = fincausal_task1_practice.loc[practice_blank_text_mask]
print('how many? {}'.format(len(empty_sections_practice)))
print('label distribution for them: {}'.format(empty_sections_practice.Gold.value_counts()))

how many? 0
label distribution for them: Series([], Name: Gold, dtype: int64)


In [9]:
# Does the trial split contain blank texts, i.e. an empty string or a
# single space? Missing (NaN) texts do not match and are counted separately.
trial_blank_text_mask = fincausal_task1_trial.Text.isin([" ", ""])
empty_sections_trial = fincausal_task1_trial.loc[trial_blank_text_mask]
print('how many? {}'.format(len(empty_sections_trial)))
print('label distribution for them: {}'.format(empty_sections_trial.Gold.value_counts()))

how many? 0
label distribution for them: Series([], Name: Gold, dtype: int64)


There is 1 empty text in the trial data and 3 in the practice data. All of them have label 0. We keep them because they are probable instances at inference time. However, we exclude them from model training, as they can disturb the training process; instead, we just return 0 when they arrive at inference time.

UPDATE: There are no empty text sections

In [10]:
# Class balance (row count per Gold label) of each split.
# Both splits turn out to have roughly the same distribution.
fincausal_task1_trial_label_distribution = fincausal_task1_trial['Gold'].value_counts()
fincausal_task1_practice_label_distribution = fincausal_task1_practice['Gold'].value_counts()

for report_template, label_distribution in (
    ('label distribution of trial: {}', fincausal_task1_trial_label_distribution),
    ('label distribution of practice: {}', fincausal_task1_practice_label_distribution),
):
    print(report_template.format(label_distribution))

label distribution of trial: 0    8011
1     569
Name: Gold, dtype: int64
label distribution of practice: 0    12468
1     1010
Name: Gold, dtype: int64


Trial has 6.63% positives, and practice has 7.49% positives. We must use that for class weighting, as well as maintain the distribution while splitting

In [25]:
# How many distinct texts appear verbatim in both the trial and practice splits?
trial_texts = set(fincausal_task1_trial.Text)
practice_texts = set(fincausal_task1_practice.Text)
len(trial_texts & practice_texts)

206

There are 206 exactly same texts between practice and trial, and we keep them

In [12]:
# Pair up trial and practice rows that share exactly the same text, so their
# gold labels can be compared side by side (inner join on Text).
merged_inner = fincausal_task1_trial.merge(
    fincausal_task1_practice,
    on='Text',
    suffixes=('_trial', '_practice'),
)
merged_inner.head()

Unnamed: 0,Index_trial,Text,Gold_trial,Index_practice,Gold_practice
0,2.00007,"The current ratio, also known as the working c...",0,316.00076,0
1,2.00008,The ratio is simply calculated by dividing cur...,0,316.00077,0
2,2.00009,"Typically, the higher the current ratio the be...",0,316.00078,0
3,2.00018,Enterprise Value is calculated by taking the m...,0,316.00089,0
4,2.00019,The average FCF of a company is determined by ...,0,316.0009,0


In [33]:
# True when every pair of identical texts carries the same gold label.
trial_gold = merged_inner['Gold_trial']
practice_gold = merged_inner['Gold_practice']
trial_gold.equals(practice_gold)

True

In [34]:
# Rows whose text is identical in both splits but whose gold labels disagree.
gold_mismatch_mask = merged_inner['Gold_trial'].ne(merged_inner['Gold_practice'])
merged_inner.loc[gold_mismatch_mask]

Unnamed: 0,Index_trial,Text,Gold_trial,Index_practice,Gold_practice


There are 0 rows which have the same text but different gold labels, so there is nothing to remove. (Had there been any, we would have had to remove them.)

In [15]:
# Which values of the `Index` column occur in both the practice and trial splits?
practice_index_values = set(fincausal_task1_practice['Index'])
common_indexes = list(practice_index_values.intersection(fincausal_task1_trial['Index']))

In [16]:
# Number of `Index` values shared by the two splits.
n_common_indexes = len(common_indexes)
n_common_indexes

4561

In [17]:
# is the text also equal?
# Join the two splits on `Index` so every practice row is paired with the
# trial row(s) carrying the same Index value, then compare the paired texts.
# The previous check called `.equals` with a Series on one side and a whole
# DataFrame on the other, which is False regardless of the data, and it also
# depended on the frames' row labels lining up.
shared_index_rows = fincausal_task1_practice.merge(
    fincausal_task1_trial,
    on='Index',
    suffixes=('_practice', '_trial'),
)
# Missing texts compare as unequal, so a NaN on either side counts as a mismatch.
bool((shared_index_rows.Text_practice == shared_index_rows.Text_trial).all())

False

There are 4561 common index values between trial and practice, which might be a problem after concatenating because the index values would no longer be unique. Therefore, create your own unique ID after concatenating.

# Splitting

In [35]:
# Stack practice on top of trial, tag every row with a fresh random UUID
# (the `Index` values collide between the two splits), then drop the rows
# whose text is missing.
all_data = (
    pd.concat([fincausal_task1_practice, fincausal_task1_trial])
    .assign(unique_id=lambda combined: [str(uuid.uuid4()) for _ in range(len(combined))])
    .dropna(subset=['Text'])
)

In [36]:
# Confirm that no missing values remain in the combined frame.
all_data.isnull().sum()

Index        0
Text         0
Gold         0
unique_id    0
dtype: int64

In [37]:
# (rows, columns) of the combined frame after dropping missing texts.
all_data_shape = all_data.shape
all_data_shape

(22054, 4)

In [38]:
# Fraction of rows per Gold label in the combined frame.
all_data['Gold'].value_counts().div(len(all_data))

0    0.928403
1    0.071597
Name: Gold, dtype: float64

In [31]:
# Disabled legacy preprocessing, kept for reference only. Everything below is
# a bare string literal, so none of it is executed: in particular, this cell
# does NOT define `fincausal_task1_combined_nonemptytext_noconflictgold`.
"""
# 1. remove the indexes with empty text
fincausal_task1_trial_nonemptytext = fincausal_task1_trial.loc[~fincausal_task1_trial.Index.isin(empty_sections_trial)]
fincausal_task1_practice_nonemptytext = fincausal_task1_practice.loc[~fincausal_task1_practice.Index.isin(empty_sections_practice)]
print(fincausal_task1_trial_nonemptytext.shape, fincausal_task1_practice_nonemptytext.shape)

# 2. remove the indexes from trial and practice (with conflicting labels) before combining
fincausal_task1_trial_nonemptytext_noconflictgold = fincausal_task1_trial_nonemptytext.loc[~fincausal_task1_trial_nonemptytext.Index.isin(merged_inner.Index_trial)]
fincausal_task1_practice_nonemptytext_noconflictgold = fincausal_task1_practice_nonemptytext.loc[~fincausal_task1_practice_nonemptytext.Index.isin(merged_inner.Index_practice)]
print(fincausal_task1_trial_nonemptytext_noconflictgold.shape, fincausal_task1_practice_nonemptytext_noconflictgold.shape)

# 3. trial and practice data can be combined
fincausal_task1_combined_nonemptytext_noconflictgold = pd.concat([fincausal_task1_trial_nonemptytext_noconflictgold, fincausal_task1_practice_nonemptytext_noconflictgold])

# 4. Create a unique ID column
fincausal_task1_combined_nonemptytext_noconflictgold['unique_id'] = [str(uuid.uuid4()) for _ in range(len(fincausal_task1_combined_nonemptytext_noconflictgold))]

# 5. shuffle
fincausal_task1_combined_nonemptytext_noconflictgold = fincausal_task1_combined_nonemptytext_noconflictgold.sample(frac=1).reset_index(drop=True)
"""


(8580, 3) (13478, 3)
(8331, 3) (13179, 3)


# Random Splits

In [39]:
from sklearn.model_selection import train_test_split

In [47]:
# Hold out 10% of the data as the test set, then 10% of the remainder as the
# dev set. Both splits are stratified on Gold and seeded for repeatability.
split_seed = 42
holdout_fraction = 0.1

train_and_dev, test = train_test_split(
    all_data,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=all_data.Gold,
)
train, dev = train_test_split(
    train_and_dev,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=train_and_dev.Gold,
)
print(train.shape, dev.shape, test.shape)

(17863, 4) (1985, 4) (2206, 4)


In [48]:
# Fraction of rows per Gold label in the train split.
train['Gold'].value_counts().div(len(train))

0    0.928399
1    0.071601
Name: Gold, dtype: float64

In [49]:
# Fraction of rows per Gold label in the dev split.
dev['Gold'].value_counts().div(len(dev))

0    0.928463
1    0.071537
Name: Gold, dtype: float64

In [50]:
# Fraction of rows per Gold label in the test split.
test['Gold'].value_counts().div(len(test))

0    0.928377
1    0.071623
Name: Gold, dtype: float64

In [52]:
# Persist the three random splits as CSV files, without the row-label column.
split_destinations = (
    (train, '/media/sarthak/HDD/data_science/fnp_resources/data/task1/all_combined/train.csv'),
    (dev, '/media/sarthak/HDD/data_science/fnp_resources/data/task1/all_combined/dev.csv'),
    (test, '/media/sarthak/HDD/data_science/fnp_resources/data/task1/all_combined/test.csv'),
)
for split_frame, destination in split_destinations:
    split_frame.to_csv(destination, index=False)

# k-fold splits

In [96]:
# 6. divide the data into k-folds with each fold having representative distribution
from sklearn.model_selection import StratifiedKFold, train_test_split

In [93]:
# Five stratified folds. Shuffling is enabled because the combined data is a
# plain concatenation of the practice and trial files; without it each fold
# would be a contiguous slice per class and so be dominated by one source file.
# The fixed seed makes the fold assignment reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [94]:
# Build the per-fold train/test frames.
# The folds are cut from `all_data`, the combined frame with missing texts
# removed. The name used here previously existed only inside a disabled
# (string-literal) cell, so this cell raised NameError on a fresh kernel.
train_dfs = []
test_dfs = []

X = all_data
y = all_data.Gold

for train_index, test_index in skf.split(X, y):
    # `split` yields positional indices, hence `.iloc`. The fold frames go
    # straight into the lists so the `train` / `test` frames of the random
    # split are not overwritten.
    train_dfs.append(X.iloc[train_index])
    test_dfs.append(X.iloc[test_index])

In [103]:
# 7. for each train_df, create a val_df, and save train, val and test dfs as csv
data_root_dir = Path('/media/sarthak/HDD/data_science/fnp_resources/data/task1')

# One output directory per fold: iteration_1, iteration_2, ...
iteration_dirs = [
    data_root_dir / 'iteration_{}'.format(fold_number)
    for fold_number in range(1, len(train_dfs) + 1)
]

# Check every target directory before writing anything. Checking inside the
# loop would leave the earlier folds on disk when a later directory already
# exists, and the next run would then fail on those freshly written folds.
if any(iteration_dir.exists() for iteration_dir in iteration_dirs):
    raise FileExistsError('Iteration dir already exists. Delete the directory first.')

for iteration_dir, train_df, test_df in zip(iteration_dirs, train_dfs, test_dfs):
    # Carve a stratified 10% validation set out of this fold's training data.
    train_df, val_df = train_test_split(train_df,
                                        test_size=0.1,
                                        stratify=train_df.Gold,
                                        random_state=42)
    iteration_dir.mkdir()
    train_df.to_csv(iteration_dir / 'train.csv', index=False)
    val_df.to_csv(iteration_dir / 'val.csv', index=False)
    test_df.to_csv(iteration_dir / 'test.csv', index=False)