In [1]:
from datasets import load_dataset

# Specify the dataset you want to download
dataset_name = "financial_phrasebank"

# Load the dataset
dataset = load_dataset(dataset_name, 'sentences_allagree')

Found cached dataset financial_phrasebank (/Users/carlosvarela/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})

In [3]:
# Access the dataset training split (only one available)
train_data = dataset["train"]

In [4]:
import pandas as pd
import spacy

In [6]:
#converting to dataframes for initial exploration:
train_df = pd.DataFrame(train_data)
print('train dataframe info:')
print(train_df.info())

train dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  2264 non-null   object
 1   label     2264 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 35.5+ KB
None


In [9]:
def remove_dups(df):
    
    duplicates = df['sentence'].duplicated()
    if True in duplicates.unique():
        df['sentence'].drop_duplicates()
        print('Duplicates removed')
    else:
        print('No duplicates found')
        
    return df
# Remove duplicates:
remove_dups(train_df)

# Remove special characters and bring to lowercase:
train_df['cleaned title'] = train_df['sentence'].str.lower().str.replace('[0-9\W_]+', ' ', regex=True)
train_df.head(5)

Duplicates removed


Unnamed: 0,sentence,label,cleaned title
0,"According to Gran , the company has no plans t...",1,according to gran the company has no plans to ...
1,"For the last quarter of 2010 , Componenta 's n...",2,for the last quarter of componenta s net sales...
2,"In the third quarter of 2010 , net sales incre...",2,in the third quarter of net sales increased by...
3,Operating profit rose to EUR 13.1 mn from EUR ...,2,operating profit rose to eur mn from eur mn in...
4,"Operating profit totalled EUR 21.1 mn , up fro...",2,operating profit totalled eur mn up from eur m...


In [10]:
from spacy.lang.en.stop_words import STOP_WORDS
# Loading the language model:
nlp = spacy.load("en_core_web_sm")

# Applying model to a dataframe column:
train_df['docs'] = train_df['cleaned title'].apply(nlp)

# Defining a function to remove stop words and punctuations using spacy's assets:
def nlp_tokenizer(doc):
    docs_no_stops = [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS and not token.is_punct]
    return docs_no_stops

train_df['docs'] = train_df['docs'].apply(nlp_tokenizer)

In [11]:
train_df.head(10)

Unnamed: 0,sentence,label,cleaned title,docs
0,"According to Gran , the company has no plans t...",1,according to gran the company has no plans to ...,"[accord, gran, company, plan, production, russ..."
1,"For the last quarter of 2010 , Componenta 's n...",2,for the last quarter of componenta s net sales...,"[quarter, componenta, s, net, sale, double, eu..."
2,"In the third quarter of 2010 , net sales incre...",2,in the third quarter of net sales increased by...,"[quarter, net, sale, increase, eur, mn, operat..."
3,Operating profit rose to EUR 13.1 mn from EUR ...,2,operating profit rose to eur mn from eur mn in...,"[operating, profit, rise, eur, mn, eur, mn, co..."
4,"Operating profit totalled EUR 21.1 mn , up fro...",2,operating profit totalled eur mn up from eur m...,"[operate, profit, total, eur, mn, eur, mn, rep..."
5,Finnish Talentum reports its operating profit ...,2,finnish talentum reports its operating profit ...,"[finnish, talentum, report, operating, profit,..."
6,Clothing retail chain Sepp+ñl+ñ 's sales incre...,2,clothing retail chain sepp ñl ñ s sales increa...,"[clothing, retail, chain, sepp, ñl, ñ, s, sale..."
7,Consolidated net sales increased 16 % to reach...,2,consolidated net sales increased to reach eur ...,"[consolidated, net, sale, increase, reach, eur..."
8,Foundries division reports its sales increased...,2,foundries division reports its sales increased...,"[foundry, division, report, sale, increase, eu..."
9,"HELSINKI ( AFX ) - Shares closed higher , led ...",2,helsinki afx shares closed higher led by nokia...,"[helsinki, afx, share, close, higher, lead, no..."


In [13]:
all_results = []

# Extract unique strings with exactly 2 characters and store in the list
for doc in train_df['docs']:
    result = list(set(word for word in doc if len(word) == 2))
    all_results.append(result)

# Print all results
print(all_results)

[[], [], ['mn'], ['mn'], ['mn'], ['mn'], ['ñl', 'mn'], [], ['mn'], [], [], [], ['mn'], [], [], [], [], [], [], [], ['mn'], [], [], [], ['mn'], [], [], ['mn'], [], [], [], [], [], [], [], [], ['bn'], [], [], [], [], [], ['mn'], [], [], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], [], [], [], ['ñl', 'mn'], [], [], [], [], ['mn'], ['mn'], ['ac'], [], [], ['mn'], ['mn'], [], ['mn'], ['mn'], ['mn'], [], ['ac'], [], ['mn'], [], [], [], [], [], [], ['bn'], ['mn'], ['mn'], ['mn'], [], ['mn'], [], ['mn'], ['ls'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['mn'], [], [], [], [], [], [], [], [], [], [], ['se'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['mn'], ['mn'], ['mn'], [], [], [], [], [], [], [], ['vv'], [], [], [], [], [], [], [], ['mn'], ['mn'], ['mn'], ['mn'], ['bn'], ['mn'], ['mn'], ['mn'], [], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], [], [], ['mn'], ['mn'], ['mn'], ['mn'], [], [], [], [], ['mn'], ['mn'], ['mn'],

In [15]:
# Filter out empty lists and keep only unique values
cleaned_results = [list(set(result)) for result in all_results if result]

print(cleaned_results)

[['mn'], ['mn'], ['mn'], ['mn'], ['ñl', 'mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['bn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['ñl', 'mn'], ['mn'], ['mn'], ['ac'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['ac'], ['mn'], ['bn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['ls'], ['mn'], ['se'], ['mn'], ['mn'], ['mn'], ['vv'], ['mn'], ['mn'], ['mn'], ['mn'], ['bn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['ac'], ['mn'], ['mn'], ['mn'], ['mn'], ['ac'], ['ac'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['le'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['bn'], ['co'], ['st'], ['ag', 'jp', 'co'], ['va'], ['mr', 'pc'], ['pt'], ['mn'], ['co'], ['ac'], ['mr'], ['mn'], ['mo', 'eb'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['mn'], ['th', 'mn'], ['mn'

In [16]:
# Filter out strings of two or fewer characters from the lists in the 'docs' column
train_df['docs'] = train_df['docs'].apply(lambda doc: [word for word in doc if len(word) > 2])

In [17]:
train_df.head(10)

Unnamed: 0,sentence,label,cleaned title,docs
0,"According to Gran , the company has no plans t...",1,according to gran the company has no plans to ...,"[accord, gran, company, plan, production, russ..."
1,"For the last quarter of 2010 , Componenta 's n...",2,for the last quarter of componenta s net sales...,"[quarter, componenta, net, sale, double, eur, ..."
2,"In the third quarter of 2010 , net sales incre...",2,in the third quarter of net sales increased by...,"[quarter, net, sale, increase, eur, operating,..."
3,Operating profit rose to EUR 13.1 mn from EUR ...,2,operating profit rose to eur mn from eur mn in...,"[operating, profit, rise, eur, eur, correspond..."
4,"Operating profit totalled EUR 21.1 mn , up fro...",2,operating profit totalled eur mn up from eur m...,"[operate, profit, total, eur, eur, represent, ..."
5,Finnish Talentum reports its operating profit ...,2,finnish talentum reports its operating profit ...,"[finnish, talentum, report, operating, profit,..."
6,Clothing retail chain Sepp+ñl+ñ 's sales incre...,2,clothing retail chain sepp ñl ñ s sales increa...,"[clothing, retail, chain, sepp, sale, increase..."
7,Consolidated net sales increased 16 % to reach...,2,consolidated net sales increased to reach eur ...,"[consolidated, net, sale, increase, reach, eur..."
8,Foundries division reports its sales increased...,2,foundries division reports its sales increased...,"[foundry, division, report, sale, increase, eu..."
9,"HELSINKI ( AFX ) - Shares closed higher , led ...",2,helsinki afx shares closed higher led by nokia...,"[helsinki, afx, share, close, higher, lead, no..."


In [18]:
processed_df = train_df[['docs','label']]
processed_df

Unnamed: 0,docs,label
0,"[accord, gran, company, plan, production, russ...",1
1,"[quarter, componenta, net, sale, double, eur, ...",2
2,"[quarter, net, sale, increase, eur, operating,...",2
3,"[operating, profit, rise, eur, eur, correspond...",2
4,"[operate, profit, total, eur, eur, represent, ...",2
...,...,...
2259,"[operate, result, month, period, decrease, pro...",0
2260,"[helsinki, thomson, financial, share, cargotec...",0
2261,"[london, marketwatch, share, price, end, lower...",0
2262,"[operating, profit, fall, eur, eur, include, v...",0


In [19]:
processed_df['entities'] = processed_df['docs'].apply(lambda tokens: ' '.join(tokens))
processed_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df['entities'] = processed_df['docs'].apply(lambda tokens: ' '.join(tokens))


Unnamed: 0,docs,label,entities
0,"[accord, gran, company, plan, production, russ...",1,accord gran company plan production russia com...
1,"[quarter, componenta, net, sale, double, eur, ...",2,quarter componenta net sale double eur eur per...
2,"[quarter, net, sale, increase, eur, operating,...",2,quarter net sale increase eur operating profit...
3,"[operating, profit, rise, eur, eur, correspond...",2,operating profit rise eur eur corresponding pe...
4,"[operate, profit, total, eur, eur, represent, ...",2,operate profit total eur eur represent net sale
...,...,...,...
2259,"[operate, result, month, period, decrease, pro...",0,operate result month period decrease profit eu...
2260,"[helsinki, thomson, financial, share, cargotec...",0,helsinki thomson financial share cargotec fall...
2261,"[london, marketwatch, share, price, end, lower...",0,london marketwatch share price end lower londo...
2262,"[operating, profit, fall, eur, eur, include, v...",0,operating profit fall eur eur include vessel s...


In [20]:
model_df = processed_df[['entities','label']]
model_df

Unnamed: 0,entities,label
0,accord gran company plan production russia com...,1
1,quarter componenta net sale double eur eur per...,2
2,quarter net sale increase eur operating profit...,2
3,operating profit rise eur eur corresponding pe...,2
4,operate profit total eur eur represent net sale,2
...,...,...
2259,operate result month period decrease profit eu...,0
2260,helsinki thomson financial share cargotec fall...,0
2261,london marketwatch share price end lower londo...,0
2262,operating profit fall eur eur include vessel s...,0


In [21]:
model_df.to_csv('sentences_allagree_processed_ver1.2.csv', index = False)