# financial_phrasebank Dataset Preparation

In [1]:
from datasets import load_dataset

# Hugging Face Hub identifier of the corpus to download
dataset_name = "financial_phrasebank"

# Fetch the corpus (a cached copy is reused when present), selecting the
# 'sentences_50agree' configuration
dataset = load_dataset(path=dataset_name, name='sentences_50agree')

Found cached dataset financial_phrasebank (/Users/carlosvarela/.cache/huggingface/datasets/financial_phrasebank/sentences_50agree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# Access the dataset training split (only one available);
# every DataFrame built later in this notebook is derived from it.
train_data = dataset["train"]

In [3]:
# Show the split's summary (feature names and row count)
train_data

Dataset({
    features: ['sentence', 'label'],
    num_rows: 4846
})

In [4]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt

In [5]:
# Convert the loaded split to a pandas DataFrame for initial exploration:
train_df = pd.DataFrame(train_data)
print('train dataframe info:')
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
train_df.info()

train dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  4846 non-null   object
 1   label     4846 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 75.8+ KB
None


In [6]:
# Print the label on its own line: when the label and the frame share one
# print() call, the frame's header row is pushed to the right and no longer
# lines up with the values beneath it. Leaving the frame as the cell's last
# expression lets the notebook render it as a table.
print('train dataframe description:')
train_df.describe()

train dataframe description:               label
count  4846.000000
mean      1.156624
std       0.617616
min       0.000000
25%       1.000000
50%       1.000000
75%       2.000000
max       2.000000


In [7]:
#Perform initial EDA prior to cleaning for training split:
from ydata_profiling import ProfileReport

# Build the profiling report for the raw (pre-cleaning) frame; leaving the
# report object as the cell's last expression renders it in the notebook.
pre_cleaning_profile = ProfileReport(train_df, title="Financial Phrasebank Profiling Report")
pre_cleaning_profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Upon initial inspection:
- 6 duplicated rows were discovered
- Stop words clearly need to be removed
- No missing values found

## Data Cleaning
**Deleting Unnecessary Data**
- We will remove leading or trailing spaces
- We will remove special characters prior to tokenization for better performance

In [8]:
def remove_dups(df):
    """Drop rows whose 'sentence' repeats an earlier row, keeping the first.

    The frame is modified in place and the very same object is returned, so
    both ``remove_dups(df)`` and ``df = remove_dups(df)`` leave ``df``
    deduplicated. Index labels of the surviving rows are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'sentence' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without repeated sentences.
    """
    # duplicated() flags every occurrence after the first one.
    if df['sentence'].duplicated().any():
        # drop_duplicates() returns a new object unless inplace=True; calling
        # it on the column and discarding the result removes nothing. Dropping
        # on the frame (keyed on 'sentence') removes the whole duplicated row.
        df.drop_duplicates(subset='sentence', keep='first', inplace=True)
        print('Duplicates removed')
    else:
        print('No duplicates found')

    return df
# Remove duplicates. The returned frame is kept so the deduplication is not
# lost should remove_dups hand back a new object instead of mutating its input.
train_df = remove_dups(train_df)

# Lowercase, remove the special characters '/', ':' and '@', and trim
# leading/trailing whitespace. The pattern is a raw string: '\/' in a normal
# string literal is an invalid escape sequence and triggers a warning.
train_df['cleaned title'] = (
    train_df['sentence']
    .str.lower()
    .str.replace(r'[/:@]', '', regex=True)
    .str.strip()
)
train_df.head(5)

Duplicates removed


Unnamed: 0,sentence,label,cleaned title
0,"According to Gran , the company has no plans t...",1,"according to gran , the company has no plans t..."
1,Technopolis plans to develop in stages an area...,1,technopolis plans to develop in stages an area...
2,The international electronic industry company ...,0,the international electronic industry company ...
3,With the new production plant the company woul...,2,with the new production plant the company woul...
4,According to the company 's updated strategy f...,2,according to the company 's updated strategy f...


**Inquiries**
- Should we leave capital letters for certain words like USA?
- Should we keep punctuation for situations such as U.S.?

**Tokenization, lemmatization, and stop word removal**

- Tokenizers divide strings into lists of substrings. For example, tokenizers can be used to find the words and punctuation in a string.
- spaCy: spaCy is a library used for NLP. We will use it for text pre-processing, for removing stop words, and for extracting information from the text using its modules and functions.

In conclusion, we will use spaCy to tokenize the text and to remove stop words by comparing against its pre-built stop word list.

**Preparing the records**

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS
# Loading the language model:
# NOTE(review): presumably "en_core_web_sm" has to be installed separately
# before this call succeeds — confirm for your environment.
nlp = spacy.load("en_core_web_sm")

# Applying model to a dataframe column:
# 'docs' holds the raw pipeline output only temporarily; the same column is
# overwritten with plain token lists at the end of this cell.
train_df['docs'] = train_df['cleaned title'].apply(nlp)

# Defining a function to remove stop words, punctuation and whitespace using spacy's assets:
def nlp_tokenizer(doc):
    """Return the lemmas of ``doc`` without stop words, punctuation or whitespace.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_``, ``is_punct`` and ``is_space``
        (for example a document produced by the loaded spaCy pipeline).

    Returns
    -------
    list of str
        Lemmas of the remaining tokens, in their original order.
    """
    return [
        token.lemma_
        for token in doc
        # Whitespace-only tokens (e.g. from repeated or leading spaces) are
        # neither punctuation nor stop words, so they have to be filtered
        # explicitly or they end up in the token list.
        if not (token.is_punct or token.is_space)
        and token.lemma_ not in STOP_WORDS
    ]

# Replace each pipeline output stored in 'docs' with its filtered list of lemmas:
train_df['docs'] = train_df['docs'].apply(nlp_tokenizer)

In [10]:
# Preview the first rows, now including the new 'docs' column
train_df.head(3)

Unnamed: 0,sentence,label,cleaned title,docs
0,"According to Gran , the company has no plans t...",1,"according to gran , the company has no plans t...","[accord, gran, company, plan, production, russ..."
1,Technopolis plans to develop in stages an area...,1,technopolis plans to develop in stages an area...,"[technopoli, plan, develop, stage, area, 100,0..."
2,The international electronic industry company ...,0,the international electronic industry company ...,"[international, electronic, industry, company,..."


In [11]:
# Keep only the token lists and their labels for a quick look:
train_docs = train_df[['docs','label']]
train_docs

Unnamed: 0,docs,label
0,"[accord, gran, company, plan, production, russ...",1
1,"[technopoli, plan, develop, stage, area, 100,0...",1
2,"[international, electronic, industry, company,...",0
3,"[new, production, plant, company, increase, ca...",2
4,"[accord, company, update, strategy, year, 2009...",2
...,...,...
4841,"[london, marketwatch, share, price, end, lower...",0
4842,"[rinkuskiai, beer, sale, fall, 6.5, cent, 4.16...",1
4843,"[operating, profit, fall, eur, 35.4, mn, eur, ...",0
4844,"[net, sale, paper, segment, decrease, eur, 221...",0


## EDA Post-cleaning

In [12]:
# Collapse every token list back into a single space-separated string:
train_df['entities'] = train_df['docs'].apply(' '.join)

# Build the post-cleaning profiling report and render it as the cell output
cleaned_train_set_profile = ProfileReport(
    train_df,
    title="Financial Phrasebank (training data) Report post cleaning",
)
cleaned_train_set_profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [13]:
# Write the post-cleaning report to an HTML file (path is relative to the
# current working directory):
cleaned_train_set_profile.to_file("cleaned_Phrasebank_train_set_profile.html")



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Export the modelling inputs: token lists plus labels, without the index column.
# NOTE(review): 'docs' holds Python lists, so the CSV will presumably contain
# their string representation and need parsing when read back — confirm.
modeling_df = train_df[['docs','label']]
modeling_df.to_csv('sentences_50agree_processed.csv', index = False)