In [20]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/92/e8/86c36e1b13007ca9c89381adac6c078cfc8fb71841a76c08a3fe3eca91d3/imbalanced_learn-0.12.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
   ---------------------------------------- 0.0/257.7 kB ? eta -:--:--
   ------ --------------------------------- 41.0/257.7 kB 1.9 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:0

In [1]:
import glob
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler 
import os

## Using the All-Agree File

In [16]:
# Folder holding the processed Financial PhraseBank CSVs; later cells reuse
# `file_directory` when writing their own outputs.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
file_directory = r"C:\Users\fagos\PycharmProjects\pythonProject\DDB-4\Financial-News-Sentiment-Analysis-NLP-Techniques\data\processed\financial_phrasebank"
file_pattern = "sentences_allagree_processed_ver1.2.csv"

matching_file = glob.glob(os.path.join(file_directory, file_pattern))
if not matching_file:
    # Fail with an explicit message instead of an opaque IndexError on matching_file[0].
    raise FileNotFoundError(
        f"No file matching {file_pattern!r} found in {file_directory!r}"
    )

# Load the first match and drop rows with missing values without mutating in place,
# so re-running the cell always rebuilds the frame from the CSV.
financial_phrases = pd.read_csv(matching_file[0]).dropna()
financial_phrases

Unnamed: 0,entities,label
0,accord gran company plan production russia com...,1
1,quarter componenta net sale double eur eur per...,2
2,quarter net sale increase eur operating profit...,2
3,operating profit rise eur eur corresponding pe...,2
4,operate profit total eur eur represent net sale,2
...,...,...
2259,operate result month period decrease profit eu...,0
2260,helsinki thomson financial share cargotec fall...,0
2261,london marketwatch share price end lower londo...,0
2262,operating profit fall eur eur include vessel s...,0


In [4]:
# Number of rows per label before any balancing
financial_phrases.loc[:, 'label'].value_counts()

label
1    1391
2     570
0     303
Name: count, dtype: int64

In [22]:
# Label 0 is the minority class; its row count is the sampling target for the other two.
neg = financial_phrases.loc[financial_phrases['label'] == 0]

# Draw len(neg) rows from each larger class, using a fixed seed for each draw.
neu = financial_phrases.loc[financial_phrases['label'] == 1].sample(n=len(neg), random_state=10)
pos = financial_phrases.loc[financial_phrases['label'] == 2].sample(n=len(neg), random_state=10)

print(pos.shape, neg.shape, neu.shape)

(303, 2) (303, 2) (303, 2)


In [31]:
# Stack the three equally sized class subsets and replace the inherited row
# labels with a fresh 0..n-1 index (the old labels are discarded, not kept as a column).
balanced_dataset = pd.concat([pos, neu, neg], axis=0).reset_index(drop=True)

In [33]:
# Confirm that every label now has the same number of rows
balanced_label_counts = balanced_dataset['label'].value_counts()
balanced_label_counts

label
2    303
1    303
0    303
Name: count, dtype: int64

In [35]:
# Write the balanced frame into the same folder the source CSV was read from,
# leaving the row index out of the file.
file_name = "sentences_allagree_processed_ver2_balanced.csv"
balanced_csv_path = os.path.join(file_directory, file_name)
balanced_dataset.to_csv(balanced_csv_path, index=False)

## Undersampling TF-IDF features with `RandomUnderSampler`

In [17]:
# TF-IDF vectorizer fitted on the text column
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(financial_phrases['entities'])

# Target labels, row-aligned with X_tfidf. Defined in this cell so it does not
# depend on a `y` left over from an earlier kernel session (NameError on a fresh run).
y = financial_phrases['label']

# Every class is reduced to the size of the smallest class; the size is derived
# from the data instead of being hard-coded.
n_samples_per_class = int(y.value_counts().min())

# Create an instance of RandomUnderSampler.
# random_state is fixed so the drawn subset is reproducible across runs; 10 is the
# same seed this notebook passes to DataFrame.sample for the manual undersampling.
rusU = RandomUnderSampler(sampling_strategy={0: n_samples_per_class,
                                            1: n_samples_per_class,
                                            2: n_samples_per_class},
                          random_state=10)

# Perform random under-sampling
X_resampled_U, y_resampled_U = rusU.fit_resample(X_tfidf, y)

In [18]:
# Class sizes after the random under-sampling step
resampled_label_counts = y_resampled_U.value_counts()
resampled_label_counts

label
0    303
1    303
2    303
Name: count, dtype: int64

In [19]:
# Densify the resampled feature matrix into a DataFrame (columns are positional
# integers) and attach the resampled labels as a trailing 'label' column.
financial_phrases_under = pd.DataFrame(X_resampled_U.toarray()).assign(label=y_resampled_U)
financial_phrases_under.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4184,4185,4186,4187,4188,4189,4190,4191,4192,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# NOTE(review): this file name is identical to the one `balanced_dataset` is written
# to earlier in this notebook, so running this cell replaces that CSV with the
# TF-IDF feature matrix -- confirm the overwrite is intended or pick a distinct name.
file_name = "sentences_allagree_processed_ver2_balanced.csv"
# DataFrame has no `tocsv` attribute (AttributeError); the writer method is `to_csv`.
financial_phrases_under.to_csv(os.path.join(file_directory, file_name), index=False)