In [20]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/92/e8/86c36e1b13007ca9c89381adac6c078cfc8fb71841a76c08a3fe3eca91d3/imbalanced_learn-0.12.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
   ---------------------------------------- 0.0/257.7 kB ? eta -:--:--
   ------ --------------------------------- 41.0/257.7 kB 1.9 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:00:01
   --------- ------------------------------ 61.4/257.7 kB 1.6 MB/s eta 0:0

In [1]:
import glob
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler 
import os

## Concatenating all files

In [11]:
file_directory = r"C:\Users\fagos\PycharmProjects\pythonProject\DDB-4\Financial-News-Sentiment-Analysis-NLP-Techniques\data\processed\financial_phrasebank"
file_pattern = "*_processed.csv"
matching_files = glob.glob(os.path.join(file_directory, file_pattern))
all_data = []

for file_name in matching_files:
    # Read each CSV file into a dataframe
    df = pd.read_csv(file_name)
    # Append the dataframe to the list of all data
    all_data.append(df)

# Concatenate all dataframes into a single dataframe
financial_phrases = pd.concat(all_data, ignore_index=True)

# Print or process the concatenated dataframe as needed
financial_phrases

Unnamed: 0,docs,label
0,"['accord', 'gran', 'company', 'plan', 'product...",1
1,"['technopoli', 'plan', 'develop', 'stage', 'ar...",1
2,"['international', 'electronic', 'industry', 'c...",0
3,"['new', 'production', 'plant', 'company', 'inc...",2
4,"['accord', 'company', 'update', 'strategy', 'y...",2
...,...,...
14759,"['operate', 'result', '12', 'month', 'period',...",0
14760,"['helsinki', 'thomson', 'financial', 'share', ...",0
14761,"['london', 'marketwatch', 'share', 'price', 'e...",0
14762,"['operating', 'profit', 'fall', 'eur', '35.4',...",0


## Using All Agree File

In [13]:
file_directory = r"C:\Users\fagos\PycharmProjects\pythonProject\DDB-4\Financial-News-Sentiment-Analysis-NLP-Techniques\data\processed\financial_phrasebank"
file_pattern = "sentences_allagree_processed.csv"

matching_file = glob.glob(os.path.join(file_directory, file_pattern))

financial_phrases = pd.read_csv(matching_file[0])
financial_phrases

Unnamed: 0,docs,label
0,"['accord', 'gran', 'company', 'plan', 'product...",1
1,"['quarter', '2010', 'componenta', 'net', 'sale...",2
2,"['quarter', '2010', 'net', 'sale', 'increase',...",2
3,"['operating', 'profit', 'rise', 'eur', '13.1',...",2
4,"['operate', 'profit', 'total', 'eur', '21.1', ...",2
...,...,...
2254,"['operate', 'result', '12', 'month', 'period',...",0
2255,"['helsinki', 'thomson', 'financial', 'share', ...",0
2256,"['london', 'marketwatch', 'share', 'price', 'e...",0
2257,"['operating', 'profit', 'fall', 'eur', '35.4',...",0


In [14]:
financial_phrases['label'].value_counts()

label
1    1386
2     570
0     303
Name: count, dtype: int64

In [15]:
financial_phrases['docs'] = financial_phrases['docs'].apply(ast.literal_eval)
y = financial_phrases['label']

## Undersampling

In [18]:
X_str = [' '.join(tokens) for tokens in financial_phrases['docs']]

# TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X_str)

#Setting the minimum number of classes to be equal to the smallest class. 
n_samples_per_class = 303

# Create an instance of RandomUnderSampler
rusU = RandomUnderSampler(sampling_strategy={0: n_samples_per_class,
                                            1: n_samples_per_class,
                                            2: n_samples_per_class})

# Perform random under-sampling
X_resampled_U, y_resampled_U = rusU.fit_resample(X_tfidf, y)

In [19]:
y_resampled_U.value_counts()

label
0    303
1    303
2    303
Name: count, dtype: int64

In [25]:
financial_phrases_under = pd.DataFrame(X_resampled_U.toarray())
financial_phrases_under['label'] = y_resampled_U
financial_phrases_under.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5006,5007,5008,5009,5010,5011,5012,5013,5014,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
file_name = "sentences_allagree_processed.csv"
financial_phrases_under.tocsv(os.path.join(file_directory, file_name), index=False)

## Oversampling

In [17]:
#Setting the maximum number of classes to be equal to the largest class. 
n_samples_per_class = 8935

# Assuming y contains labels for the three-class problem
# Create an instance of RandomUnderSampler
rusO = RandomOverSampler(sampling_strategy={0: n_samples_per_class,
                                            1: n_samples_per_class,
                                            2: n_samples_per_class})

# Perform random under-sampling
X_resampled_O, y_resampled_O = rusO.fit_resample(X_tfidf, y)

In [18]:
y_resampled_O.value_counts()

label
1    8935
0    8935
2    8935
Name: count, dtype: int64