In [1]:
import pandas as pd

file_path = 'FPB.csv'
data = pd.read_csv(file_path, header = None, encoding='ISO-8859-1')
data.head()

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [2]:
import nltk
nltk.download('punkt') 
nltk.download('stopwords') 
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))

def tokenize_stem_remove_stopwords_punct(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and token not in string.punctuation]
    return stemmed_tokens


In [4]:
first_five_processed = data.iloc[:5, 1].apply(tokenize_stem_remove_stopwords_punct)

first_five_processed

0    [accord, gran, compani, plan, move, product, r...
1    [technopoli, plan, develop, stage, area, less,...
2    [the, intern, electron, industri, compani, elc...
3    [with, new, product, plant, compani, would, in...
4    [accord, compani, 's, updat, strategi, year, 2...
Name: 1, dtype: object

In [5]:
from sklearn.model_selection import train_test_split

labels = data.iloc[:, 0]
text = data.iloc[:, 1]

# Splitting the data into training, validation, and test sets (80/10/10 split)
X_train, X_temp, y_train, y_temp = train_test_split(text, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Binary representation
vectorizer_binary = CountVectorizer(binary=True)
X_train_binary = vectorizer_binary.fit_transform(X_train)
X_test_binary = vectorizer_binary.transform(X_test)

# Frequency-based representation
vectorizer_freq = CountVectorizer(binary=False)
X_train_freq = vectorizer_freq.fit_transform(X_train)
X_test_freq = vectorizer_freq.transform(X_test)

# TF-IDF representation
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)


In [7]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_binary, y_train)

# Predictions for evaluation
predictions = model.predict(X_test_binary)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.preprocessing import label_binarize

# Binarize the labels for multi-class AUROC calculation
y_test_binarized = label_binarize(y_test, classes=sorted(y_train.unique()))
y_score = model.predict_proba(X_test_binary)

In [9]:
# Binary Model
model_binary = LogisticRegression()
model_binary.fit(X_train_binary, y_train)
predictions_binary = model_binary.predict(X_test_binary)
y_score_binary = model_binary.predict_proba(X_test_binary)

# Evaluation
auc_binary = roc_auc_score(y_test_binarized, y_score_binary, multi_class='ovr')
f1_macro_binary = f1_score(y_test, predictions_binary, average='macro')
f1_micro_binary = f1_score(y_test, predictions_binary, average='micro')

print(f"Binary Model - AUROC (OvR): {auc_binary}, Macro F1: {f1_macro_binary}, Micro F1: {f1_micro_binary}")


Binary Model - AUROC (OvR): 0.898840980468315, Macro F1: 0.7323926837008293, Micro F1: 0.7731958762886598


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Frequency-Based Model
model_freq = LogisticRegression()
model_freq.fit(X_train_freq, y_train)
predictions_freq = model_freq.predict(X_test_freq)
y_score_freq = model_freq.predict_proba(X_test_freq)

# Evaluation
auc_freq = roc_auc_score(y_test_binarized, y_score_freq, multi_class='ovr')
f1_macro_freq = f1_score(y_test, predictions_freq, average='macro')
f1_micro_freq = f1_score(y_test, predictions_freq, average='micro')

print(f"Frequency-Based Model - AUROC (OvR): {auc_freq}, Macro F1: {f1_macro_freq}, Micro F1: {f1_micro_freq}")


Frequency-Based Model - AUROC (OvR): 0.8952190729788353, Macro F1: 0.7378383218454935, Micro F1: 0.777319587628866


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# TF-IDF Model
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train)
predictions_tfidf = model_tfidf.predict(X_test_tfidf)
y_score_tfidf = model_tfidf.predict_proba(X_test_tfidf)

# Evaluation
auc_tfidf = roc_auc_score(y_test_binarized, y_score_tfidf, multi_class='ovr')
f1_macro_tfidf = f1_score(y_test, predictions_tfidf, average='macro')
f1_micro_tfidf = f1_score(y_test, predictions_tfidf, average='micro')

print(f"TF-IDF Model - AUROC (OvR): {auc_tfidf}, Macro F1: {f1_macro_tfidf}, Micro F1: {f1_micro_tfidf}")


TF-IDF Model - AUROC (OvR): 0.8993878326289192, Macro F1: 0.7208026136171498, Micro F1: 0.777319587628866
