# Modelling

In [2]:
import pandas as pd

# Read the cleaned comments dataset from its CSV file
clean_data_path = 'Data/clean_data.csv'
df = pd.read_csv(clean_data_path)

# Leave the frame as the last expression so the notebook renders it
df

Unnamed: 0,sentiment,raw_comm,nouns_comm
0,neutral,Technopolis plans to develop in stages an area...,Technopolis stages area meters order companies...
1,negative,The international electronic industry company ...,industry company tens employees facility layof...
2,positive,With the new production plant the company woul...,production plant company capacity increase dem...
3,positive,According to the company 's updated strategy f...,company strategy years term sales growth range...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,FINANCING growth strategy circuit boards PCBs
...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,Share prices rebound bank stocks weakness
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,beer sales per cent litres beer sales cent litres
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,profit mn mn vessel sales gain mn
4843,negative,Net sales of the Paper segment decreased to EU...,sales segment mn quarter mn quarter profit ite...


### Split the data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the split is stratified on the
# 'sentiment' label and seeded so it is reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['sentiment'],
    random_state=42,
)

# Print the per-class row counts of each part (training counts are shown
# as they stand before any downsampling).
for heading, part in (
    ("Training set class distribution before downsampling:", train_df),
    ("Test set class distribution:", test_df),
):
    print(heading)
    print(part['sentiment'].value_counts())

Training set class distribution before downsampling:
neutral     2302
positive    1091
negative     483
Name: sentiment, dtype: int64
Test set class distribution:
neutral     576
positive    272
negative    121
Name: sentiment, dtype: int64


### Downsampling

In [6]:
# Every class is reduced to the row count of the rarest class in the training set
min_class_size = train_df['sentiment'].value_counts().min()

# Draw the same number of rows from each sentiment class. Each class is sampled
# independently with the same seed, in the order positive, negative, neutral.
train_positive, train_negative, train_neutral = (
    train_df[train_df['sentiment'] == label].sample(min_class_size, random_state=42)
    for label in ('positive', 'negative', 'neutral')
)

# Stack the three per-class samples, then shuffle the rows and renumber the
# index so the classes are interleaved rather than grouped.
balanced_parts = pd.concat([train_positive, train_negative, train_neutral])
train_df_downsampled = balanced_parts.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the class counts of the balanced training set
print("Training set class distribution after downsampling:")
print(train_df_downsampled['sentiment'].value_counts())


Training set class distribution after downsampling:
negative    483
positive    483
neutral     483
Name: sentiment, dtype: int64


### TF-IDF Vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text columns that are vectorized below
TEXT_COLUMNS = ['nouns_comm', 'raw_comm']

# Vocabulary cap applied to each TF-IDF vectorizer
MAX_TFIDF_FEATURES = 2000

# Work on explicit copies before filling missing values. `test_df` was selected
# out of `df` by train_test_split, and assigning columns directly into such a
# selection can raise pandas' SettingWithCopyWarning (depending on the installed
# pandas / scikit-learn versions). Rebinding to a copy also makes this cell safe
# to re-run.
train_df_downsampled = train_df_downsampled.copy()
test_df = test_df.copy()

# Replace NaN in the text columns with an empty string so the vectorizers
# only ever receive str values.
train_df_downsampled[TEXT_COLUMNS] = train_df_downsampled[TEXT_COLUMNS].fillna('')
test_df[TEXT_COLUMNS] = test_df[TEXT_COLUMNS].fillna('')

# One TF-IDF vectorizer per text column
tfidf_vectorizer_nouns = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
tfidf_vectorizer_raw = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# Fit each vectorizer on the (downsampled) training text only, then reuse the
# fitted vocabulary/weights to transform the test text.
X_train_nouns = tfidf_vectorizer_nouns.fit_transform(train_df_downsampled['nouns_comm'])
X_test_nouns = tfidf_vectorizer_nouns.transform(test_df['nouns_comm'])

X_train_raw = tfidf_vectorizer_raw.fit_transform(train_df_downsampled['raw_comm'])
X_test_raw = tfidf_vectorizer_raw.transform(test_df['raw_comm'])

# Target labels for both feature sets
y_train = train_df_downsampled['sentiment']
y_test = test_df['sentiment']


## Naive Bayes Model

### Train the raw-text and nouns models

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Build and fit one multinomial Naive Bayes classifier per feature set
# ('nouns_comm' and 'raw_comm'), both with alpha=0.1. `fit` returns the
# estimator itself, so construction and fitting are chained.
nb_nouns = MultinomialNB(alpha=0.1).fit(X_train_nouns, y_train)
nb_raw = MultinomialNB(alpha=0.1).fit(X_train_raw, y_train)

# Predictions of the 'nouns_comm' model on the training and test sets
y_train_pred_nouns = nb_nouns.predict(X_train_nouns)
y_test_pred_nouns = nb_nouns.predict(X_test_nouns)

# Predictions of the 'raw_comm' model on the training and test sets
y_train_pred_raw = nb_raw.predict(X_train_raw)
y_test_pred_raw = nb_raw.predict(X_test_raw)


### Compute models' accuracy

In [21]:
# Each entry is (heading, true labels, predicted labels). The order is:
# 'nouns_comm' train, 'nouns_comm' test, 'raw_comm' train, 'raw_comm' test.
# Every heading except the first starts with a newline to separate the reports.
evaluation_runs = (
    ("Training Set - Naive Bayes Model on 'nouns_comm'", y_train, y_train_pred_nouns),
    ("\nTest Set - Naive Bayes Model on 'nouns_comm'", y_test, y_test_pred_nouns),
    ("\nTraining Set - Naive Bayes Model on 'raw_comm'", y_train, y_train_pred_raw),
    ("\nTest Set - Naive Bayes Model on 'raw_comm'", y_test, y_test_pred_raw),
)

# Print the accuracy and the per-class classification report for every run
for heading, y_true, y_pred in evaluation_runs:
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))


Training Set - Naive Bayes Model on 'nouns_comm'
Accuracy: 0.855072463768116
Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.92      0.84       483
     neutral       0.92      0.88      0.90       483
    positive       0.90      0.76      0.83       483

    accuracy                           0.86      1449
   macro avg       0.86      0.86      0.86      1449
weighted avg       0.86      0.86      0.86      1449


Test Set - Naive Bayes Model on 'nouns_comm'
Accuracy: 0.5593395252837977
Classification Report:
               precision    recall  f1-score   support

    negative       0.31      0.73      0.44       121
     neutral       0.80      0.57      0.67       576
    positive       0.45      0.45      0.45       272

    accuracy                           0.56       969
   macro avg       0.52      0.58      0.52       969
weighted avg       0.64      0.56      0.58       969


Training Set - Naive Bayes Model on '