In [7]:
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack
import pickle

---
## 1. Loading the data
---

In [11]:
# Get the current directory of the notebook
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
csv_file_path_1 = os.path.join(notebook_dir, '../data/train_bodies.csv')
csv_file_path_2 = os.path.join(notebook_dir, '../data/train_stances.csv')

In [12]:
df_1 = pd.read_csv(csv_file_path_1)
df_2 = pd.read_csv(csv_file_path_2)
# Joining both dataset
data = pd.merge(df_1, df_2, on='Body ID', how='left')
duplicates_to_drop = data[data['Stance'] == 'unrelated'].duplicated()
# Invert the boolean Series to keep non-duplicate rows in 'unrelated' class
data_2 = data[~(duplicates_to_drop & (data['Stance'] == 'unrelated'))]
df_model = data_2.copy() # Making a copying of data_2
random_seed = 42
df_model_sample = df_model.sample(n=10000, random_state=random_seed)

In [13]:
# Separate features and labels
X_headline = df_model_sample['Headline']
X_article = df_model_sample['articleBody']
label_mapping = {'unrelated': 0, 'agree': 1, 'discuss': 2, 'disagree': 3}
y = df_model_sample['Stance'].map(label_mapping)

---
## 2. Functions to Clean the Data
---

In [14]:
# Text preprocessing functions
def     clean_text(text):

    text = text.lower()  
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    return text

In [15]:
def     tokenize_and_stem(text):
    
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens if token not in stopwords.words('english')]
    return " ".join(tokens)

---
## 3. Splitting the dataset into train and test sets
---

In [16]:
# Split the dataset into train and test sets
X_headline_train, X_headline_test, X_article_train, X_article_test, y_train, y_test = train_test_split(
    X_headline, X_article, y, test_size=0.2, random_state=42
)

In [17]:
# Cleaning the dataset
X_headline_train_cleaned = X_headline_train.apply(clean_text)
X_article_train_cleaned = X_article_train.apply(clean_text)
X_headline_test_cleaned = X_headline_test.apply(clean_text)
X_article_test_cleaned = X_article_test.apply(clean_text)

In [18]:
# Cleaning still
X_headline_train_preprocessed = X_headline_train_cleaned.apply(tokenize_and_stem)
X_article_train_preprocessed = X_article_train_cleaned.apply(tokenize_and_stem)

In [19]:
# Cleaning
X_headline_test_preprocessed = X_headline_test_cleaned.apply(tokenize_and_stem)
X_article_test_preprocessed = X_article_test_cleaned.apply(tokenize_and_stem)

---
## 4. Text Preprocessing
---

In [20]:
# Initialize TF-IDF vectorizers for headline and article body
tfidf_headline = TfidfVectorizer(max_features=3638, ngram_range=(1, 2))
tfidf_article = TfidfVectorizer(max_features=27336, ngram_range=(1, 2))

X_headline_train_tfidf = tfidf_headline.fit_transform(X_headline_train_preprocessed)
X_article_train_tfidf = tfidf_article.fit_transform(X_article_train_preprocessed)
X_headline_test_tfidf = tfidf_headline.transform(X_headline_test_preprocessed)
X_article_test_tfidf = tfidf_article.transform(X_article_test_preprocessed)

---
## 5. Data Augmentation using SMOTE
---


In [25]:
# Data Augmentation using SMOTE
smote_headline = SMOTE(sampling_strategy='auto', random_state=42)
X_headline_train_tfidf_resampled, y_headline_train_resampled = smote_headline.fit_resample(X_headline_train_tfidf, y_train)

smote_article = SMOTE(sampling_strategy='auto', random_state=42)
X_article_train_tfidf_resampled, y_article_train_resampled = smote_article.fit_resample(X_article_train_tfidf, y_train)

---
## 6. Combine Features
---

In [26]:
# Combine Features
X_train_combined = hstack((X_headline_train_tfidf_resampled, X_article_train_tfidf_resampled))
X_test_combined = hstack((X_headline_test_tfidf, X_article_test_tfidf))

---
## 7. Model
---

In [27]:
lr_model = LogisticRegression(max_iter=10000, class_weight='balanced', solver='saga', C=10, multi_class='multinomial')
lr_model.fit(X_train_combined, y_headline_train_resampled)

y_pred = lr_model.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.747
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.81      0.84      1488
           1       0.39      0.50      0.44       147
           2       0.54      0.60      0.57       333
           3       0.31      0.47      0.38        32

    accuracy                           0.75      2000
   macro avg       0.53      0.60      0.55      2000
weighted avg       0.77      0.75      0.76      2000



In [28]:
rf_model = RandomForestClassifier(n_estimators=600, class_weight='balanced', random_state=42, criterion='entropy',
                                 max_depth=40, min_samples_leaf=2)

rf_model.fit(X_train_combined, y_headline_train_resampled)

y_pred = rf_model.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.793
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88      1488
           1       0.37      0.30      0.33       147
           2       0.68      0.53      0.60       333
           3       0.29      0.28      0.29        32

    accuracy                           0.79      2000
   macro avg       0.55      0.51      0.52      2000
weighted avg       0.78      0.79      0.78      2000



In [29]:
dt_clf = DecisionTreeClassifier(max_depth=60, random_state=42, class_weight='balanced', criterion='entropy',
                                min_samples_split=4)
dt_clf.fit(X_train_combined, y_headline_train_resampled)

y_pred = dt_clf.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Decision Tree Classification Report:\n", classification_rep)

Accuracy: 0.8415
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      1488
           1       0.46      0.39      0.42       147
           2       0.73      0.66      0.69       333
           3       0.57      0.38      0.45        32

    accuracy                           0.84      2000
   macro avg       0.66      0.59      0.62      2000
weighted avg       0.83      0.84      0.84      2000



In [30]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(max_depth=40, n_estimators=600, learning_rate=0.1, random_state=42)

xgb_model.fit(X_train_combined, y_headline_train_resampled)
y_pred = xgb_model.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.881
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.97      0.94      1488
           1       0.61      0.44      0.51       147
           2       0.86      0.73      0.79       333
           3       0.52      0.34      0.42        32

    accuracy                           0.88      2000
   macro avg       0.73      0.62      0.66      2000
weighted avg       0.87      0.88      0.87      2000



In [None]:

# Save the trained models
#with open('Log_reg_model.pkl', 'wb') as f:
#    pickle.dump(lr_model, f)

#with open('RF_clf.pkl', 'wb') as f:
#    pickle.dump(rf_model, f)

#with open('DT_clf.pkl', 'wb') as f:
#    pickle.dump(dt_clf, f)

#with open('xgb_model.pkl', 'wb') as f:
#    pickle.dump(xgb_model, f)