In [33]:
import pandas as pd
import numpy as np

In [38]:
train_set = pd.read_csv('dataset/train_processed.csv')
train_set.head()

Unnamed: 0,Text,label,clean_text,label_num
0,WASHINGTON (Reuters) - The head of a conservat...,Real,washington reuters head conservative republica...,1
1,WASHINGTON (Reuters) - Transgender people will...,Real,washington reuters transgender people allowed ...,1
2,WASHINGTON (Reuters) - The special counsel inv...,Real,washington reuters special counsel investigati...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,Real,washington reuters trump campaign adviser geor...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,Real,seattlewashington reuters president donald tru...,1


In [41]:
train_set.dropna(inplace=True)

In [34]:
test_set = pd.read_csv('dataset/test_processed.csv')
test_set.head()

Unnamed: 0,Text,label,clean_text,label_num
0,"((In March 30 item, corrects spelling of Kisl...",Real,march 30 item corrects spelling kislyak paragr...,1
1,((Refiles December 15 story to clarify areas ...,Real,refiles december 15 story clarify area control...,1
2,((This Dec. 5 story corrects year in 2nd para...,Real,dec 5 story corrects year 2nd paragraph 2011 2...,1
3,((This Dec. 9 story corrects year in 2nd para...,Real,dec 9 story corrects year 2nd paragraph 2011 2...,1
4,((This December 4 story has been corrected to...,Real,december 4 story corrected change last year 20...,1


In [42]:
test_set.dropna(inplace=True)

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
vectorizer = TfidfVectorizer(max_features=15000, stop_words='english', ngram_range=(1, 3))

In [47]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### TF-IDF

In [46]:
tfidf_X_train = vectorizer.fit_transform(train_set['clean_text'])
tfidf_X_test = vectorizer.fit_transform(test_set['clean_text'])

In [69]:
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [79]:
train_features = torch.FloatTensor(tfidf_X_train.toarray())
train_labels = torch.LongTensor(train_set['label_num'].values)
test_features = torch.FloatTensor(tfidf_X_test.toarray())
test_labels = torch.LongTensor(test_set['label_num'].values)

In [80]:
train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

In [81]:
# Split the training data into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size]
)


In [82]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [83]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_f1 = -np.inf
    
    def __call__(self, current_f1):
        if current_f1 > self.best_f1 + self.min_delta:
            self.best_f1 = current_f1
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

early_stopper = EarlyStopping()

In [None]:
# Define the model
class TextClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        return self.layers(x).squeeze()

In [85]:
model = TextClassifier(input_dim=train_features.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()

In [None]:
epochs = 50
best_f1 = 0

for epoch in range(epochs):
    # train
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]", leave=False)
    
    for batch in progress_bar:
        features, labels = batch
        features = features.to(device)
        labels = labels.float().to(device)
        
        optimizer.zero_grad()
        
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
    
    avg_train_loss = total_loss / len(train_loader)
    
    # evaluate
    model.eval()
    val_preds = []
    val_true = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Validation]", leave=False):
            features, labels = batch
            features = features.to(device)
            
            outputs = model(features)
            preds = (torch.sigmoid(outputs) > 0.5).long().cpu().numpy()
            
            val_preds.extend(preds)
            val_true.extend(labels.cpu().numpy())
    
    val_acc = accuracy_score(val_true, val_preds)
    val_f1 = f1_score(val_true, val_preds, zero_division=0)
    
    # early stopping
    if early_stopper(val_f1):
        print(f"\nEarly stopping at epoch {epoch+1}!")
        break
    
    print(f"\nEpoch {epoch+1}")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Validation F1: {val_f1:.4f}\n")

Epoch 1 [Train]:   0%|          | 0/2073 [00:00<?, ?it/s]

                                                                                 


Epoch 1
Train Loss: 0.1482
Validation Accuracy: 0.9938
Validation F1: 0.9940



                                                                                 


Epoch 2
Train Loss: 0.0156
Validation Accuracy: 0.9974
Validation F1: 0.9975



                                                                                 


Epoch 3
Train Loss: 0.0062
Validation Accuracy: 0.9983
Validation F1: 0.9984



                                                                                 


Epoch 4
Train Loss: 0.0027
Validation Accuracy: 0.9990
Validation F1: 0.9990



                                                                                 


Epoch 5
Train Loss: 0.0014
Validation Accuracy: 0.9992
Validation F1: 0.9992



                                                                                 


Epoch 6
Train Loss: 0.0012
Validation Accuracy: 0.9993
Validation F1: 0.9994



                                                                                 


Epoch 7
Train Loss: 0.0009
Validation Accuracy: 0.9993
Validation F1: 0.9994



                                                                                 


Epoch 8
Train Loss: 0.0007
Validation Accuracy: 0.9992
Validation F1: 0.9992



                                                                                 


Early stopping at epoch 9!




In [87]:
# Test the model
model.eval()
test_preds = []
test_true = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        features, labels = batch
        features = features.to(device)
        
        outputs = model(features)
        preds = (torch.sigmoid(outputs) > 0.5).long().cpu().numpy()
        
        test_preds.extend(preds)
        test_true.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_true, test_preds)
test_f1 = f1_score(test_true, test_preds, zero_division=0)
test_precision = precision_score(test_true, test_preds, zero_division=0)
test_recall = recall_score(test_true, test_preds, zero_division=0)

print(f"Final Test Accuracy: {test_acc:.4f}")
print(f"Final Test Precision: {test_precision:.4f}")
print(f"Final Test Recall: {test_recall:.4f}")
print(f"Final Test F1 Score: {test_f1:.4f}")

Testing:   0%|          | 0/1208 [00:00<?, ?it/s]

Testing: 100%|██████████| 1208/1208 [00:12<00:00, 95.81it/s] 


Final Test Accuracy: 0.5290
Final Test Precision: 0.6687
Final Test Recall: 0.2792
Final Test F1 Score: 0.3939


### Word2Vec

In [None]:
def text_vectorize(ls):
    sumup = np.zeros(300)
    vector_values = []
    for i in ls:
        try:
            word_vector = wv[i]
            vector_values.append(word_vector)
        except KeyError:
            pass
    for k in vector_values:
        sumup += k
    sen_vector = sumup/len(vector_values)
    return sen_vector

In [None]:
def w2v_process(column):
  output = []
  for i in range(len(column)):
    tokens = column[i].split()
    vector = text_vectorize(tokens)
    output.append(vector)
  return output

In [None]:
train_w2v = w2v_process(train_set['clean_text'])
test_w2v = w2v_process(test_set['clean_text'])

  sen_vector = sumup/len(vector_values)


In [None]:
train_set['w2v'] = train_w2v
test_set['w2v'] = test_w2v

In [None]:
train_set['nan_value'] = train_set['w2v'] == np.NaN

In [None]:
train_set['nan_value'].value_counts()

Unnamed: 0_level_0,count
nan_value,Unnamed: 1_level_1
False,83539


In [None]:
test_set['nan_value'] = test_set['w2v'] == np.NaN

In [None]:
test_set['nan_value'].value_counts()

Unnamed: 0_level_0,count
nan_value,Unnamed: 1_level_1
False,38654


### BERT

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_set['clean_text'], train_set['label_num'], test_size=0.1, random_state=42
)

In [None]:
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)

In [None]:
import tensorflow as tf
from transformers import TFTrainer, TFTrainingArguments

# Change to TensorFlow Dataset Fomat
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).batch(8)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(8)

training_args = TFTrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
)

In [None]:
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

## **Model Training**
*   LogisticRegression
*   XGBClassifier
*   Neural Network with TensorFlow

In [None]:
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

LogisticRegression - TF-IDF

In [None]:
lr_model_tfidf = LogisticRegression(max_iter=5000)
lr_model_tfidf.fit(tfidf_X_train, train_set['label_num'])

In [None]:
y_pred = lr_model_tfidf.predict(tfidf_X_test)
print(f"Accuracy: {metrics.accuracy_score(test_set['label_num'], y_pred)}")
print(f"F1 Score: {metrics.f1_score(test_set['label_num'], y_pred)}")
print(f"Precision: {metrics.precision_score(test_set['label_num'], y_pred)}")
print(f"Recall: {metrics.recall_score(test_set['label_num'], y_pred)}")

Accuracy: 0.46031458581259377
F1 Score: 0.037465971485258154
Precision: 0.8423236514522822
Recall: 0.019159076966636778


LogisticRegression - Word2Vec

In [None]:
lr_model_w2v = LogisticRegression(max_iter=5000)
train_w2v_array = np.vstack(train_set['w2v'].to_numpy())
train_w2v_array.shape

(83539, 300)

In [None]:
nan_indices = np.where(np.isnan(train_w2v_array))
nan_indices

(array([ 8970,  8970,  8970, ..., 72992, 72992, 72992]),
 array([  0,   1,   2, ..., 297, 298, 299]))

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_w2v_array_imputed = imputer.fit_transform(train_w2v_array)

In [None]:
lr_model_w2v.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
test_w2v_array = np.vstack(test_set['w2v'].to_numpy())
test_w2v_array_imputed = imputer.transform(test_w2v_array)
y_pred = lr_model_w2v.predict(test_w2v_array_imputed)
print(f"Accuracy: {metrics.accuracy_score(test_set['label_num'], y_pred)}")
print(f"F1 Score: {metrics.f1_score(test_set['label_num'], y_pred)}")
print(f"Precision: {metrics.precision_score(test_set['label_num'], y_pred)}")
print(f"Recall: {metrics.recall_score(test_set['label_num'], y_pred)}")

Accuracy: 0.9637812386816371
F1 Score: 0.9670495198644323
Precision: 0.9646429074517537
Recall: 0.9694681704497192


LogisticRegression - Model Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

TF-IDF Tuning

In [None]:
grid_search_tfidf = GridSearchCV(estimator=lr_model_tfidf, param_grid=param_grid, cv=5, scoring='f1')
grid_search_tfidf.fit(tfidf_X_train, train_set['label_num'])

In [None]:
print("Best parameters found: ", grid_search_tfidf.best_params_)

Best parameters found:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:
lr_model_tfidf_tuned = LogisticRegression(max_iter=5000, C=100, penalty='l1', solver='liblinear')
lr_model_tfidf_tuned.fit(tfidf_X_train, train_set['label_num'])

In [None]:
y_pred_tuned = lr_model_tfidf_tuned.predict(tfidf_X_test)
print(f"Accuracy Tuned: {metrics.accuracy_score(test_set['label_num'], y_pred_tuned)}")
print(f"F1 Score Tuned: {metrics.f1_score(test_set['label_num'], y_pred_tuned)}")
print(f"Precision Tuned: {metrics.precision_score(test_set['label_num'], y_pred_tuned)}")
print(f"Recall Tuned: {metrics.recall_score(test_set['label_num'], y_pred_tuned)}")

Accuracy Tuned: 0.4754488539349097
F1 Score Tuned: 0.19424574789381657
Precision Tuned: 0.6151522778756607
Recall Tuned: 0.11533198055778396


Word2Vec Tuning

In [None]:
grid_search_w2v = GridSearchCV(estimator=lr_model_w2v, param_grid=param_grid, cv=5, scoring='f1')
grid_search_w2v.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
print("Best parameters found: ", grid_search_w2v.best_params_)

In [None]:
lr_model_w2v_tuned = LogisticRegression(max_iter=5000, C=100, penalty='l1', solver='liblinear')
lr_model_w2v_tuned.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
y_pred_tuned = lr_model_w2v_tuned.predict(test_w2v_array_imputed)
print(f"Accuracy Tuned: {metrics.accuracy_score(test_set['label_num'], y_pred_tuned)}")
print(f"F1 Score Tuned: {metrics.f1_score(test_set['label_num'], y_pred_tuned)}")
print(f"Precision Tuned: {metrics.precision_score(test_set['label_num'], y_pred_tuned)}")
print(f"Recall Tuned: {metrics.recall_score(test_set['label_num'], y_pred_tuned)}")

XGBClassifier - TF-IDF

In [None]:
from xgboost import XGBClassifier
xgb_model_tfidf = XGBClassifier(
    tree_method='hist',
    device='cuda:0',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [None]:
xgb_model_tfidf.fit(tfidf_X_train, train_set['label_num'])

In [None]:
y_pred = xgb_model_tfidf.predict(tfidf_X_test)
print(f"Accuracy: {metrics.accuracy_score(test_set['label_num'], y_pred)}")
print(f"F1 Score: {metrics.f1_score(test_set['label_num'], y_pred)}")
print(f"Precision: {metrics.precision_score(test_set['label_num'], y_pred)}")
print(f"Recall: {metrics.recall_score(test_set['label_num'], y_pred)}")

Accuracy: 0.46173746572152946
F1 Score: 0.03773933956155767
Precision: 0.9466357308584686
Recall: 0.019253456656127602


XGBClassifier - Word2Vec

In [None]:
xgb_model_w2v = XGBClassifier(
    tree_method='hist',
    device='cuda:0',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [None]:
xgb_model_w2v.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
y_pred = xgb_model_w2v.predict(test_w2v_array_imputed)
print(f"Accuracy: {metrics.accuracy_score(test_set['label_num'], y_pred)}")
print(f"F1 Score: {metrics.f1_score(test_set['label_num'], y_pred)}")
print(f"Precision: {metrics.precision_score(test_set['label_num'], y_pred)}")
print(f"Recall: {metrics.recall_score(test_set['label_num'], y_pred)}")

Accuracy: 0.987012987012987
F1 Score: 0.9881407984880699
Precision: 0.9893561663276409
Recall: 0.9869284130055213


XGBClassifier - Model Tuning

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'subsample': [0.8, 1.0],
    'tree_method': ['hist'],
    'device': ['cuda:0']
}

TF-IDF Tuning

In [None]:
grid_search_tfidf = GridSearchCV(
    estimator=xgb_model_tfidf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

In [None]:
grid_search_tfidf.fit(tfidf_X_train, train_set['label_num'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits




In [None]:
print("Best parameters found: ", grid_search_tfidf.best_params_)

Best parameters found:  {'device': 'cuda:0', 'max_depth': 6, 'n_estimators': 50, 'subsample': 0.8, 'tree_method': 'hist'}


In [None]:
xgb_model_tfidf_tuned = XGBClassifier(
    tree_method='hist',
    device='cuda:0',
    n_estimators=50,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [None]:
xgb_model_tfidf_tuned.fit(tfidf_X_train, train_set['label_num'])

In [None]:
y_pred_tuned = xgb_model_tfidf_tuned.predict(tfidf_X_test)
print(f"Accuracy Tuned: {metrics.accuracy_score(test_set['label_num'], y_pred_tuned)}")
print(f"F1 Score Tuned: {metrics.f1_score(test_set['label_num'], y_pred_tuned)}")
print(f"Precision Tuned: {metrics.precision_score(test_set['label_num'], y_pred_tuned)}")
print(f"Recall Tuned: {metrics.recall_score(test_set['label_num'], y_pred_tuned)}")

Accuracy Tuned: 0.46173746572152946
F1 Score Tuned: 0.03773933956155767
Precision Tuned: 0.9466357308584686
Recall Tuned: 0.019253456656127602


Word2Vec - Tuning

In [None]:
grid_search_w2v = GridSearchCV(
    estimator=xgb_model_w2v,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

In [None]:
grid_search_w2v.fit(train_w2v_array_imputed, train_set['label_num'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
print("Best parameters found: ", grid_search_w2v.best_params_)

Best parameters found:  {'device': 'cuda:0', 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.8, 'tree_method': 'hist'}


In [None]:
xgb_model_w2v_tuned = XGBClassifier(
    tree_method='hist',
    device='cuda:0',
    n_estimators=200,
    max_depth=9,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [None]:
xgb_model_w2v_tuned.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
y_pred_tuned = xgb_model_w2v_tuned.predict(test_w2v_array_imputed)
print(f"Accuracy Tuned: {metrics.accuracy_score(test_set['label_num'], y_pred_tuned)}")
print(f"F1 Score Tuned: {metrics.f1_score(test_set['label_num'], y_pred_tuned)}")
print(f"Precision Tuned: {metrics.precision_score(test_set['label_num'], y_pred_tuned)}")
print(f"Recall Tuned: {metrics.recall_score(test_set['label_num'], y_pred_tuned)}")

Accuracy Tuned: 0.9988616960728515
F1 Score Tuned: 0.9989607444848599
Precision Tuned: 1.0
Recall Tuned: 0.9979236468312019
