In [142]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from numpy import random



In [143]:
random.seed(42)

data = pd.read_csv('data/CategorisedFakeNewsTweetsFinal.csv')

data.head()

Unnamed: 0,docID,majorityTarget,statement,binaryNumTarget,tweet,threeLabelMajority,fiveLabelMajority,primaryCat,primaryCatNum,secondaryCat,...,determiners,conjunctions,dots,exclamation,questions,ampersand,capitals,digits,long_word_freq,short_word_freq
0,0,True,End of eviction moratorium means millions of A...,1.0,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree,Services,3,"Housing, Politics",...,0,0,5,0,1,0,33,3,5,19
1,1,True,End of eviction moratorium means millions of A...,1.0,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree,Services,3,"Housing, Politics",...,0,2,1,0,0,0,14,0,2,34
2,2,True,End of eviction moratorium means millions of A...,1.0,THE SUPREME COURT is siding with super rich pr...,Agree,Agree,Services,3,"Housing, Politics",...,0,1,0,0,0,0,3,0,4,10
3,3,True,End of eviction moratorium means millions of A...,1.0,@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree,Services,3,"Housing, Politics",...,0,1,3,0,0,1,6,8,1,30
4,4,True,End of eviction moratorium means millions of A...,1.0,@OhComfy I agree. The confluence of events rig...,Agree,Agree,Services,3,"Housing, Politics",...,0,1,3,0,1,0,11,3,2,19


In [144]:
data = data.drop('majorityTarget', axis=1)

In [146]:
# group by statements
# will change when get subcategories
grouped = data.groupby('primaryCat')
train_data = pd.DataFrame()
test_data = pd.DataFrame()
random.seed(42)

i=0

for group_name, group_df in grouped:
    if random.random() <= 0.8:
        train_data = pd.concat([train_data, group_df])
    else:
        test_data = pd.concat([test_data, group_df])
    i += 1


In [149]:

train_statement_unique = set(train_data["primaryCat"].unique())
test_statement_unique = set(test_data["primaryCat"].unique())

common_elements = train_statement_unique.intersection(test_statement_unique)
if len(common_elements) != 0:
    print("Error: common elements between train and test data")
    print(len(common_elements))
else:
    print("No common elements between train and test data")

No common elements between train and test data


In [150]:
import torch as torch
import torch.nn as nn


In [151]:
params = {}
params['input_size'] = train_data.iloc[0].shape[0] - 1 #46 features after removing target
params['hidden_size'] = 100 # arbitrary
params['num_classes'] = 2 # binary classification, real or fake
params['num_epochs'] = 10
params['batch_size'] = 64
params['learning_rate'] = 0.001

In [153]:
class Net(nn.Module):
    def __init__(self, params):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(params['input_size'], params['hidden_size'])
        self.fc2 = nn.Linear(params['hidden_size'], params['hidden_size'])
        self.fc3 = nn.Linear(params['hidden_size'], params['num_classes'])

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return torch.log_softmax(x, dim=1)
    

In [154]:
from torch.utils.data import Dataset, DataLoader

class FakeNewsDataset(Dataset):
    def __init__(self, features, target):
        self.features = torch.tensor(features)#, dtype=torch.float32)
        self.target = torch.tensor(target)#, dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.target[index]


In [156]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


features_train = train_data.drop('binaryNumTarget', axis=1).values

# Handle string values using label encoding
label_encoder = LabelEncoder()
# Identify columns with string values (assuming dtype is 'str' or 'object')
string_columns = np.array([np.issubdtype(type(col), np.str_) or np.issubdtype(type(col), np.object_) for col in features_train[0]])

# Apply label encoding to string columns
for col_index in np.where(string_columns)[0]:
    features_train[:, col_index] = label_encoder.fit_transform(features_train[:, col_index].astype(str))

features_train = features_train.astype(np.float32)


# for i in range(len(features_train[0])):
#     print(i, type(features_train[1][i]))

target_train = train_data['binaryNumTarget'].values
train_torch_ds = FakeNewsDataset(features_train, target_train)
train_loader = DataLoader(train_torch_ds, batch_size=params['batch_size'], shuffle=True)




In [157]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

features_test = test_data.drop('binaryNumTarget', axis=1).values

# Handle string values using label encoding
label_encoder = LabelEncoder()
# Identify columns with string values (assuming dtype is 'str' or 'object')
string_columns = np.array([np.issubdtype(type(col), np.str_) or np.issubdtype(type(col), np.object_) for col in features_test[0]])

# Apply label encoding to string columns
for col_index in np.where(string_columns)[0]:
    features_test[:, col_index] = label_encoder.fit_transform(features_test[:, col_index].astype(str))

features_test = features_test.astype(np.float32)


# for i in range(len(features_train[0])):
#     print(i, type(features_train[1][i]))

target_test = test_data['binaryNumTarget'].values
test_torch_ds = FakeNewsDataset(features_test, target_test)
test_loader = DataLoader(test_torch_ds, batch_size=params['batch_size'], shuffle=True)




In [158]:
for i, e in enumerate(test_loader):
    print(e[0].shape)
    break

torch.Size([64, 52])


In [159]:
for i, e in enumerate(train_loader):
    print(i, e[0].shape, e[1].shape)
    break

0 torch.Size([64, 52]) torch.Size([64])


In [160]:
model = Net(params)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])


In [161]:

total_step = len(train_loader)
for epoch in range(params['num_epochs']):
    for i, (features, labels) in enumerate(train_loader):
        features = features.float()
        labels = labels.long()
        outputs = model(features)
        l = loss(outputs, labels)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if (i+1) % 1000 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, params['num_epochs'], i+1, total_step, l.item()))

Epoch [1/10], Step [1000/1392], Loss: 3.2264
Epoch [2/10], Step [1000/1392], Loss: 4.0814
Epoch [3/10], Step [1000/1392], Loss: 2.0998
Epoch [4/10], Step [1000/1392], Loss: 0.4118
Epoch [5/10], Step [1000/1392], Loss: 0.3861
Epoch [6/10], Step [1000/1392], Loss: 0.3128
Epoch [7/10], Step [1000/1392], Loss: 0.2296
Epoch [8/10], Step [1000/1392], Loss: 0.2454
Epoch [9/10], Step [1000/1392], Loss: 0.3567
Epoch [10/10], Step [1000/1392], Loss: 0.3219


In [162]:
# eval model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for features, labels in test_loader:
        features = features.float()
        labels = labels.long()
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Train Accuracy of the model on the {} test tweets: {} %'.format(total, np.round(100 * correct / total, 2)))

Train Accuracy of the model on the 45146 test tweets: 78.68 %


In [163]:
data = pd.read_csv('data/CategorisedFakeNewsTweetsFinal.csv')
data = data.drop('majorityTarget', axis=1)
data2 = data.drop('statement', axis=1)
data2 = data2.drop('tweet', axis=1)
# Group the data by 'primaryCat'

grouped = data.groupby('primaryCat')
grouped2 = data2.groupby('primaryCat')

                         
datasets = {}
datasets2 = {}
datasets3 = {}
# Iterate over each group and create a separate dataset
for name, group in grouped:
    datasets[name] = group

for name, group in grouped2:
    datasets2[name] = group

data3 = data[['statement', 'tweet', 'primaryCat', 'binaryNumTarget']]
grouped3 = data3.groupby('primaryCat')

for name, group in grouped3:
    datasets3[name] = group

In [165]:
catagories = ['Services', 'Pandemic', 'Economy', 'Discord', 'Culture',
       'Elections', 'Environment', 'Industry']

In [167]:
def trainAndEvaluate(data):
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    params = {}
    params['input_size'] = train_data.iloc[0].shape[0] - 1#46 features after removing target
    params['hidden_size'] = 100 # arbitrary
    params['num_classes'] = 2 # binary classification, real or fake
    params['num_epochs'] = 10
    params['batch_size'] = 64
    params['learning_rate'] = 0.001

    class Net(nn.Module):
        def __init__(self, params):
            super(Net, self).__init__()
            self.fc1 = nn.Linear(params['input_size'], params['hidden_size'])
            self.fc2 = nn.Linear(params['hidden_size'], params['hidden_size'])
            self.fc3 = nn.Linear(params['hidden_size'], params['num_classes'])
    
        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return torch.log_softmax(x, dim=1)
    
    
    class FakeNewsDataset(Dataset):
        def __init__(self, features, target):
            self.features = torch.tensor(features)#, dtype=torch.float32)
            self.target = torch.tensor(target)#, dtype=torch.float32)
    
        def __len__(self):
            return len(self.features)
    
        def __getitem__(self, index):
            return self.features[index], self.target[index]


    features_train = train_data.drop('binaryNumTarget', axis=1).values

    # Handle string values using label encoding
    label_encoder = LabelEncoder()
    # Identify columns with string values (assuming dtype is 'str' or 'object')
    string_columns = np.array([np.issubdtype(type(col), np.str_) or np.issubdtype(type(col), np.object_) for col in features_train[0]])
    
    # Apply label encoding to string columns
    for col_index in np.where(string_columns)[0]:
        features_train[:, col_index] = label_encoder.fit_transform(features_train[:, col_index].astype(str))
    
    features_train = features_train.astype(np.float32)
    
    target_train = train_data['binaryNumTarget'].values
    train_torch_ds = FakeNewsDataset(features_train, target_train)
    train_loader = DataLoader(train_torch_ds, batch_size=params['batch_size'], shuffle=True)


    features_test = test_data.drop('binaryNumTarget', axis=1).values

    # Handle string values using label encoding
    label_encoder = LabelEncoder()
    # Identify columns with string values (assuming dtype is 'str' or 'object')
    string_columns = np.array([np.issubdtype(type(col), np.str_) or np.issubdtype(type(col), np.object_) for col in features_test[0]])
    
    # Apply label encoding to string columns
    for col_index in np.where(string_columns)[0]:
        features_test[:, col_index] = label_encoder.fit_transform(features_test[:, col_index].astype(str))
    
    features_test = features_test.astype(np.float32)
    
    target_test = test_data['binaryNumTarget'].values
    test_torch_ds = FakeNewsDataset(features_test, target_test)
    test_loader = DataLoader(test_torch_ds, batch_size=params['batch_size'], shuffle=True)

    model = Net(params)
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])

    total_step = len(train_loader)
    for epoch in range(params['num_epochs']):
        for i, (features, labels) in enumerate(train_loader):
            features = features.float()
            labels = labels.long()
            outputs = model(features)
            l = loss(outputs, labels)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            if (i+1) % 1000 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, params['num_epochs'], i+1, total_step, l.item()))
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for features, labels in test_loader:
            features = features.float()
            labels = labels.long()
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
        print('Train Accuracy of the model on the {} test tweets: {} %'.format(total, np.round(100 * correct / total, 2)))

    return model
    

In [169]:
print('Results for full dataset')
for x in catagories:
    print('Training on {} dataset'.format(x))
    model = trainAndEvaluate(datasets[x])

    

Results for full dataset
Training on Services dataset
Train Accuracy of the model on the 3251 test tweets: 84.25 %
Training on Pandemic dataset
Train Accuracy of the model on the 1812 test tweets: 95.92 %
Training on Economy dataset
Train Accuracy of the model on the 7229 test tweets: 54.02 %
Training on Discord dataset
Train Accuracy of the model on the 5779 test tweets: 49.23 %
Training on Culture dataset
Train Accuracy of the model on the 2612 test tweets: 63.02 %
Training on Elections dataset
Train Accuracy of the model on the 4395 test tweets: 81.14 %
Training on Environment dataset
Train Accuracy of the model on the 776 test tweets: 57.99 %
Training on Industry dataset
Train Accuracy of the model on the 988 test tweets: 81.17 %


In [170]:
print('Results without tweets and statement')
for x in catagories:
    print('Training on {} dataset'.format(x))
    model = trainAndEvaluate(datasets2[x])


Results without tweets and statement
Training on Services dataset
Train Accuracy of the model on the 3251 test tweets: 72.5 %
Training on Pandemic dataset
Train Accuracy of the model on the 1812 test tweets: 99.67 %
Training on Economy dataset
Train Accuracy of the model on the 7229 test tweets: 71.37 %
Training on Discord dataset
Train Accuracy of the model on the 5779 test tweets: 60.53 %
Training on Culture dataset
Train Accuracy of the model on the 2612 test tweets: 83.54 %
Training on Elections dataset
Train Accuracy of the model on the 4395 test tweets: 86.69 %
Training on Environment dataset
Train Accuracy of the model on the 776 test tweets: 42.4 %
Training on Industry dataset
Train Accuracy of the model on the 988 test tweets: 64.57 %


In [171]:
print('Results with only tweets and statement')
for x in catagories:
    print('Training on {} dataset'.format(x))
    model = trainAndEvaluate(datasets3[x])


Results with only tweets and statement
Training on Services dataset
Train Accuracy of the model on the 3251 test tweets: 31.04 %
Training on Pandemic dataset
Train Accuracy of the model on the 1812 test tweets: 76.66 %
Training on Economy dataset
Train Accuracy of the model on the 7229 test tweets: 61.59 %
Training on Discord dataset
Train Accuracy of the model on the 5779 test tweets: 67.99 %
Training on Culture dataset
Train Accuracy of the model on the 2612 test tweets: 58.88 %
Training on Elections dataset
Train Accuracy of the model on the 4395 test tweets: 77.16 %
Training on Environment dataset
Train Accuracy of the model on the 776 test tweets: 62.11 %
Training on Industry dataset
Train Accuracy of the model on the 988 test tweets: 68.52 %


In [172]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch.nn.functional as F

def prepare_data(data):
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    return train_data, test_data

def encode_features(features):
    label_encoder = LabelEncoder()
    for col_index in np.where([np.issubdtype(type(col), np.str_) or np.issubdtype(type(col), np.object_) for col in features[0]])[0]:
        features[:, col_index] = label_encoder.fit_transform(features[:, col_index].astype(str))
    return features.astype(np.float32)

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

class FakeNewsDataset(Dataset):
    def __init__(self, features, target):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.target[index]

def get_loaders(train_data, test_data, batch_size):
    features_train = encode_features(train_data.drop('binaryNumTarget', axis=1).values)
    target_train = train_data['binaryNumTarget'].values
    train_dataset = FakeNewsDataset(features_train, target_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    features_test = encode_features(test_data.drop('binaryNumTarget', axis=1).values)
    target_test = test_data['binaryNumTarget'].values
    test_dataset = FakeNewsDataset(features_test, target_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader

def trainAndEvaluate2(data):
    train_data, test_data = prepare_data(data)
    params = {
        'input_size': train_data.iloc[0].shape[0] - 1,  # Assuming last column is target
        'hidden_size': 100,
        'num_classes': 2,
        'num_epochs': 10,
        'batch_size': 64,
        'learning_rate': 0.001
    }

    model = Net(params['input_size'], params['hidden_size'], params['num_classes'])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

    train_loader, test_loader = get_loaders(train_data, test_data, params['batch_size'])

    # Training
    model.train()
    for epoch in range(params['num_epochs']):
        for i, (features, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if (i+1) % 1000 == 0:
                print(f'Epoch [{epoch+1}/{params["num_epochs"]}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # Evaluation
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for features, labels in test_loader:
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the {total} test tweets: {accuracy:.2f} %')

    return model

In [173]:
print('Results with full dataset')
for x in catagories:
    print('Training on {} dataset'.format(x))
    model = trainAndEvaluate2(datasets[x])

    

Results with full dataset
Training on Services dataset
Test Accuracy of the model on the 3251 test tweets: 75.58 %
Training on Pandemic dataset
Test Accuracy of the model on the 1812 test tweets: 95.14 %
Training on Economy dataset
Test Accuracy of the model on the 7229 test tweets: 47.85 %
Training on Discord dataset
Test Accuracy of the model on the 5779 test tweets: 80.65 %
Training on Culture dataset
Test Accuracy of the model on the 2612 test tweets: 79.10 %
Training on Elections dataset
Test Accuracy of the model on the 4395 test tweets: 80.75 %
Training on Environment dataset
Test Accuracy of the model on the 776 test tweets: 57.60 %
Training on Industry dataset
Test Accuracy of the model on the 988 test tweets: 48.68 %


In [174]:
print('Results without tweets and statement')
for x in catagories:
    print('Training on {} dataset'.format(x))
    model = trainAndEvaluate2(datasets2[x])


Results without tweets and statement
Training on Services dataset
Test Accuracy of the model on the 3251 test tweets: 70.50 %
Training on Pandemic dataset
Test Accuracy of the model on the 1812 test tweets: 97.85 %
Training on Economy dataset
Test Accuracy of the model on the 7229 test tweets: 79.46 %
Training on Discord dataset
Test Accuracy of the model on the 5779 test tweets: 71.97 %
Training on Culture dataset
Test Accuracy of the model on the 2612 test tweets: 83.46 %
Training on Elections dataset
Test Accuracy of the model on the 4395 test tweets: 89.17 %
Training on Environment dataset
Test Accuracy of the model on the 776 test tweets: 57.60 %
Training on Industry dataset
Test Accuracy of the model on the 988 test tweets: 48.99 %


In [175]:
print('Results with only tweets and statement')
for x in catagories:
    print('Training on {} dataset'.format(x))
    model = trainAndEvaluate2(datasets3[x])


Results with only tweets and statement
Training on Services dataset
Test Accuracy of the model on the 3251 test tweets: 59.30 %
Training on Pandemic dataset
Test Accuracy of the model on the 1812 test tweets: 76.77 %
Training on Economy dataset
Test Accuracy of the model on the 7229 test tweets: 55.60 %
Training on Discord dataset
Test Accuracy of the model on the 5779 test tweets: 66.81 %
Training on Culture dataset
Test Accuracy of the model on the 2612 test tweets: 45.71 %
Training on Elections dataset
Test Accuracy of the model on the 4395 test tweets: 77.09 %
Training on Environment dataset
Test Accuracy of the model on the 776 test tweets: 60.57 %
Training on Industry dataset
Test Accuracy of the model on the 988 test tweets: 74.80 %
