In [1]:
import numpy as np
import pandas as pd
from numpy import random

import torch
import torch.nn as nn

from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [2]:
data = pd.read_csv('data/CategorisedFakeNewsTweetsFinal.csv')
data.drop("URLs", axis=1)
data.drop('majorityTarget', axis=1, inplace=True)

meta_features = ["followers_count", "friends_count","favourites_count","statuses_count","listed_count","following","embeddings","BotScore",	"BotScoreBinary","cred","normalize_influence","mentions","quotes","replies","retweets","favourites","URLs"]
print(np.intersect1d(meta_features, data.columns))
print(data.dtypes)

['URLs']
docID                       int64
statement                  object
binaryNumTarget           float64
tweet                      object
threeLabelMajority         object
fiveLabelMajority          object
primaryCat                 object
primaryCatNum               int64
secondaryCat               object
secondaryCatNum             int64
hashtags                  float64
URLs                      float64
unique_count                int64
total_count                 int64
ORG_percentage            float64
NORP_percentage           float64
GPE_percentage            float64
PERSON_percentage         float64
MONEY_percentage          float64
DATE_percentage           float64
CARDINAL_percentage       float64
PERCENT_percentage        float64
ORDINAL_percentage        float64
FAC_percentage            float64
LAW_percentage            float64
PRODUCT_percentage        float64
EVENT_percentage          float64
TIME_percentage           float64
LOC_percentage            float64
WORK_

In [3]:
features_train = data.drop('binaryNumTarget', axis=1).values
targets_train = data['binaryNumTarget'].values

string_columns = np.array([np.issubdtype(type(col), np.str_) or np.issubdtype(type(col), object) for col in features_train[0]])

# Create a label encoder for the string columns
label_encoder = LabelEncoder()

# Apply label encoding to string columns
for col_index in np.where(string_columns)[0]:
    features_train[:, col_index] = label_encoder.fit_transform(features_train[:, col_index].astype(str))


#print datatypes of columns in features_train
features_train = features_train.astype(np.float32)
targets_train = targets_train.astype(np.float32)

In [4]:
# class TransformerModel(nn.Module):
#     def __init__(self, params):#input_size, hidden_size, output_size, num_heads, num_layers):
#         super(TransformerModel, self).__init__()
#         self.transformer = nn.Transformer(
#             d_model=params['input_size'],
#             nhead=['num_heads'],
#             num_encoder_layers=['num_layers'],
#             num_decoder_layers=['num_layers'],
#         )
#         self.fc = nn.Linear(params['hidden_size'], params['output_size'])

#     def forward(self, src, tgt):
#         output = self.transformer(src, tgt)
#         output = self.fc(output)
#         return output


In [5]:
class TransformerClassifier(nn.Module):
    # def __init__(self, input_size, hidden_size, num_heads, num_layers, num_classes, dropout):
    def __init__(self, params):
        super(TransformerClassifier, self).__init__()
        self.transformer = nn.Transformer(
            d_model=params['input_size'],
            nhead=params['num_heads'],
            num_encoder_layers=params['num_layers'],
            num_decoder_layers=params['num_layers'],
        )
        self.fc = nn.Linear(params['input_size'], params['hidden_size'])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(params['dropout'])
        self.output_layer = nn.Linear(params['hidden_size'], params['num_classes'])

    def forward(self, x):
        x = self.transformer(x, x)
        # x = x.mean(dim=1)  # Aggregate over sequence length (mean pooling)
        x = self.fc(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        return x


In [6]:
params = {}
params['input_size'] = 52
params['hidden_size'] = 256
params['num_classes'] = 1
params['num_heads'] = 4
params['num_layers'] = 6  
assert params['input_size'] % params['num_heads'] == 0, "Input size must be divisible by the number of heads"
params['dropout'] = 0.1
params['batch_size'] = 128
params['num_epochs'] = 2
params['learning_rate'] = 0.001
params['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [7]:
from torch.utils.data import Dataset, DataLoader

class FakeNewsDataset(Dataset):
    def __init__(self, features, target):
        self.features = torch.tensor(features)#, dtype=torch.float32)
        self.target = torch.tensor(target)#, dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.target[index]


In [8]:
train_torch_ds = FakeNewsDataset(features_train, targets_train)
train_loader = DataLoader(train_torch_ds, batch_size=params['batch_size'], shuffle=True)


In [9]:
model = TransformerClassifier(params)
loss= nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])




In [10]:

total_step = len(train_loader)
for epoch in range(params['num_epochs']):
    for i, (features, labels) in enumerate(train_loader):
        features = features.float()
        labels = labels.long()
        # print(features.shape, labels.shape)
        outputs = model(features)
        print(outputs.reshape(labels.shape).reshape(-1,1).shape, labels.reshape(-1,1).shape)
        l = loss(outputs.reshape(-1,1), labels.reshape(-1,1))
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if (i+1) % 1000 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, params['num_epochs'], i+1, total_step, l.item()))

torch.Size([128, 1]) torch.Size([128, 1])


RuntimeError: Expected floating point type for target with class probabilities, got Long