In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
import pandas as pd

In [None]:
# Read the dev-set CSV into a DataFrame; later cells use its text, label,
# coherence, complexity and length columns.
dev_data = pd.read_csv('/content/drive/MyDrive/SharedTask/coherence_data_dev.csv')

In [None]:
# Load the uncased BERT tokenizer and encode every dev text in a single call,
# with padding and truncation enabled and PyTorch tensors as the return type.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = tokenizer(
    dev_data['text'].tolist(),
    return_tensors='pt',
    truncation=True,
    padding=True,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm  # kept: the evaluation loop further down wraps its DataLoader in tqdm

# Numeric columns that are concatenated to the BERT output, in this order.
FEATURE_COLUMNS = ['coherence', 'complexity', 'length']

# Select the three columns in one vectorised step instead of doing
# 3 * len(dev_data) individual `.iloc` lookups in a Python loop.
feature_matrix = dev_data[FEATURE_COLUMNS].to_numpy(dtype='float32')

# `features` stays a list of per-example rows because the model definition
# reads `len(features[0])`-style information from it in the original notebook.
features = feature_matrix.tolist()

features_tensor = torch.tensor(feature_matrix, dtype=torch.float32)
# Convert the label column through NumPy rather than handing torch a pandas Series.
labels_tensor = torch.tensor(dev_data['label'].to_numpy(), dtype=torch.long)

100%|██████████| 5000/5000 [00:00<00:00, 42144.24it/s]


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Wrap the token ids, attention masks, numeric features and labels in one
# TensorDataset so that each item is a 4-tuple in exactly that order.
dataset = TensorDataset(
    *(tokenized_texts[key] for key in ('input_ids', 'attention_mask')),
    features_tensor,
    labels_tensor,
)

In [None]:
class TextClassifier(nn.Module):
    """BERT encoder followed by one linear layer over
    [pooled BERT output ; extra numeric features].

    Args:
        num_extra_features: Width of the numeric feature vector concatenated to
            the pooled BERT output. Defaults to 3, the number of columns
            (coherence, complexity, length) this notebook builds per example.
        num_classes: Number of logits produced per example. Defaults to 2.
    """

    def __init__(self, num_extra_features=3, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Take the encoder width from its config instead of hard-coding 768, and
        # the feature width from an argument instead of the notebook-global
        # `features` list, so the layer shape does not depend on kernel state.
        self.fc = nn.Linear(self.bert.config.hidden_size + num_extra_features, num_classes)

    def forward(self, input_ids, attention_mask, features):
        """Return the raw (unnormalised) class logits for a batch.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            features: Numeric features, concatenated to the pooled BERT output
                along dim 1 (so one row per example).
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        combined_features = torch.cat([pooled_output, features], dim=1)
        logits = self.fc(combined_features)
        return logits

In [None]:
model = TextClassifier().to(device)

# Path to the saved classifier weights (.pth)
model_path = '/content/drive/MyDrive/SharedTask/text_classifier_model_2Epochs.pth'

# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU session also loads on a CPU-only runtime.
checkpoint = torch.load(model_path, map_location=device)
# The loaded object is passed straight to load_state_dict, i.e. it is treated as
# a plain state dict of parameters.
model.load_state_dict(checkpoint)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module summary.
model.eval()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

TextClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [None]:
batch_size = 4
# shuffle=False keeps the predictions in dataset (dev_data row) order. The other
# models predict in that same order and the vote below combines them by position,
# so a shuffled loader would pair predictions that belong to different examples.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

transformer_orig_output = []
transformer_pred_output = []
# Inference only: no_grad() skips autograd bookkeeping, which lowers memory use
# and makes the per-batch torch.cuda.empty_cache() call unnecessary.
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids, attention_mask, feat, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask, feat)
        # Move a whole batch to Python lists at once instead of calling .item()
        # per element.
        transformer_orig_output.extend(labels.tolist())
        transformer_pred_output.extend(outputs.argmax(dim=1).tolist())


100%|██████████| 1250/1250 [03:58<00:00,  5.24it/s]


In [None]:
# Read the CSV whose text, length, complexity, coherence and label columns are
# used to build the TF-IDF training rows in the following cells.
train_data =  pd.read_csv("/content/drive/MyDrive/SharedTask/coherence_data_with_length_and_complexity.csv")

In [None]:
# One row per training example, laid out as [text, length, complexity, coherence].
X = [
    [train_data[column].iloc[row] for column in ('text', 'length', 'complexity', 'coherence')]
    for row in range(len(train_data))
]

# Target column that goes with the rows in X.
y = train_data['label']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; random_state pins the split so it is repeatable.
# Only X_train is used below (to fit the TF-IDF vocabulary).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import hstack

# Fit the TF-IDF vocabulary on the training texts only (first element of each
# row) and keep the resulting document-term matrix.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform([text for text, *_ in X_train])

In [None]:
# Re-read the dev CSV (same path as the first data-loading cell), rebinding
# `dev_data` to a freshly loaded DataFrame.
dev_data =  pd.read_csv("/content/drive/MyDrive/SharedTask/coherence_data_dev.csv")

In [None]:
# Preview the first rows to check the loaded columns.
dev_data.head()

Unnamed: 0,text,model,label,source,id,complexity,length,coherence
0,It is based on how long it takes humans to sle...,bloomz,1,reddit,2123,13.0,246,15.130542
1,Michele Samantha Yi Wen Lean ) (born 1 Septemb...,human,0,wikipedia,1997,13.5,810,8.809902
2,This is a complex issue but I'll do my best to...,human,0,reddit,2549,9.9,221,4.756706
3,The Chekya-Byas people are the largest ethnic ...,bloomz,1,wikipedia,1187,11.5,161,11.214866
4,The paper deals with a very important issue of...,human,0,peerread,4712,10.0,208,-7.817893


In [None]:
# One row per dev example, laid out as [text, length, complexity, coherence].
dev_X = [
    [dev_data[column].iloc[row] for column in ('text', 'length', 'complexity', 'coherence')]
    for row in range(len(dev_data))
]

# Gold labels that go with the rows in dev_X.
dev_y = dev_data['label']

In [None]:
# Project the dev texts (first element of each row) onto the vocabulary that
# was fitted on the training texts; `transform` does not refit.
dev_X_tfidf = vectorizer.transform([text for text, *_ in dev_X])

In [None]:
# Append the three numeric fields (positions 1, 2, 3 of each dev row) as extra
# single-column blocks to the right of the TF-IDF matrix.
dev_X_tfidf_modified = hstack(
    [dev_X_tfidf]
    + [np.array([row[position] for row in dev_X]).reshape(-1, 1) for position in (1, 2, 3)]
)

In [None]:
# Start both prediction holders as separate empty lists; the predict cells
# below rebind them.
lr_preds, bg_preds = [], []

In [None]:
import pickle

# Pickled classical models that vote alongside the transformer.
lr_model_filename = '/content/drive/MyDrive/SharedTask/Models2/LR2.pkl'
bg_model_file_name = '/content/drive/MyDrive/SharedTask/Models2/BG2.pkl'


def _load_pickle(path):
    """Return the object unpickled from `path`.

    SECURITY: pickle.load can execute arbitrary code embedded in the file.
    Only call this on model files you created yourself or otherwise trust.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


lr_model = _load_pickle(lr_model_filename)
bg_model = _load_pickle(bg_model_file_name)

In [None]:
# Predict on the dev matrix (TF-IDF columns plus the three numeric columns)
# with the model loaded from LR2.pkl.
lr_preds = lr_model.predict(dev_X_tfidf_modified)

In [None]:
# Print the predictions for a quick visual check.
print(lr_preds)

[0 0 0 ... 1 0 0]


In [None]:
# Predict on the same dev matrix with the model loaded from BG2.pkl.
# NOTE(review): the saved output of the next cell shows a two-column float
# array rather than 0/1 labels — confirm what this model's `predict` returns
# before comparing `bg_preds` with class labels.
bg_preds = bg_model.predict(dev_X_tfidf_modified)

In [None]:
# Print the predictions for a quick visual check.
print(bg_preds)

[[0.78 0.22]
 [1.   0.  ]
 [0.94 0.06]
 ...
 [0.28 0.72]
 [0.94 0.06]
 [0.96 0.04]]


In [None]:
def _as_labels(preds):
    """Return `preds` as a 1-D array of class labels.

    A 2-D input is treated as per-class scores (one row per example) and is
    reduced with argmax; a 1-D input is assumed to already hold 0/1 labels.
    """
    scores = np.asarray(preds)
    return scores.argmax(axis=1) if scores.ndim == 2 else scores


# Majority vote of the three models: predict 1 when at least two of them do.
# (An if/elif chain that appends the transformer prediction in every branch
# never lets the other two models influence the result.)
# All three prediction sequences must be in the same example order and have the
# same length; np.stack raises ValueError if the lengths differ.
votes = np.stack([
    _as_labels(transformer_pred_output),
    _as_labels(lr_preds),
    _as_labels(bg_preds),
])
voted = (votes.sum(axis=0) >= 2).astype(int).tolist()

In [None]:
# Number of voted predictions; should equal the number of dev rows.
print(len(voted))

5000


In [None]:
# Put the gold labels collected during the transformer pass next to the voted
# predictions; the two lists are paired by position.
df =  pd.DataFrame({'Original':transformer_orig_output, 'Voted Prediction': voted})

In [None]:
# Peek at the first gold/voted pairs.
df.head()

Unnamed: 0,Original,Voted Prediction
0,1,0
1,1,0
2,1,0
3,0,0
4,1,0


In [None]:
# Keep only the rows where the voted prediction equals the gold label.
df[df['Original'] == df['Voted Prediction']]

Unnamed: 0,Original,Voted Prediction
3,0,0
5,0,0
8,0,0
11,0,0
12,1,1
...,...,...
4993,0,0
4994,0,0
4995,0,0
4996,0,0


In [None]:
# Fraction of rows where the voted prediction equals the gold label, computed
# from `df` rather than from a count copied by hand out of an earlier output.
(df['Original'] == df['Voted Prediction']).mean()

0.7884

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
def get_metrics_score(preds, origs):
    """Print precision, recall, F1 and accuracy of `preds` against `origs`.

    Args:
        preds: Predicted labels, one per example. A 2-D array of per-class
            scores (one row per example) is reduced to labels with argmax.
        origs: Gold labels, in the same order as `preds`.
    """
    preds = np.asarray(preds)
    if preds.ndim == 2:
        # Per-class scores: the predicted label is the highest-scoring column.
        preds = preds.argmax(axis=1)
    # Score the arguments that were passed in (not notebook globals).
    # scikit-learn's metric functions take (y_true, y_pred), in that order.
    prec = precision_score(origs, preds)
    recall = recall_score(origs, preds)
    f1 = f1_score(origs, preds)
    accuracy = accuracy_score(origs, preds)
    print(prec, recall, f1, accuracy)

In [None]:
# Metrics for the transformer predictions against the gold labels collected in
# the same evaluation loop.
get_metrics_score(transformer_pred_output, transformer_orig_output)

0.9633676092544987 0.5996 0.7391518737672583 0.7884


In [None]:
# Call the metrics helper with the LR predictions and the gold labels.
# NOTE(review): confirm that get_metrics_score scores the `preds` it is given
# and that lr_preds is in the same example order as transformer_orig_output.
get_metrics_score(lr_preds, transformer_orig_output)

0.4861227922624054 0.2312 0.3133640552995392 0.4934


In [None]:
# Call the metrics helper with the BG predictions and the gold labels.
# NOTE(review): confirm that get_metrics_score scores the `preds` it is given,
# that bg_preds holds class labels (its saved printout is a two-column float
# array), and that it is in the same example order as transformer_orig_output.
get_metrics_score(bg_preds, transformer_orig_output)

0.5013003901170351 0.3084 0.3818722139673105 0.5008
