# Transformers and Hugging Face Hub components for the maiBERT-based classifier
import torch
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import hf_hub_download
# import pandas as pd

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained('rockerritesh/maiBERT_TF')
# Customized model: the maiBERT encoder with a dropout and a dense layer on top
# to produce the final 10-way category logits.
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('rockerritesh/maiBERT_TF', from_tf=True)
        self.l2 = torch.nn.Dropout(0.5)
        self.fc = torch.nn.Linear(768, 10)

    def forward(self, ids, mask, token_type_ids):
        # Pooled [CLS] features from the encoder, then dropout and the classification head
        _, features = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(features)
        output = self.fc(output_2)
        return output
# Load the saved state dictionary from the Hugging Face Hub
model_state_dict_path = hf_hub_download(repo_id="rockerritesh/maithili_classifier", filename="maibert.bin", repo_type="model")
model_state_dict = torch.load(model_state_dict_path, map_location=torch.device(device))

# Create an instance of the model and load the state dictionary into it
model = BERTClass()
model.load_state_dict(model_state_dict)
model.to(device)

target_cols = ['Politics', 'Culture', 'Sports', 'Literature', 'Entertainment',
               'Health', 'EduTech', 'Opinion', 'Interview', 'Economy']
#df = pd.read_csv('filename.csv')
# df.head()
def predict_using_maibert(df):
    # Split the dataframe into batches (adjust the batch size as needed)
    batch_size = 32
    batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]

    model.eval()
    all_labels = []

    # Iterate through the batches and perform inference
    for batch_df in batches:
        texts = batch_df['translated'].tolist()

        # Tokenize and preprocess the batch of inputs
        tokenized_batch = tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            return_token_type_ids=True,
            return_tensors='pt'  # Return PyTorch tensors
        )

        # Move tensors to the selected device (GPU or CPU)
        input_ids = tokenized_batch['input_ids'].to(device)
        attention_mask = tokenized_batch['attention_mask'].to(device)
        token_type_ids = tokenized_batch['token_type_ids'].to(device)

        # Perform inference
        with torch.no_grad():
            outputs = model(input_ids, attention_mask, token_type_ids)

        # Map each row's highest-scoring logit to its category name
        predicted_classes = torch.argmax(outputs, dim=1).tolist()
        all_labels.extend(target_cols[idx] for idx in predicted_classes)

    # Associate predictions with the original DataFrame
    df = df.copy()
    df['label'] = all_labels
    return df
# Example usage (commented): classify a CSV with a 'translated' column and save the results
# category_out = predict_using_maibert(pd.read_csv('filename.csv'))
# print(category_out.head())
# category_out.to_csv('predictions.csv', index=False)
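
# Minimal usage sketch, kept under a __main__ guard so importing this module has no
# side effects. It assumes only that predict_using_maibert expects a DataFrame with a
# 'translated' column, as used above; the sample rows are placeholders, not real data.
if __name__ == '__main__':
    import pandas as pd  # imported here only for this demo

    # Placeholder rows; real input would be Maithili text in the 'translated' column
    sample = pd.DataFrame({'translated': [
        'placeholder sentence one',
        'placeholder sentence two',
    ]})
    labelled = predict_using_maibert(sample)
    print(labelled[['translated', 'label']])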