In [17]:
import time
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import emoji

In [26]:
# Load tokenizer and model
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Choose the primary device BEFORE wrapping the model. GPU 3 is preferred,
# but only when that ordinal actually exists; otherwise "cuda:3" would fail
# with an invalid-device-ordinal error on hosts with fewer than 4 GPUs.
gpu_count = torch.cuda.device_count()
primary_gpu = 3 if gpu_count > 3 else 0
device = torch.device(f"cuda:{primary_gpu}" if torch.cuda.is_available() else "cpu")

# Move the bare model to the primary device
model.to(device)

# Check GPU availability
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs!")
    # DataParallel requires the wrapped module's parameters to live on
    # device_ids[0]. With the default ids that is cuda:0, which conflicts with
    # a model placed on another GPU, so list the primary GPU first.
    replica_ids = [primary_gpu] + [idx for idx in range(gpu_count) if idx != primary_gpu]
    model = torch.nn.DataParallel(model, device_ids=replica_ids, output_device=primary_gpu)

model.eval()  # Put model in inference mode

# Optional: Use Torch Compile for Speedup (Only for PyTorch 2.0+)
# model = torch.compile(model)

print("Model is ready on:", device)

Using 4 GPUs!
Model is ready on: cuda:3


In [7]:
# Directory that holds the PolitiX dataset files
path = '/a/bear.cs.xxx.edu./disk/bear-b/users/xxww/PolitiX/Dataset/'

# Read every column as a string; records are terminated by "\n"
tweets_csv = path + 'PolitiX_Final_Tweet_Dataset.csv'
df_clean_tweet = pd.read_csv(
    tweets_csv,
    dtype='str',
    lineterminator='\n',
)

In [9]:
# Shorter alias for the loaded frame (same object, not a copy)
df = df_clean_tweet

# Wherever full_text is missing, fall back to the value in text
full_text_filled = df['full_text'].fillna(df['text'])
df['full_text'] = full_text_filled

In [10]:
# Flag rows whose text begins with the "RT @" prefix
retweet_flag = df['text'].str.startswith("RT @")
df['is_Retweet'] = retweet_flag

In [11]:
# Keep only rows whose flag compares equal to False; a missing (NaN) flag
# does not compare equal to False, so such rows are excluded as well.
not_retweet = df['is_Retweet'] == False
df_tweet = df.loc[not_retweet]

In [12]:
# Renumber the remaining rows 0..n-1, discarding the old index
df_tweet = df_tweet.reset_index(drop=True)

In [13]:
def preprocess_text(df):
    """Strip markup residue and punctuation from the ``clean_tweet`` column.

    The ``clean_tweet`` column of ``df`` is rewritten in place and the same
    frame is returned.

    Steps, in order:
      1. remove the HTML entity ``&amp;`` and any remaining bare ``&``
      2. remove the standalone token ``RT``
      3. turn newlines into spaces
      4. remove ``#``, ``*`` and ``¶``
      5. remove the listed punctuation / quote characters

    Args:
        df: DataFrame with a string column named ``clean_tweet``.

    Returns:
        The same DataFrame object, with ``clean_tweet`` updated.
    """
    cleaned = df['clean_tweet']

    # The entity must go before the bare ampersand: once "&" is stripped,
    # "&amp;" can no longer match and a stray "amp" token is left in the text.
    cleaned = cleaned.str.replace("&amp;", "", regex=False)
    cleaned = cleaned.str.replace("&", "", regex=False)

    cleaned = cleaned.str.replace(r"\bRT\b", "", regex=True)
    cleaned = cleaned.str.replace("\n", " ", regex=False)

    # Literal single-character removals
    for symbol in ("#", "*", "¶"):
        cleaned = cleaned.str.replace(symbol, "", regex=False)

    cleaned = cleaned.str.replace(r'[:;.,!&\-_$/?\'‘’%★“”"]', "", regex=True)

    df.loc[:, 'clean_tweet'] = cleaned
    return df

def remove_emoji(string):
    """Return ``string`` with emoji removed and outer whitespace trimmed.

    Emoji are replaced with the empty string via ``emoji.replace_emoji``;
    the result is then stripped of leading and trailing whitespace.
    """
    without_emoji = emoji.replace_emoji(string, replace='')
    return without_emoji.strip()


#Message Clean Function
def msg_clean(msg):
    """Remove URLs, @mentions, HTML tags, digits and emoji from one message.

    URLs, mentions, tags and digit runs are each replaced by a single space
    (so neighbouring words do not get glued together); emoji removal is
    delegated to ``remove_emoji``, whose result is returned as-is.

    Args:
        msg: The message text as a ``str``.

    Returns:
        The cleaned message string.
    """
    # Remove URLs (http://, https:// or www. up to the next whitespace)
    msg = re.sub(r'https?://\S+|www\.\S+', " ", msg)

    # Remove mentions
    msg = re.sub(r'@\w+', ' ', msg)

    # Remove HTML tags. The raw-string prefix belongs outside the quotes:
    # written as 'r<.*?>' the pattern only matched a literal "r" followed by
    # a tag, so real tags survived and text such as "color<red>" lost its "r".
    msg = re.sub(r'<.*?>', ' ', msg)

    # Remove runs of digits
    msg = re.sub(r'\d+', ' ', msg)

    # Remove emoji from text
    return remove_emoji(msg)

In [18]:
# Keep the source text under its own column name
df_tweet['tweet_text'] = df_tweet['full_text']

# Per-tweet cleanup via msg_clean, applied to the text cast to str
raw_text = df_tweet['tweet_text'].astype(str)
df_tweet.loc[:, 'clean_tweet'] = raw_text.apply(msg_clean)

# Column-wide cleanup of clean_tweet via preprocess_text
df_tweet = preprocess_text(df_tweet)

# Collapse whitespace runs into one space, then trim both ends
collapsed = df_tweet['clean_tweet'].str.replace(r'\s+', ' ', regex=True)
df_tweet.loc[:, 'clean_tweet'] = collapsed.str.strip()

In [19]:
# Mark tweets whose cleaned text is empty once surrounding whitespace is ignored
empty_spaces = df_tweet['clean_tweet'].str.strip().eq("")

# Drop them and renumber the surviving rows from 0
df_tweet = df_tweet.loc[~empty_spaces].reset_index(drop=True)

In [34]:
# Narrow the frame to the two ID columns plus the cleaned text
export_columns = ['Tweet_ID', 'User_ID', 'clean_tweet']
df_small = df_tweet[export_columns]

In [22]:
# Function to compute sentiment probabilities
def get_sentiment(text):
    """Score a single text with the module-level sentiment model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The result
    is squeezed and read with ``.item()``, so exactly one text per call is
    supported (not a batch).

    Args:
        text: One input string.

    Returns:
        A 4-tuple ``(negative, neutral, positive, label)``: the three softmax
        probabilities in class-index order 0, 1, 2, followed by the name of
        the highest-scoring class.
    """
    # Class names by output index. Kept local so the function does not depend
    # on a global ``labels`` that is only assigned by a later cell (calling
    # this function before that cell ran raised NameError).
    class_names = ("Negative", "Neutral", "Positive")

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to probabilities
    logits = outputs.logits
    probs = F.softmax(logits, dim=-1).squeeze().cpu().numpy()

    # Get predicted label
    predicted_class = torch.argmax(logits, dim=-1).item()

    return probs[0], probs[1], probs[2], class_names[predicted_class]

In [29]:
import time
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Start timer
start_time = time.time()

# Sentiment labels, indexed by predicted class id
labels = ["Negative", "Neutral", "Positive"]

# Increase batch size to maximize GPU utilization
BATCH_SIZE = 256  # Adjust based on your GPU memory

# Specify device: GPU 3 when that ordinal exists, otherwise the first GPU,
# otherwise CPU. A bare "cuda:3" fails with an invalid-device-ordinal error
# on hosts that expose fewer than 4 GPUs.
gpu_index = 3 if torch.cuda.device_count() > 3 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()  # Set model to evaluation mode

# Custom Dataset to tokenize on-the-fly
class TweetDataset(Dataset):
    """Dataset that tokenizes one tweet per lookup.

    Every item is padded or truncated to ``max_length`` tokens, so all items
    share the same length.

    Args:
        tweets: Indexable, sized collection of tweet strings.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` and returning a mapping of tensors.
        max_length: Fixed token length requested from the tokenizer.
    """

    def __init__(self, tweets, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tweets = tweets

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx`` and return its tensors by field name."""
        tweet = self.tweets[idx]
        encoded = self.tokenizer(
            tweet,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) drops the first axis when it has size 1 (one tweet was
        # tokenized), leaving per-item tensors.
        item = {}
        for field, tensor in encoded.items():
            item[field] = tensor.squeeze(0)
        return item

# Create dataset & dataloader
dataset = TweetDataset(df_small["clean_tweet"].tolist(), tokenizer)

# Pinned host memory and CUDA autocast only make sense when running on a GPU
use_cuda = device.type == "cuda"
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_cuda)

# Run inference in large batches with mixed-precision
all_probs = []
all_preds = []

with torch.no_grad():
    for batch in dataloader:
        batch = {key: val.to(device, non_blocking=True) for key, val in batch.items()}

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast() and
        # is explicitly disabled on the CPU fallback.
        with torch.autocast(device_type=device.type, enabled=use_cuda):
            outputs = model(**batch)
            logits = outputs.logits

        # Under autocast the logits may be half precision; upcast before the
        # softmax so the stored probabilities are computed in float32.
        logits = logits.float()
        probs = F.softmax(logits, dim=-1).cpu().numpy()
        preds = torch.argmax(logits, dim=-1).cpu().numpy()

        all_probs.append(probs)
        all_preds.append(preds)

# Convert list of numpy arrays to a single numpy array.
# np.vstack / np.concatenate raise on an empty list, so an empty input
# yields correctly shaped empty arrays instead.
if all_probs:
    all_probs = np.vstack(all_probs)
    all_preds = np.concatenate(all_preds)
else:
    all_probs = np.empty((0, len(labels)), dtype=np.float32)
    all_preds = np.empty(0, dtype=np.int64)

# Assemble the result table: one probability column per class
class_columns = ['Negative', 'Neutral', 'Positive']
df_sentiment = pd.DataFrame(all_probs, columns=class_columns)

# Translate predicted class indices into their label strings
label_lookup = np.array(labels)
df_sentiment["Predicted_Sentiment"] = label_lookup[all_preds]

# Put the tweet IDs in the first column (taken positionally from df_small)
tweet_ids = df_small["Tweet_ID"].values
df_sentiment.insert(0, "Tweet_ID", tweet_ids)

# Display result
print(df_sentiment.head())

# End timer
end_time = time.time()
print(f"Execution Time: {end_time - start_time:.4f} seconds")


              Tweet_ID  Negative   Neutral  Positive Predicted_Sentiment
0  1829797265059324134  0.006210  0.967773  0.026199             Neutral
1  1829708822392492046  0.036774  0.478027  0.485352            Positive
2  1829395023932105078  0.196289  0.457275  0.346436             Neutral
3  1829364778994680274  0.031281  0.418701  0.549805            Positive
4  1829339189969563784  0.005066  0.930176  0.064697             Neutral
Execution Time: 1671.4514 seconds


In [33]:
# Write the sentiment table to CSV without the DataFrame index
sentiment_csv = '/a/bear.cs.xxx.edu./disk/bear-b/users/xxww/PolitiX/Dataset/All_Tweet_Sentiment.csv'
df_sentiment.to_csv(sentiment_csv, index=False)

In [36]:
# Write the IDs and cleaned tweet text to CSV without the DataFrame index
clean_text_csv = '/a/bear.cs.xxx.edu./disk/bear-b/users/xxww/PolitiX/Dataset/All_Tweet_Clean_Text.csv'
df_small.to_csv(clean_text_csv, index=False)

In [37]:
#df_small