1. Load the CSV with tweets (or Instagram posts, TikTok comments, etc.)

In [None]:
import pandas as pd
# Location of the exported posts. The following steps read the "type" and "text"
# columns from this file.
csv_path = "Bellingcat 2023.csv"
tweets = pd.read_csv(csv_path)

2. Filter out quotes and replies (this applies to Twitter data only; other sources may need a different filter)

In [None]:
# Keep only the rows whose "type" column equals "Post"; every other row is dropped.
is_post = tweets["type"].eq("Post")
tweets = tweets.loc[is_post]

3. Detect language of each tweet

In [None]:
from langdetect import detect, LangDetectException

def detect_no_fail(text):
    """Detect the language of ``text``, returning ``None`` instead of failing.

    Args:
        text: Content of a tweet's text cell. Anything that is not a non-blank
            string (for example the NaN that pandas uses for an empty CSV cell)
            is treated as "language unknown".

    Returns:
        Whatever ``langdetect.detect`` returns for the text, or ``None`` when the
        value is not a usable string or langdetect raises ``LangDetectException``.
    """
    # Empty CSV cells show up as NaN (a float), not as a string; skip them rather
    # than handing a non-string to langdetect and aborting the whole column.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        return detect(text)
    except LangDetectException:
        # langdetect could not extract any usable features from the text.
        return None

# langdetect is randomized by default; pinning the seed makes repeated runs of the
# notebook assign the same language to short or ambiguous tweets.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Map over the text column directly (no row-wise DataFrame.apply) and build a new
# frame with `assign`, so the column is not written into a filtered slice.
tweets = tweets.assign(lang=tweets["text"].map(detect_no_fail))

4. Filter out non-English tweets

In [None]:
# Keep English tweets only. The explicit copy turns the filtered result into an
# independent frame, so the columns added in later steps do not trigger pandas'
# SettingWithCopyWarning.
tweets = tweets[tweets["lang"] == "en"].copy()

5. Vectorize each tweet 

In [None]:
from InstructorEmbedding import INSTRUCTOR
# Instruction text paired with every tweet before encoding.
clustering_instruction = 'Represent the tweet for clustering: '

model = INSTRUCTOR('hkunlp/instructor-large')

# One [instruction, text] pair per tweet, in the same order as the rows of `tweets`.
instructions = [[clustering_instruction, text] for text in tweets["text"].to_list()]
embeddings = model.encode(instructions)

6. Use the MiniBatchKMeans algorithm to cluster the tweets

In [None]:
import sklearn.cluster
# A fixed random_state keeps the cluster assignment reproducible between runs.
clustering_model = sklearn.cluster.MiniBatchKMeans(n_clusters=15, random_state=42)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

# Store each tweet's cluster id on the frame; the topic-naming step reads
# tweets["cluster"]. `embeddings` was built from tweets["text"] in row order, so the
# labels line up with the rows by position.
tweets = tweets.assign(cluster=cluster_assignment)

7. Find the 10 most representative tweets (the 10 nearest to the cluster center) for each cluster

In [None]:
import numpy as np

# Inputs: `embeddings` (one vector per tweet; assumed to be a NumPy array -- confirm if
# the encoder is configured to return tensors), the fitted `clustering_model`, and
# `cluster_assignment` (the cluster label of every tweet).
centroids = clustering_model.cluster_centers_

# Number of tweets to keep per cluster; adjust as needed.
n_representatives = 10

representative_tweets = {}
# Walk over the centroids that were actually fitted instead of repeating the cluster
# count here, so this cell stays correct when n_clusters is changed in the step above.
for cluster_num, centroid in enumerate(centroids):
    # Positions (row numbers within `embeddings`) of the tweets in this cluster
    indices = np.where(cluster_assignment == cluster_num)[0]

    # Euclidean distance of each of those tweets to the cluster centroid
    distances = np.linalg.norm(embeddings[indices] - centroid, axis=1)

    # Positions, within `indices`, of the tweets closest to the centroid
    representative_idx = np.argsort(distances)[:n_representatives]

    # Map back to positions in the full embeddings array
    representative_tweets[cluster_num] = indices[representative_idx]

# `representative_tweets` now maps each cluster number to the positional indices of its
# most representative tweets (fewer than `n_representatives` for small clusters).

8. Use the GPT-4 Turbo API to generate topics from the representative tweets

In [None]:
import openai
import time
from tqdm import tqdm

def get_cluster_name(tweets, general_topic="Bellingcat", max_retries=3, retry_delay=60):
    """Ask the chat model for a short topic that unites the given tweets.

    Args:
        tweets: Sequence of tweet texts; each one is sent as its own user message.
        general_topic: Broad subject of the whole dataset. It is named in the prompt
            so that the model looks for something more specific.
        max_retries: How many times to retry after a rate-limit error before giving up.
        retry_delay: Seconds to wait before each retry.

    Returns:
        The content of the first choice in the model's response, or the string
        "Undefined" when no tweets are given (no request is sent in that case).

    Raises:
        openai.error.RateLimitError: If the request is still rate limited after
            ``max_retries`` retries.
    """
    import os

    # Nothing to summarise: answer the way the prompt asks the model to, without
    # spending an API call.
    if len(tweets) == 0:
        return "Undefined"

    # Take the key from the environment instead of hard-coding it in the notebook.
    # When the variable is unset, whatever key is already configured on the openai
    # module is left untouched.
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key

    prompt = f"Review the tweets contained in the previous messages. Identify and provide a single, overarching topic that unites these tweets, using a maximum of three words. Ensure the response contains only this topic, with no additional comments or information. If no common topic can be determined, respond with 'Undefined'. General topic of this tweets is `{general_topic}`, don't use it, try to find more specific topic"

    # The prompt is sent twice: as the system message and again after the tweets,
    # which is what "previous messages" in the prompt refers to.
    messages = [{"role": "system", "content": prompt}]
    messages.extend({"role": "user", "content": tweet} for tweet in tweets)
    messages.append({"role": "user", "content": prompt})

    for attempt in range(max_retries + 1):
        try:
            response = openai.ChatCompletion.create(model="gpt-4-1106-preview", messages=messages)
            break
        except openai.error.RateLimitError:
            # Out of retries: surface the error to the caller.
            if attempt == max_retries:
                raise
            time.sleep(retry_delay)

    return response.choices[0].message['content']

# Name every cluster from the texts of its most representative tweets.
for cluster in tqdm(tweets["cluster"].unique()):
    # `representative_tweets` stores row positions, hence the positional lookup.
    sample_texts = [tweets["text"].iloc[position] for position in representative_tweets[cluster]]
    cluster_name = get_cluster_name(sample_texts)

    # Write the generated name onto every tweet that belongs to this cluster.
    in_cluster = tweets["cluster"] == cluster
    tweets.loc[in_cluster, "cluster_name"] = cluster_name

9. Estimate sentiment of each tweet

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Checkpoint used for sentiment scoring; the tokenizer and the classifier are both
# loaded from it.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def preprocess_tweet(tweet):
    """Return the tweet unchanged.

    Placeholder hook: no cleaning is applied at the moment, so the input is handed
    back as-is.
    """
    unchanged = tweet
    return unchanged

def analyze_sentiment(tweet):
    """Return the sentiment class probabilities for one tweet.

    Args:
        tweet: Text to classify; it is tokenized with the module-level ``tokenizer``
            (truncation enabled) and scored with the module-level ``model``.

    Returns:
        A 1-D NumPy array with the softmax of the model's logits for this tweet
        (the first row of the batch).
    """
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True)

    # Inference only: disabling autograd avoids building a gradient graph for every
    # tweet, which saves memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=1)
    return probabilities.cpu().numpy()[0]

def scale_score(probabilities):
    """Collapse three class probabilities into one signed score.

    Args:
        probabilities: Three values, assumed to be ordered
            [negative, neutral, positive].

    Returns:
        The weighted sum with weights -10, 0 and +10: the negative share pulls the
        score down, the positive share pushes it up, the neutral share adds nothing.
    """
    negative, neutral, positive = probabilities

    negative_part = negative * (-10)
    neutral_part = neutral * 0
    positive_part = positive * 10

    return negative_part + neutral_part + positive_part


# Full per-tweet pipeline: preprocess -> class probabilities -> single scaled score.
def get_sentiment(tweet):
    """Return the scaled sentiment score of one tweet.

    The tweet goes through ``preprocess_tweet``, is scored by ``analyze_sentiment``,
    and the resulting probabilities are reduced to one number by ``scale_score``.
    """
    probabilities = analyze_sentiment(preprocess_tweet(tweet))
    return scale_score(probabilities)

# Score the text column directly instead of applying a lambda row by row over the
# whole frame, and build a new frame with `assign` so the column is not written into
# a filtered slice.
tweets = tweets.assign(sentiment=tweets["text"].map(get_sentiment))