In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai as genai
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def preprocess_twitter_data(df):
    """Return a cleaned copy of the Twitter sentiment frame.

    The input frame is not modified. Expects string columns ``tweets`` and
    ``sentiment``.

    Steps, in order:
      1. Drop rows whose ``tweets`` value repeats an earlier row, and rows
         where ``tweets`` is missing.
      2. Strip and lowercase ``tweets`` and ``sentiment``.
      3. Drop rows labelled ``irrelevant``.
      4. Remove URLs (``http...``, ``www...``) and ``@mentions`` from the
         text, then strip the whitespace they leave behind.
      5. Drop rows whose text is empty after cleaning.

    Args:
        df: DataFrame with ``tweets`` and ``sentiment`` columns.

    Returns:
        A new DataFrame; the original index labels of kept rows are preserved.
    """
    # NaN != "" evaluates to True, so missing tweets have to be dropped
    # explicitly -- an empty-string filter alone lets them through.
    cleaned = (
        df.drop_duplicates(subset=['tweets'])
        .dropna(subset=['tweets'])
        .copy()
    )

    # Normalise text and labels
    cleaned['tweets'] = cleaned['tweets'].str.strip().str.lower()
    cleaned['sentiment'] = cleaned['sentiment'].str.strip().str.lower()

    # Drop irrelevant tweets; .copy() so the assignment below does not write
    # into a slice of the previous frame (SettingWithCopyWarning).
    cleaned = cleaned[cleaned['sentiment'] != 'irrelevant'].copy()

    # Remove URLs and usernames, then trim the whitespace left where they were
    cleaned['tweets'] = (
        cleaned['tweets']
        .str.replace(r'http\S+|www\S+|@\S+', '', regex=True)
        .str.strip()
    )

    # Blank tweets, including those that consisted only of a URL or mention
    return cleaned[cleaned['tweets'] != ""].copy()

In [None]:
# Parse the CSV once: raw_df stays untouched as the reference copy, df is the
# working frame that later cells rebind.
raw_df = pd.read_csv('twitter_training.csv')
df = raw_df.copy()
len(df)

In [None]:
# Always derive the cleaned frame from the untouched raw_df so that re-running
# this cell yields the same result instead of re-processing its own output.
df = preprocess_twitter_data(raw_df)

In [None]:
# Number of rows left after preprocessing
df.shape[0]

In [None]:
# Bar chart of how many tweets carry each sentiment label
fig, ax = plt.subplots(figsize=(6, 3))
label_counts = df['sentiment'].value_counts()
label_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Sentiment Distribution in Twitter Dataset')
plt.show()

In [None]:
# Load variables from a local .env file into the process environment, then
# build the OpenAI client from the OPENAI_API_KEY entry.
load_dotenv()
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [None]:
def get_sentiment(tweet, entity, no_of_shots=0, random_state=None):
    """Classify the sentiment of a tweet with GPT-4o, optionally few-shot.

    Relies on two notebook-level globals: ``client`` (the OpenAI client) and
    ``df`` (a frame with ``tweets``, ``sentiment`` and ``entity`` columns that
    supplies the few-shot examples).

    Args:
        tweet: Text to classify.
        entity: Only rows of ``df`` whose ``entity`` equals this value are
            used as few-shot examples. Ignored for zero-shot prompts.
        no_of_shots: Number of examples to draw per sentiment class. Values
            <= 0 produce a zero-shot prompt.
        random_state: Optional seed forwarded to ``Series.sample`` so the
            drawn examples are reproducible. ``None`` keeps them random.

    Returns:
        One of ``'positive'``, ``'negative'``, ``'neutral'`` or ``'invalid'``.
        ``'invalid'`` is returned when the reply names more than one label,
        or names none of them and does not say "mixed".
    """
    labels = ('positive', 'negative', 'neutral')

    prompt = f"""Classify the sentiment of the following tweet as positive, negative, or neutral.\nGive only the sentiment as single word output in lowercase. This is a strict requirement.\nIf your output is mixed, output neutral.\nTarget Sentence: "{tweet}"\nSentiment:"""

    if no_of_shots > 0:
        # Draw up to `no_of_shots` distinct examples per class in one go.
        # Filtering the target tweet out of the pool *before* sampling avoids
        # re-sampling in a loop, which never terminates when the target is the
        # only candidate. An empty pool simply contributes no examples.
        # NOTE(review): the exclusion is an exact string comparison; if `df`
        # holds normalised text while `tweet` is raw, the target can still be
        # drawn as an example -- confirm against the caller.
        drawn = {}
        for label in labels:
            pool = df.loc[(df['sentiment'] == label) & (df['entity'] == entity), 'tweets']
            pool = pool[pool != tweet]
            take = min(no_of_shots, len(pool))
            drawn[label] = pool.sample(n=take, random_state=random_state).tolist() if take else []

        # Interleave as positive / negative / neutral per shot
        examples = ""
        for shot in range(max(len(samples) for samples in drawn.values())):
            for label in labels:
                if shot < len(drawn[label]):
                    examples += """\nExample: {}\nSentiment: {}""".format(drawn[label][shot], label)

        # With no usable example at all, fall back to the zero-shot prompt
        if examples:
            header = """Use the following tweets as example for few shot learning. Each example is followed by a sentiment label."""
            prompt = header + examples + "\n" + prompt

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a sentiment analysis expert."},
                  {"role": "user", "content": prompt}],
        temperature=0
    )

    # `content` can be None (e.g. a refusal); treat that as an empty reply
    reply = (response.choices[0].message.content or "").strip().lower()

    # Map the free-text reply onto a canonical label so downstream metrics
    # only ever see the three classes or 'invalid' (never "positive." etc.).
    mentioned = [label for label in labels if label in reply]
    if len(mentioned) == 1:
        return mentioned[0]
    if not mentioned and 'mixed' in reply:
        return 'neutral'
    return 'invalid'

In [None]:
# TODO: replace with the actual batched evaluation file
df_sample = pd.read_csv('twitter_training.csv')
# The prompts only allow positive/negative/neutral, so rows labelled
# 'irrelevant' (or with a missing tweet/label) can never be scored correctly;
# leave them out of the evaluation set.
df_sample = df_sample.dropna(subset=['tweets', 'sentiment'])
df_sample = df_sample[df_sample['sentiment'].str.strip().str.lower() != 'irrelevant'].reset_index(drop=True)

In [None]:
# zero shot gpt
zero_shot_predictions = [get_sentiment(tweet, entity) for tweet, entity in zip(df_sample['tweets'], df_sample['entity'])]
y_true = df_sample['sentiment'].str.lower()
zero_shot_accuracy = accuracy_score(y_true, zero_shot_predictions)
# Report the accuracy like the one-shot and three-shot cells do
print("Zero-Shot Accuracy:", zero_shot_accuracy)
print("\nZero-Shot Classification Report:\n", classification_report(y_true, zero_shot_predictions, digits=4))
print(confusion_matrix(y_true, zero_shot_predictions))

In [None]:
# GPT-4o, one-shot: one positive / negative / neutral example per prompt
one_shot_predictions = list(map(
    lambda text, topic: get_sentiment(text, topic, no_of_shots=1),
    df_sample['tweets'],
    df_sample['entity'],
))
one_shot_accuracy = accuracy_score(y_true, one_shot_predictions)
print("One-Shot Accuracy:", one_shot_accuracy)
print(
    "\nOne-Shot Classification Report:\n",
    classification_report(y_true, one_shot_predictions, digits=4),
)
print(confusion_matrix(y_true, one_shot_predictions))

In [None]:
# GPT-4o, three-shot: three positive / negative / neutral examples per prompt
three_shot_predictions = list(map(
    lambda text, topic: get_sentiment(text, topic, no_of_shots=3),
    df_sample['tweets'],
    df_sample['entity'],
))
three_shot_accuracy = accuracy_score(y_true, three_shot_predictions)
print("Few-Shot Accuracy:", three_shot_accuracy)
print(
    "\nThree-Shot Classification Report:\n",
    classification_report(y_true, three_shot_predictions, digits=4),
)
print(confusion_matrix(y_true, three_shot_predictions))

In [None]:
# Load variables from a local .env file, then hand the GEMINI_API_KEY entry
# to the google.generativeai client configuration.
load_dotenv()
genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))

In [None]:
def get_sentiment_gemini(tweet, entity, no_of_shots=0, random_state=None, verbose=False):
    """Classify the sentiment of a tweet with Gemini 1.5 Pro, optionally few-shot.

    Relies on two notebook-level globals: ``genai`` (the configured
    ``google.generativeai`` module) and ``df`` (a frame with ``tweets``,
    ``sentiment`` and ``entity`` columns that supplies the few-shot examples).

    Args:
        tweet: Text to classify.
        entity: Only rows of ``df`` whose ``entity`` equals this value are
            used as few-shot examples. Ignored for zero-shot prompts.
        no_of_shots: Number of examples to draw per sentiment class. Values
            <= 0 produce a zero-shot prompt.
        random_state: Optional seed forwarded to ``Series.sample`` so the
            drawn examples are reproducible. ``None`` keeps them random.
        verbose: When True, print the prompt, a "received response" marker and
            the final ``tweet``/label pair. Off by default so long evaluation
            runs do not flood the notebook output.

    Returns:
        One of ``'positive'``, ``'negative'``, ``'neutral'`` or ``'invalid'``.
        ``'invalid'`` is returned when the reply names more than one label,
        names none of them and does not say "mixed", or when the response
        carries no text at all.
    """
    labels = ('positive', 'negative', 'neutral')

    prompt = f"""Classify the sentiment of the following tweet as positive, negative, or neutral.
    Give only the sentiment as a single lowercase word — "positive", "negative", or "neutral". This is a strict requirement.
    If the sentiment is unclear or mixed, output "neutral".
    Target Sentence: "{tweet}"
    Sentiment:"""

    if no_of_shots > 0:
        # Draw up to `no_of_shots` distinct examples per class in one go.
        # Filtering the target tweet out of the pool *before* sampling avoids
        # re-sampling in a loop, which never terminates when the target is the
        # only candidate. An empty pool simply contributes no examples.
        # NOTE(review): the exclusion is an exact string comparison; if `df`
        # holds normalised text while `tweet` is raw, the target can still be
        # drawn as an example -- confirm against the caller.
        drawn = {}
        for label in labels:
            pool = df.loc[(df['sentiment'] == label) & (df['entity'] == entity), 'tweets']
            pool = pool[pool != tweet]
            take = min(no_of_shots, len(pool))
            drawn[label] = pool.sample(n=take, random_state=random_state).tolist() if take else []

        # Interleave as positive / negative / neutral per shot
        examples = ""
        for shot in range(max(len(samples) for samples in drawn.values())):
            for label in labels:
                if shot < len(drawn[label]):
                    examples += f"\nExample: {drawn[label][shot]}\nSentiment: {label}"

        # With no usable example at all, fall back to the zero-shot prompt
        if examples:
            header = "Use the following tweets as examples for few-shot learning. Each example is followed by a sentiment label:\n"
            prompt = header + examples + "\n" + prompt

    if verbose:
        print("prompting", prompt)

    model = genai.GenerativeModel(model_name="models/gemini-1.5-pro")
    # temperature=0 mirrors the GPT-4o path so both models are compared under
    # the same decoding setting.
    response = model.generate_content(prompt, generation_config={"temperature": 0})
    if verbose:
        print("received response")

    try:
        reply = response.text.strip().lower()
    except ValueError:
        # `.text` raises ValueError when the response has no text part (for
        # instance a blocked prompt); score it as invalid instead of aborting
        # the whole evaluation run.
        reply = ""

    # Map the free-text reply onto a canonical label so downstream metrics
    # only ever see the three classes or 'invalid'.
    mentioned = [label for label in labels if label in reply]
    if len(mentioned) == 1:
        sentiment = mentioned[0]
    elif not mentioned and 'mixed' in reply:
        sentiment = 'neutral'
    else:
        sentiment = 'invalid'

    if verbose:
        print(tweet, sentiment)
    return sentiment


In [None]:
# zero shot gemini
zero_shot_predictions = [get_sentiment_gemini(tweet, entity) for tweet, entity in zip(df_sample['tweets'], df_sample['entity'])]
y_true = df_sample['sentiment'].str.lower()
zero_shot_accuracy = accuracy_score(y_true, zero_shot_predictions)
# Show the same three outputs as the other evaluation cells
print("Zero-Shot Accuracy:", zero_shot_accuracy)
print("\nZero-Shot Classification Report:\n", classification_report(y_true, zero_shot_predictions, digits=4))
print(confusion_matrix(y_true, zero_shot_predictions))

In [None]:
# 1 shot gemini
one_shot_predictions = [get_sentiment_gemini(tweet, entity, no_of_shots=1) for tweet, entity in zip(df_sample['tweets'], df_sample['entity'])]
one_shot_accuracy = accuracy_score(y_true, one_shot_predictions)
print("One-Shot Accuracy:", one_shot_accuracy)
# digits=4 keeps this report at the same precision as the other report cells
print("\nOne-Shot Classification Report:\n", classification_report(y_true, one_shot_predictions, digits=4))
print(confusion_matrix(y_true, one_shot_predictions))

In [None]:
# Gemini, three-shot: three positive / negative / neutral examples per prompt
three_shot_predictions = list(map(
    lambda text, topic: get_sentiment_gemini(text, topic, no_of_shots=3),
    df_sample['tweets'],
    df_sample['entity'],
))
three_shot_accuracy = accuracy_score(y_true, three_shot_predictions)
print("Few-Shot Accuracy:", three_shot_accuracy)
print(
    "\nThree-Shot Classification Report:\n",
    classification_report(y_true, three_shot_predictions, digits=4),
)
print(confusion_matrix(y_true, three_shot_predictions))