In [None]:
import glob
import os
import re
from datetime import datetime
from urllib.parse import quote

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
plt.style.use('ggplot')

# API Request

In [None]:
base_url = "https://mastodon.social"
# Prefer the MASTODON_ACCESS_TOKEN environment variable so a real token never
# has to be pasted into (and saved with) the notebook; the placeholder is only
# a fallback.
access_token = os.environ.get("MASTODON_ACCESS_TOKEN", "Your access token")


def search_hashtag(hashtag, limit=40, max_id=None, timeout=30):
    """Fetch one page of the public timeline for a hashtag.

    Args:
        hashtag: Tag name. Surrounding whitespace and a leading '#' are
            dropped, and the rest is percent-encoded, so the tag cannot
            break the URL path.
        limit: Maximum number of posts requested for this page.
        max_id: When truthy, sent as the ``max_id`` query parameter so the
            server returns posts older than that id.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error instead of hanging indefinitely.

    Returns:
        The decoded JSON body when the server answers 200, otherwise None.
    """
    headers = {"Authorization": f"Bearer {access_token}"}

    # An unescaped '#' would start a URL fragment and truncate the path.
    tag = quote(hashtag.strip().lstrip("#"), safe="")
    endpoint = f"{base_url}/api/v1/timelines/tag/{tag}"

    # Let requests build and encode the query string.
    params = {"limit": limit}
    if max_id:
        params["max_id"] = max_id

    response = requests.get(endpoint, headers=headers, params=params, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    return None

In [None]:
def retrieve_hashtag_posts(hashtag, max_posts=10000):
    """Page through a hashtag timeline and collect up to ``max_posts`` posts.

    Pages are requested through ``search_hashtag``, using the id of the last
    post on a page as the ``max_id`` cursor for the next request.

    Args:
        hashtag: Tag passed through to ``search_hashtag``.
        max_posts: Upper bound on the number of posts collected.

    Returns:
        A list of dicts with the keys "Content", "Author" and "Date", in the
        order the posts were returned.
    """
    full_page = 40  # largest page requested per call
    collected = []
    cursor = None

    while len(collected) < max_posts:
        remaining = max_posts - len(collected)
        page = search_hashtag(hashtag, limit=min(full_page, remaining), max_id=cursor)

        # Both None and an empty page end the crawl.
        if not page:
            break

        for post in page:
            collected.append({
                "Content": post["content"],
                "Author": post["account"]["username"],
                "Date": post["created_at"],
            })

        # Fewer posts than a full page: stop paging.
        if len(page) < full_page:
            break

        cursor = page[-1]["id"]

    return collected

# Crawl settings: replace the placeholder with the hashtag to collect and
# cap how many posts are fetched.
hashtag = "Your #"
max_posts = 10000

# Fetch the posts and load the list of per-post dicts into a DataFrame
# (one row per post).
data = retrieve_hashtag_posts(hashtag, max_posts)
df = pd.DataFrame(data)

# Preprocessing

In [None]:
def preprocess_text(text):
    """Turn a post's HTML content into plain, whitespace-normalised text.

    The HTML is parsed before any URL handling, so stripping URLs cannot
    swallow markup or words that sit directly next to a link.

    Args:
        text: HTML fragment of a post. Non-string values (None, NaN) are
            treated as an empty post.

    Returns:
        The visible text with URL links removed, runs of whitespace collapsed
        to single spaces and leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        return ''

    soup = BeautifulSoup(text, 'html.parser')

    # Drop anchors whose visible text is itself a URL; anchors with other
    # text (e.g. "#tag" or "@user") keep their text.
    for link in soup.find_all('a'):
        if re.match(r'https?://', link.get_text().strip()):
            link.decompose()

    # get_text() joins nodes without a separator, so mark line and paragraph
    # breaks with a space to keep neighbouring words apart.
    for line_break in soup.find_all('br'):
        line_break.replace_with(' ')
    for paragraph in soup.find_all('p'):
        paragraph.append(' ')

    cleaned_text = soup.get_text()

    # Remove bare URLs that were not wrapped in an anchor.
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)

    # Collapse whitespace runs and trim the ends.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

# Apply preprocessing to every entry of the "Content" column and store the
# result in a new "Cleaned_Content" column
df["Cleaned_Content"] = df["Content"].apply(preprocess_text)

# Display the cleaned content of the first post (row label 0)
print("Cleaned content of the first tweet:")
print(df.loc[0, "Cleaned_Content"])


In [None]:
# Persist the posts (raw and cleaned columns) to CSV; index=False leaves the
# DataFrame's row index out of the file.
csv_filename = "Your#_posts.csv"
df.to_csv(csv_filename, index=False)

# Sentiment Analysis

In [None]:
# Reload the saved posts. The path must match the file name used when the
# posts were written to CSV.
df = pd.read_csv("Your#_posts.csv")

**NLTK**

In [None]:
# Pick one cleaned post (row label 42) as the running example for the
# tokenising and scoring demos that use `example`.
example = df['Cleaned_Content'][42]
print(example)

In [None]:
# Walk the example through NLTK: tokenise, part-of-speech tag, then chunk
# named entities. Only the last bare expression of a cell is echoed, so the
# intermediate previews are printed explicitly.
tokens = nltk.word_tokenize(example)
print(tokens[:15])
tagged = nltk.pos_tag(tokens)
print(tagged[:15])
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

**VADER**

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Analyzer instance used to score the single example post.
sia = SentimentIntensityAnalyzer()

In [None]:
# Score the example post; the result is shown as the cell output.
sia.polarity_scores(example)

In [None]:
def process_sentiment_analysis(df):
    """Score every post with VADER and return the scores joined to the posts.

    Args:
        df: DataFrame with a 'Cleaned_Content' text column. It is left
            unmodified, so the calling cell can be re-run safely.

    Returns:
        A new DataFrame with one row per input row. Columns are ordered
        'Id', 'neg', 'neu', 'pos', 'compound', followed by the remaining
        input columns. 'Id' is a 1-based running number unless the input
        already has an 'Id' column, which is then kept as is.
    """
    score_columns = ['neg', 'neu', 'pos', 'compound']

    # reset_index returns a new frame, so inserting 'Id' below does not touch
    # the caller's DataFrame (and does not fail on a second run).
    posts = df.reset_index(drop=True)
    if 'Id' not in posts.columns:
        posts.insert(0, 'Id', range(1, 1 + len(posts)))

    sia = SentimentIntensityAnalyzer()

    rows = []
    for text in tqdm(posts['Cleaned_Content'], total=len(posts)):
        # Empty posts come back from a CSV round-trip as NaN; score them as
        # empty text instead of handing a float to the analyzer.
        if not isinstance(text, str):
            text = ''
        rows.append(sia.polarity_scores(text))

    scores = pd.DataFrame(rows, columns=score_columns, index=posts.index)

    # Scores are aligned by row position, so no key-based merge is needed.
    # Stale score columns in the input are replaced by the fresh ones.
    other_columns = [
        column for column in posts.columns
        if column != 'Id' and column not in score_columns
    ]
    return pd.concat([posts[['Id']], scores, posts[other_columns]], axis=1)

In [None]:
# Score every post; vaders_df is the input to the plotting and bucketing steps.
vaders_df = process_sentiment_analysis(df)

**Visualization**

In [None]:
def plot_sentiment_by_year(df, year, name):
    """Scatter-plot the compound sentiment of the posts from one calendar year.

    Args:
        df: DataFrame with a "Date" column that ``pd.to_datetime`` can parse
            and a "compound" score column. Side effect: the "Date" column is
            converted to datetime in place.
        year: Calendar year to keep, e.g. 2023.
        name: Label interpolated into the plot title.

    Returns:
        The rows of ``df`` dated in ``year``. When no row matches, a message
        is printed, nothing is plotted and the empty selection is returned.
    """
    # Convert the "Date" column to datetime
    df['Date'] = pd.to_datetime(df['Date'])

    # Filter data for the specified year
    df_year = df[df['Date'].dt.year == year]

    if df_year.empty:
        print(f"No data available for the year {year}.")
        return df_year

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.scatter(df_year['Date'], df_year['compound'], color='b', marker='o')
    plt.title(f'Sentiment Analysis of {name}-Posts in {year}')
    plt.xlabel('Date')
    plt.ylabel('Compound Sentiment Score')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()

    # Display the plot
    plt.show()

    # Hand the filtered rows back so callers can keep working with them.
    return df_year

In [None]:
# Scatter the 2023 compound scores; the name keeps whatever the helper returns.
vaders_df_2023 = plot_sentiment_by_year(vaders_df, year=2023, name="Your#")

In [None]:
def add_sentiment_intensity_column(dataframe):
    """Bucket compound scores into a five-level ``sent_int_score`` column.

    Buckets (by the ``compound`` value):
        > 0.5 -> 2, > 0.1 -> 1, > -0.1 -> 0, > -0.5 -> -1, otherwise -2.
    A NaN score fails every comparison and therefore lands in -2.

    Args:
        dataframe: DataFrame with a numeric 'compound' column. The new
            'sent_int_score' column is added to it in place.

    Returns:
        The same DataFrame object, so the result can be bound to a name or
        chained.
    """
    # np.select takes the first condition that holds, so each threshold only
    # needs a lower bound; the order of the list is what separates the buckets.
    conditions = [
        (dataframe['compound'] > 0.5),
        (dataframe['compound'] > 0.1),
        (dataframe['compound'] > -0.1),
        (dataframe['compound'] > -0.5)
    ]

    choices = [2, 1, 0, -1]

    dataframe['sent_int_score'] = np.select(conditions, choices, default=-2)
    return dataframe

In [None]:
# Add the sent_int_score bucket column to vaders_df.
vaders_df_sentint = add_sentiment_intensity_column(vaders_df)

In [None]:
def plot_sentiment_intensity(dataframe, title_name):
    """Bar-chart how many posts fall into each sentiment-intensity bucket.

    Args:
        dataframe: DataFrame with a 'compound' score column and a
            'sent_int_score' column holding the bucket codes -2..2.
        title_name: Label interpolated into the plot title.

    Side effects:
        Shows the figure and prints the mean compound score and the number
        of rows.
    """
    # Calculate the average sentiment score
    avg_sentiment = dataframe['compound'].mean()

    # Map sentiment intensity scores to labels
    sentiment_labels = {
        -2: "Highly Negative",
        -1: "Negative",
        0: "Neutral",
        1: "Positive",
        2: "Highly Positive"
    }

    # Count posts per bucket. Reindexing on all five codes keeps each count
    # under its own label and shows buckets without posts as 0; without it a
    # missing bucket would shift or mismatch the bars.
    sentiment_counts = (
        dataframe['sent_int_score']
        .value_counts()
        .reindex(list(sentiment_labels), fill_value=0)
    )

    # Plotting
    plt.figure(figsize=(8, 5))
    plt.bar(list(sentiment_labels.values()), sentiment_counts.to_numpy(), color='blue')
    plt.title('Sentiment Intensity Distribution of {}-Posts'.format(title_name))
    plt.xlabel('Sentiment Intensity')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.show()

    print("Average Compound Sentiment Score:", avg_sentiment)
    print("Total Posts:", len(dataframe))

In [None]:
# Bar chart of the number of posts per sentiment-intensity bucket.
plot_sentiment_intensity(vaders_df_sentint, "Your#")