In [None]:
# Install necessary libraries (run in notebook cell)
!pip install transformers
!pip install --upgrade datasets


In [None]:


# Standard data libs
import numpy as np
import pandas as pd
import os
import re




In [None]:
# Hugging Face Transformers & Datasets
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

from datasets import load_dataset

# Read the Reddit CSV through the Hugging Face `datasets` CSV loader.
# Passing split="train" returns the Dataset itself rather than a dict of splits.
dataset = load_dataset(
    "csv",
    data_files="NFL_reddit_data_2021.csv",
    split="train",
)

# Work with a pandas DataFrame for the rest of the notebook.
df_hf = dataset.to_pandas()

# Preview the first five rows.
df_hf.head(5)


In [None]:
from transformers import AutoTokenizer

# Fetch the pretrained tokenizer for the uncased BERT base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

def hf_normalize(texts, max_length=128, tok=None):
    """
    Tokenize raw text and return model-ready PyTorch tensors.

    Every sequence is truncated and padded to exactly ``max_length`` tokens,
    so the returned tensors have one row per input text and ``max_length``
    columns.

    Parameters
    ----------
    texts : str or list of str
        Raw text(s) to encode.
    max_length : int, default 128
        Fixed sequence length used for both truncation and padding.
    tok : callable, optional
        Tokenizer to use. When omitted, the notebook-level ``tokenizer`` is
        looked up at call time. Pass one explicitly to avoid depending on
        whichever tokenizer that global name is bound to when the cell runs.

    Returns
    -------
    The tokenizer's output (for Hugging Face tokenizers, a mapping exposing
    ``input_ids`` and ``attention_mask``).
    """
    # Resolve lazily so the notebook global is only touched when needed.
    active_tokenizer = tokenizer if tok is None else tok
    return active_tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'  # for PyTorch; use tf for TensorFlow
    )

# Smoke test: encode two hand-written sentences and check the tensor shape.
sample_texts = [
    "I love NLP! It's fun — even with emojis 😊",
    "NLTK used to require stopword filtering, lemmatization, etc.",
]
encoded = hf_normalize(sample_texts)
# Two texts, each padded/truncated to the default max_length of 128.
print(encoded["input_ids"].shape)

In [None]:
# Encode the whole "text" column in one call.
# Every value is cast to str first, so non-string cells are tokenized as their string form.
batch = df_hf["text"].astype(str).to_list()
encoded_batch = hf_normalize(texts=batch)

In [None]:
# Optional: custom cleanup for noisy text
def clean_text(text):
    """
    Strip URLs and HTML/XML markup from a string.

    Removed:
      * ``http://`` / ``https://`` URLs (up to the next whitespace),
      * HTML comments (``<!-- ... -->``),
      * tag-shaped markup: ``<`` followed by an optional ``/``, ``!`` or ``?``
        and then a letter, up to the closing ``>``.

    Angle brackets that do not look like a tag are kept, so ordinary text
    such as ``"I <3 the Bills > Pats"`` is returned unchanged instead of
    losing everything between ``<`` and ``>``.

    Whitespace around a removed span is left as is.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        ``text`` with the matched spans deleted.
    """
    # Requiring a letter after '<' is what protects "<3", "a < b > c", etc.
    return re.sub(
        r'https?://\S+|<!--.*?-->|<[/!?]?[A-Za-z][^<>]*>',
        '',
        text,
    )

# Cleaned copy of the "text" column as a plain Python list of strings.
texts = [clean_text(raw) for raw in df_hf["text"].astype(str)]

In [None]:
# The tokenizer output supports both attribute access and key access;
# either form yields the same tensors.

# Attribute style
input_ids = encoded_batch.input_ids
attention_mask = encoded_batch.attention_mask

# Dictionary style
input_ids = encoded_batch['input_ids']
attention_mask = encoded_batch['attention_mask']

# Sanity check: both tensors should be (number of texts, max_length).
print(input_ids.shape)
print(attention_mask.shape)


In [None]:
# Take the first row of the encoded batch, i.e. the token IDs of the first text.
first_ids = input_ids[0]       # 1D tensor of length max_length (rows are padded to max_length)
print(first_ids)

In [None]:
# Map the token IDs back to their vocabulary strings and show the first 20.
# NOTE: this uses the notebook-level `tokenizer`; a later cell rebinds that name
# to a different checkpoint, so run this cell before that one to decode with the
# same vocabulary that produced `first_ids`.
tokens = tokenizer.convert_ids_to_tokens(first_ids)
print(tokens[:20])

In [None]:
# Convert the tensor of token IDs into a plain Python list for inspection.
first_ids_list = first_ids.tolist()       # now a regular Python list of ints
print(first_ids_list)

In [None]:
from transformers import pipeline

# Create a binary (POSITIVE / NEGATIVE) sentiment analysis pipeline.
# The checkpoint is named explicitly instead of relying on the library's
# implicit default, so results do not change if that default changes between
# transformers releases (and the "no model was supplied" warning goes away).
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)


In [None]:
# Run inference on all texts.
# truncation/max_length clip long posts to the model's 512-token limit;
# without them, any post longer than that makes the forward pass fail.
results = sentiment_pipe(
    df_hf["text"].astype(str).tolist(),
    batch_size=16,
    truncation=True,
    max_length=512,
)

# Extract sentiment labels
labels = [res["label"] for res in results]

# Map to numeric scores: POSITIVE → +1, anything else (NEGATIVE) → −1
scores = [1 if label == "POSITIVE" else -1 for label in labels]

# Compute metrics
overall_sentiment_score = sum(scores)
# Guard against an empty dataset: the share of positives is undefined (NaN)
# rather than a ZeroDivisionError.
percent_positive = (
    (labels.count("POSITIVE") / len(labels)) * 100 if labels else float("nan")
)

# Put into a one-row DataFrame for display
metrics_df = pd.DataFrame({
    "overall_sentiment_score": [overall_sentiment_score],
    "percent_positive": [percent_positive]
})
metrics_df


In [None]:
# from transformers import pipeline

# # 1. Initialize sentiment pipeline
# # sentiment_pipe = pipeline("sentiment-analysis")

# # 2. Batch inference on your cleaned text column
# texts = df_hf["text"].astype(str).tolist()
# results = sentiment_pipe(texts, batch_size=16)

# # 3. Extract labels and compute numeric scores
# labels = [res["label"] for res in results]
# scores = [1 if label == "POSITIVE" else (-1 if label == "NEGATIVE" else 0) for label in labels]

# # 4. Assign sentiments back to dataframe and compute metrics
# df_hf["sentiment"] = labels
# overall_sentiment_score = sum(scores)
# percent_positive = labels.count("POSITIVE") / len(labels) * 100

# # 5. Display subset and save results
# print(df_hf[["player", "text", "sentiment"]].head(10))

# df_hf.to_csv("/content/NFL_reddit_sentiment_analysis.csv", index=False)
# print("Sentiment Analysis CSV saved at: /content/NFL_reddit_sentiment_analysis.csv")


In [None]:
# # 4. Map labels to scores: +1 for Positive, 0 Neutral, -1 Negative
# score_map = {"Positive": 1, "Negative": -1, "Neutral": 0}
# df_hf["sentiment_score"] = df_hf["sentiment"].map(score_map)

# # Correct groupby & aggregation using list of columns
# player_sentiment = (
#     df_hf.groupby("player")[["sentiment", "sentiment_score"]]
#       .agg(
#         sentiment_count=("sentiment", "count"),
#         positive_count=("sentiment", lambda x: (x=="Positive").sum()),
#         negative_count=("sentiment", lambda x: (x=="Negative").sum()),
#         neutral_count=("sentiment", lambda x: (x=="Neutral").sum()),
#         overall_sentiment_score=("sentiment_score", "mean")
#       )
# )

# # Calculate percent positive
# player_sentiment["percent_positive"] = (
#     player_sentiment["positive_count"] / player_sentiment["sentiment_count"] * 100
# )

# # Save and inspect
# player_sentiment.to_csv("/content/player_sentiment_results.csv")
# player_sentiment.head(20)

In [None]:
# print(set(labels))
# # e.g., {'Positive', 'Negative', 'neutral'}


In [None]:
# from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# tokenizer = AutoTokenizer.from_pretrained(MODEL)
# model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# sentiment_pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, batch_size=16)

# results = sentiment_pipe(df_hf["text"].astype(str).tolist())
# labels = [res["label"] for res in results]

# # Normalize casing to lowercase
# labels = [label.lower() for label in labels]  # e.g., 'negative', 'neutral', 'positive'

# score_map = {"negative": -1, "neutral": 0, "positive": 1}
# scores = [score_map[label] for label in labels]

# df_hf["sentiment"] = labels
# df_hf["sentiment_score"] = scores



In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Load 3‑class (negative / neutral / positive) sentiment model.
# NOTE: this rebinds the notebook-level `tokenizer` and `sentiment_pipe` names;
# cells that read those globals will use this checkpoint once this cell has run.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    batch_size=16
)

# Run sentiment inference.
# truncation/max_length clip long posts to 512 tokens; without them, any post
# longer than the model's position limit makes the forward pass fail.
# max_length is given explicitly so the clip does not depend on the
# tokenizer's configured model_max_length.
results = sentiment_pipe(
    df_hf["text"].astype(str).tolist(),
    truncation=True,
    max_length=512,
)

# Normalize casing and map to scores
labels = [res["label"].lower() for res in results]  # 'negative', 'neutral', 'positive'
score_map = {"negative": -1, "neutral": 0, "positive": 1}
# Direct indexing is deliberate: an unexpected label raises KeyError instead of
# being silently scored.
scores = [score_map[label] for label in labels]

# Assign back to dataframe (adds/overwrites two columns on df_hf in place)
df_hf["sentiment"] = labels
df_hf["sentiment_score"] = scores


In [None]:
# player_sentiment = df_hf.groupby("player")["sentiment", "sentiment_score"] \
#     .agg(sentiment_count=("sentiment", "count"),
#          positive_count=("sentiment", lambda x: (x=="POSITIVE").sum()),
#          neutral_count=("sentiment", lambda x: (x=="NEUTRAL").sum()),
#          negative_count=("sentiment", lambda x: (x=="NEGATIVE").sum()),
#          overall_sentiment_score=("sentiment_score", "mean"))

# Per-player summary: how many posts, how many of each sentiment class,
# and the mean numeric sentiment score.
player_sentiment = df_hf.groupby("player")[["sentiment", "sentiment_score"]].agg(
    sentiment_count=pd.NamedAgg(column="sentiment", aggfunc="count"),
    positive_count=pd.NamedAgg(
        column="sentiment", aggfunc=lambda col: col.eq("positive").sum()
    ),
    negative_count=pd.NamedAgg(
        column="sentiment", aggfunc=lambda col: col.eq("negative").sum()
    ),
    neutral_count=pd.NamedAgg(
        column="sentiment", aggfunc=lambda col: col.eq("neutral").sum()
    ),
    overall_sentiment_score=pd.NamedAgg(column="sentiment_score", aggfunc="mean"),
)

# Share of a player's posts that were classified positive, in percent.
player_sentiment["percent_positive"] = (
    player_sentiment["positive_count"]
    .div(player_sentiment["sentiment_count"])
    .mul(100)
)

# Persist the table (the player name is written as the index column).
player_sentiment.to_csv("/content/player_sentiment_results_3class.csv")



In [None]:
# Preview the first 20 rows of the per-player sentiment table.
player_sentiment.head(20)

In [None]:
# Requires `player_sentiment` from the aggregation cell, with columns:
#   sentiment_count, positive_count, negative_count, neutral_count,
#   overall_sentiment_score, percent_positive

# Rank players from highest to lowest mean sentiment and keep the first 25.
top_25_players = (
    player_sentiment
    .sort_values(by="overall_sentiment_score", ascending=False)
    .iloc[:25]
)

# Plain-text output
print("Top 25 Players Based on Overall Sentiment Score")
print(top_25_players)

# Rich (HTML table) output for Jupyter / Colab
import IPython.display as disp
disp.display(top_25_players)
