# Twitter Sentiment Classification: Positive vs. Negative

In [1]:
import pandas as pd

# Load the training and test CSV files from the data directory one level up.
df_train = pd.read_csv('../data/twitter_sentiment_train.csv')
df_test  = pd.read_csv('../data/twitter_sentiment_test.csv')

In [2]:
# Human-readable names for the integer class codes
# (presumably the values of the `label` column — confirm against the data).
int_to_label = {1: 'Positive', 0: 'Negative'}

In [3]:
# Preview the first five rows of the training frame.
df_train.head(5)

Unnamed: 0,text,label
0,"""If Paul Dunne wins the Open tomorrow as an am...",1
1,Spreading the word about our newest twilight m...,1
2,Tom Brady playing on Thursday makes the nfl se...,1
3,@user kris bryant is the 3rd best defensive 3b...,1
4,"it may be beyonce's bday, but we must not forg...",1


### Import libraries

In [4]:
import html
import re
import string
from twokenize import twokenize
from textblob import TextBlob

# Monkey patch the broken function
def normalizeTextForTagger(text):
    """
    Decode HTML entities in `text` and return the result.

    "&amp;" is collapsed to "&" before html.unescape runs, so a
    double-escaped entity such as "&amp;lt;" ends up fully decoded ("<").
    """
    # Collapse escaped ampersands first, then let the stdlib decode the rest.
    return html.unescape(text.replace("&amp;", "&"))

# Rebind the attribute on the imported `twokenize` object so that lookups of
# `twokenize.normalizeTextForTagger` resolve to the replacement defined above.
twokenize.normalizeTextForTagger = normalizeTextForTagger
from pre_processing import *

### Feature Extraction Checklist

1. Profanity word count
2. Sentiment and subjectivity
3. Emoji sentiment + emoticons, e.g. :), 😂, :((
4. Fully capitalized tokens
5. Punctuation counts

In [None]:
def count_all_capital_tokens(text: str) -> dict:
    """
    Count the tokens in `text` that are written entirely in capital letters.

    A token counts when it is a run of two or more ASCII letters A-Z
    delimited by word boundaries, so a lone capital such as "I" is ignored.
    Returns: {'all_capital_token_count': count}
    """
    uppercase_run = re.compile(r'\b[A-Z][A-Z]+\b')
    total = sum(1 for _ in uppercase_run.finditer(text))
    return {'all_capital_token_count': total}

def count_punctuation(text: str) -> dict:
    """
    Tally how often each ASCII punctuation mark appears in `text`.

    Every character of string.punctuation is present as a key, with a
    count of 0 when it does not occur; the keys are the characters themselves.
    Returns: a dict mapping each punctuation character to its count,
    e.g. {'!': 2, '?': 1, ...}
    """
    # Seed every punctuation character with zero so the key set is fixed.
    tally = dict.fromkeys(string.punctuation, 0)
    for symbol in text:
        if symbol in tally:
            tally[symbol] += 1
    return tally

def count_profanity_words(text: str, profanity_list: list) -> dict:
    """
    Counts the tokens of a given text that appear in a predefined profanity list.

    The text is passed through run_pipeline with the word_tokenize_sentence and
    to_lower steps; the entries of profanity_list are lower-cased with str.lower
    before comparison.
    Returns: {'profanity_word_count': count}
    """
    # Lower-case the word list once and keep it in a set, so each token check
    # is a hash lookup instead of a scan over the whole list.
    profanity_set = {word.lower() for word in profanity_list}
    # NOTE(review): assumes run_pipeline yields sentences whose tokens are
    # strings (hashable) — confirm against pre_processing.
    tokenized_sent = run_pipeline(text, [word_tokenize_sentence, to_lower])
    count = 0
    for sent in tokenized_sent:
        for token in sent:
            if token in profanity_set:
                count += 1
    return {'profanity_word_count': count}

# Caveat: TextBlob does not work on emojis.
def get_sentiment_and_subjectivity(text: str) -> dict:
    """
    Score `text` with TextBlob and split its polarity into two non-negative features.
    Returns: {
      "positive_sentiment": polarity when it is above 0, otherwise 0,
      "negative_sentiment": absolute polarity when it is below 0, otherwise 0,
      "subjectivity": TextBlob's subjectivity score
    }
    """
    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity

    # Start from the neutral case; at most one polarity slot is overwritten.
    scores = {
        "positive_sentiment": 0,
        "negative_sentiment": 0,
        "subjectivity": sentiment.subjectivity,
    }
    if polarity > 0:
        scores["positive_sentiment"] = polarity
    elif polarity < 0:
        scores["negative_sentiment"] = abs(polarity)
    return scores

In [21]:
# NOTE(review): the literal below looks like a mis-decoded emoji (UTF-8 bytes
# read as a single-byte encoding) — confirm the intended character.
text = "ðŸ˜‚"
print(get_sentiment_and_subjectivity(text))

{'positive_sentiment': 0, 'negative_sentiment': 0, 'subjectivity': 0.0}
