# Twitter Sentiment Classification: Positive vs. Negative

In [30]:
import pandas as pd

df_train = pd.read_csv('../data/twitter_sentiment_train.csv')
df_test  = pd.read_csv('../data/twitter_sentiment_test.csv')

# Shuffle train set
RANDOM_STATE = 123
df_train = df_train.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

In [31]:
int_to_label = {1: 'Positive', 0: 'Negative'}

In [32]:
df_train.head(5)

Unnamed: 0,text,label
0,Batman the #killing joke 1st printing nm 9.4 #...,1
1,the sun is so stupid they know how much the bo...,0
2,@user trying to do the same ha. But I've been ...,1
3,"""Prince George turns 2: His best style moments...",1
4,Hulk Hogan hit rock bottom Friday when he was ...,0


### Import libraries

In [33]:
import re
import string
from tqdm import tqdm
from textblob import TextBlob
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from pre_processing import *

In [34]:
with open('../data/profanity.txt', 'r') as f: 
    profanity_words = f.readlines()
profanity_words = [s.strip() for s in profanity_words]

In [869]:
punct_list = ['!','#','@']

### Feature Extraction Checklist

1. Profanity words count
2. Sentiment and Subjectivity 
3. Emoji Sentiment + Emoticon e.g :), ðŸ˜‚, :((
3. Fully Capitalized
4. Punctuations

In [37]:
def count_all_capital_tokens(text: str) -> dict:
    """
    Counts the number of fully capitalized tokens (all letters uppercase) in a given text.
    Returns: {'all_capital_token_count': count}
    """
    matches = re.findall(r'\b[A-Z][A-Z]+\b', text)
    return {'all_capital_token_count': len(matches)}

def count_specified_punctuations(text: str, punct_list: list) -> dict:
    """
    Counts the occurrences of each punctuation mark in a given text.
    Returns: {'punctuation_char1': count1, 'punctuation_char2': count2, ...}
    """
    punct_occur = {}
    for char in punct_list:
        punct_occur[char] = 0
    for char in text:
        if char in punct_list:
            punct_occur[char] += 1
    return punct_occur

def count_profanity_words(text: str, profanity_list: list) -> dict:
    """
    Counts the number of profanity words in a given text using a predefined list.
    Returns: {'profanity_word_count': count}
    """
    count = 0
    # Normalize both input and the word list using the to_lower() function
    profanity_list = [s.lower() for s in profanity_list]
    tokenized_sent = run_pipeline(text, [word_tokenize_sentence, to_lower])
    for sent in tokenized_sent:
        for token in sent:
            if token in profanity_list:
                count += 1
    return {'profanity_word_count': count}

def count_sad_emoticons(text: str):
    """
    Returns the occurrences of sad emoticons.
    Returns: {
      "sad_emoticon": count
    }
    """
    # Sad, crying, angry, and negative emoticons
    matches = re.findall(r':\(|:\||:\/|:\\|:\'\(|>:\(|D:|:<|:c|;\(|T_T|T\.T', text)
    return {"sad_emoticon": len(matches)}

def count_happy_emoticons(text: str):
    """
    Returns the occurrences of happy emoticons.
    Returns: {
      "happy_emoticon": count
    }
    """
    # Happy, excited, laughing, and positive emoticons
    matches = re.findall(r':\)|:D|;D|=\)|;-\)|:\}\)|:>|=\]|8\)|;-D|XD|xD|x-D|X-D|<3|:\*|;-\*|;\)|=D', text)
    return {"happy_emoticon": len(matches)}

def count_not(text: str):
    matches = re.findall(r'dnt|ont|not', text)
    return {'not_count': len(matches)}

def count_elongated_words(text):
    matches = re.findall(r'\b\w*(\w)\1{2,}\w*\b', text)
    return {'elongated_word_count': len(matches)}

def count_positive_words(text):
    positive_words = ['good', 'happy', 'love', 'great', 'excellent']
    tokens = str(text).lower().split()
    return {'positive_word_count': sum(1 for t in tokens if t in positive_words)}

def count_negative_words(text):
    negative_words = ['bad', 'sad', 'hate', 'terrible', 'awful']
    tokens = str(text).lower().split()
    return {'negative_word_count': sum(1 for t in tokens if t in negative_words)}

def uppercase_ratio(text):
    total_letters = sum(1 for c in text if c.isalpha())
    return {'uppercase_ratio': sum(1 for c in text if c.isupper()) / total_letters} if total_letters else {'uppercase_ratio': 0}

import json
from textblob import TextBlob
import emoji

# Load emoji JSON
with open("../data/emoji_polarity.json", "r", encoding="utf-8") as f:
    emoji_json = json.load(f)

# Load emoticon JSON
with open("../data/unicode_polarity.json", "r", encoding="utf-8") as f:
    emoticon_json = json.load(f)

# Merge both dictionaries
combined_sentiment = {**emoji_json, **emoticon_json}

def get_sentiment_and_subjectivity(text: str) -> dict:
    blob = TextBlob(text)
    pol = blob.sentiment.polarity
    subj = blob.sentiment.subjectivity
    
    tb_pos = pol if pol > 0 else 0
    tb_neg = abs(pol) if pol < 0 else 0

    # Find all emojis and emoticons in text
    items_in_text = [ch for ch in text if ch in emoji.EMOJI_DATA]  # emojis
    # emoticons (like :) ;D) â€” check by splitting text
    for em in combined_sentiment:
        if em in text and em not in items_in_text:
            items_in_text.append(em)

    if items_in_text:
        pos_list = [combined_sentiment[i]["positivity"] for i in items_in_text if i in combined_sentiment]
        neg_list = [combined_sentiment[i]["negativity"] for i in items_in_text if i in combined_sentiment]

        if pos_list:
            avg_pos = sum(pos_list) / len(pos_list)
            avg_neg = sum(neg_list) / len(neg_list)
            final_pos = (tb_pos + avg_pos) / 2
            final_neg = (tb_neg + avg_neg) / 2
        else:
            final_pos, final_neg = tb_pos, tb_neg
    else:
        final_pos, final_neg = tb_pos, tb_neg

    return {
        "positive_sentiment": final_pos,
        "negative_sentiment": final_neg,
        "subjectivity": subj
    }


In [None]:
def uncontract(text):
    text = re.sub(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not", text)
    text = re.sub(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will", text)
    text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
    text = re.sub(r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have", text)
    text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text)
    text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text)
    text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text)
    text = re.sub(r"(\b)([Ii]t)'s", r"\1\2 is", text)
    text = re.sub(r"(\b)([Tt]here)'s", r"\1\2 is", text)
    text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text)
    text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text)
    text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
    return text

def convert_urls_emails(text):
    url_regex_1 = r'^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$'
    url_regex_2 = r'^[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$'
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    
    text = re.sub(url_regex_1, 'URL', text)
    text = re.sub(url_regex_2, 'URL', text)
    text = re.sub(email_regex, 'EMAIL', text)
    return text

def clean_unicode(text: str):
    """
    Replaces common unicode characters with ASCII equivalents.
    Useful for tweet preprocessing.
    """
    text = re.sub(r'\\u2019', "'", text)
    text = re.sub(r'\\u201c', '"', text)
    text = re.sub(r'\\u201d', '"', text)
    text = re.sub(r'\\u002c', ',', text)
        
    return text

def remove_numbers(text: str):
    return re.sub(r'[0-9]','',text)

def preprocessing_text(text):
    # Word normalization
    text = clean_unicode(text)
    text = remove_numbers(text)
    text = uncontract(text)
    text = convert_urls_emails(text)
    # Classical preprocessing steps
    text = word_tokenize_sentence(text)
    text = to_lower(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text

In [None]:
feature_functions = [
    count_punctuation,
    lambda text: count_profanity_words(text, profanity_words),
    count_all_capital_tokens,
    count_sad_emoticons,
    count_happy_emoticons,
    get_sentiment_and_subjectivity
]

# Apply preprocessing to the datasets
clean_tokens_train = [preprocessing_text(t) for t in df_train['text']]
clean_tokens_test = [preprocessing_text(t) for t in df_test['text']]
clean_text_train = [' '.join(tokens) for tokens in clean_tokens_train]
clean_text_test = [' '.join(tokens) for tokens in clean_tokens_test]

def tfidf_features(training_data, test_data, ngram_range, max_features):

    tfidf = TfidfVectorizer(
        ngram_range  = ngram_range,
        max_features = max_features,
        lowercase    = False,
        tokenizer    = None,
        preprocessor = None,
        stop_words   = None,
        min_df       = 10,
        max_df       = 0.80
        )

    tfidf_train = tfidf.fit_transform(training_data)

    tfidf_train = tfidf_train.toarray()
    tfidf_train = pd.DataFrame(tfidf_train)
    tfidf_train.columns = tfidf.get_feature_names_out()

    tfidf_test = tfidf.transform(test_data)

    tfidf_test = tfidf_test.toarray()
    tfidf_test = pd.DataFrame(tfidf_test)
    tfidf_test.columns = tfidf.get_feature_names_out()

    return tfidf_train, tfidf_test, tfidf

tfidf_train, tfidf_test, vectorizer = tfidf_features(clean_text_train, clean_text_test, (1,2), 1000)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6/6 [00:05<00:00,  1.09it/s]


In [None]:
X_train.head(5)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7/7 [00:05<00:00,  1.26it/s]


Unnamed: 0,able,absolutely,ac,ac dc,act,actually,afternoon,agree,ahead,ai,...,#,@,profanity_word_count,all_capital_token_count,not_count,sad_emoticon,happy_emoticon,positive_sentiment,negative_sentiment,subjectivity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0,0,1,0,0,0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.367361,0.571528
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0.35,0.0,0.3625
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1.0,0.0,0.3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0.0,0.0,0.0


In [None]:
X_train_features = X_train.drop(columns=['text'])
X_test_features = X_test.drop(columns=['text'])

X_train = df_train[['text']].copy()
X_test = df_test[['text']].copy()

for func in tqdm(feature_functions):
    results = X_train['text'].apply(lambda x: func(str(x))).tolist()
    temp_df = pd.DataFrame(results)
    
    temp_df.reset_index(drop=True, inplace=True)
    X_train.reset_index(drop=True, inplace=True)
    X_train = pd.concat([X_train, temp_df], axis=1)
    
    results = X_test['text'].apply(lambda x: func(str(x))).tolist()
    temp_df = pd.DataFrame(results)
    
    temp_df.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    X_test = pd.concat([X_test, temp_df], axis=1)

# Drop the text column from custom features
X_train_custom = X_train.drop(columns=['text'])
X_test_custom = X_test.drop(columns=['text'])

# Concatenate TF-IDF + Custom Features
tfidf_train.reset_index(drop=True, inplace=True)
tfidf_test.reset_index(drop=True, inplace=True)
X_train_custom.reset_index(drop=True, inplace=True)
X_test_custom.reset_index(drop=True, inplace=True)

X_train_combined = pd.concat([tfidf_train, X_train_custom], axis=1)
X_test_combined = pd.concat([tfidf_test, X_test_custom], axis=1)

# Convert column names to strings before scaling
X_train_combined.columns = X_train_combined.columns.astype(str)
X_test_combined.columns = X_test_combined.columns.astype(str)

X_train_final = X_train_combined
X_test_final = X_test_combined

X_train_final.head(5)

# # Normalize all features together 
# scaler = StandardScaler()
# X_train_final = scaler.fit_transform(X_train_combined)
# X_test_final = scaler.transform(X_test_combined)

# print(X_train_final)

  y = column_or_1d(y, warn=True)


In [None]:
y_train = df_train['label']
y_test = df_test['label']

model = SGDClassifier(
    loss='log_loss',
    learning_rate='adaptive',
    max_iter=1000,
    eta0=0.01,
    random_state=RANDOM_STATE
)

model.fit(X_train_final, y_train)
y_pred = model.predict(X_test_final)

print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.85      0.82      0.83      3972
    positive       0.71      0.76      0.74      2375

    accuracy                           0.80      6347
   macro avg       0.78      0.79      0.78      6347
weighted avg       0.80      0.80      0.80      6347



In [None]:
import emoji
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def analyze_sentiment_no_neutral(text):
    text = emoji.demojize(text)
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    
    # remove 'neu' key
    filtered_scores = {k: v for k, v in scores.items() if k != 'neu'}
    return filtered_scores

# Example
text = "xD"
print(analyze_sentiment_no_neutral(text))


{'neg': 0.0, 'pos': 1.0, 'compound': 0.5859}
