# Twitter Sentiment Classification: Positive vs. Negative

In [91]:
import pandas as pd

# Read the training and test splits from their CSV files (relative paths).
train_csv_path = '../data/twitter_sentiment_train.csv'
test_csv_path = '../data/twitter_sentiment_test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [92]:
# Display names for the integer class ids (1 = positive, 0 = negative).
int_to_label = {
    1: 'Positive',
    0: 'Negative',
}

In [93]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,text,label
0,"""If Paul Dunne wins the Open tomorrow as an am...",1
1,Spreading the word about our newest twilight m...,1
2,Tom Brady playing on Thursday makes the nfl se...,1
3,@user kris bryant is the 3rd best defensive 3b...,1
4,"it may be beyonce's bday, but we must not forg...",1


### Import libraries

In [94]:
import re
import string
from textblob import TextBlob
from sklearn.linear_model import SGDClassifier
from tqdm import tqdm
from sklearn.metrics import classification_report

from pre_processing import *

In [95]:
# Read the profanity lexicon (one entry per line) and trim the surrounding
# whitespace, including the trailing newline, from every entry.
with open('../data/profanity.txt', 'r') as lexicon_file:
    profanity_words = [line.strip() for line in lexicon_file]

### Feature Extraction Checklist

1. Profanity words count
2. Sentiment and Subjectivity 
3. Emoji Sentiment + Emoticon, e.g. :), 😂, :((
4. Fully Capitalized
5. Punctuations

In [96]:
def count_all_capital_tokens(text: str) -> dict:
    """
    Count the words in `text` that are written entirely in capital letters.

    A word is counted only when it is a run of two or more ASCII uppercase
    letters delimited by word boundaries, so a lone "I" or "A" is not counted
    and neither is a token that mixes capitals with digits (e.g. "ABC123").

    Returns: {'all_capital_token_count': count}
    """
    shouted_total = sum(1 for _ in re.finditer(r'\b[A-Z][A-Z]+\b', text))
    return {'all_capital_token_count': shouted_total}

def count_punctuation(text: str) -> dict:
    """
    Tally how often each ASCII punctuation character appears in `text`.

    Every character of `string.punctuation` is present as a key (the bare
    character itself, in `string.punctuation` order); characters that never
    occur in the text map to 0.

    Returns: {'!': count, '"': count, '#': count, ...}
    """
    return {mark: text.count(mark) for mark in string.punctuation}

def count_profanity_words(text: str, profanity_list: list) -> dict:
    """
    Counts the tokens of a given text that appear in a predefined profanity list.

    Matching is done on lower-cased list entries; the text is passed through
    the `word_tokenize_sentence` and `to_lower` pipeline steps before the
    comparison (presumably sentence/word tokenisation followed by
    lower-casing -- confirm against `pre_processing`). Every occurrence is
    counted, so a word that appears twice contributes 2.

    Args:
        text: Raw text to scan.
        profanity_list: Profanity words, in any casing.

    Returns: {'profanity_word_count': count}
    """
    # A set makes each membership test constant-time; this function is called
    # once per row, so a linear scan of the whole list per token adds up.
    profanity_lookup = {word.lower() for word in profanity_list}

    # Iterated below as sentences, each of which is an iterable of tokens.
    tokenized_sent = run_pipeline(text, [word_tokenize_sentence, to_lower])

    count = sum(
        1
        for sent in tokenized_sent
        for token in sent
        if token in profanity_lookup
    )
    return {'profanity_word_count': count}

# NOTE: TextBlob does not work on emojis.
def get_sentiment_and_subjectivity(text: str) -> dict:
    """
    Score a text with TextBlob and split its polarity into two non-negative features.

    The signed polarity is separated by sign: a positive value is reported
    under "positive_sentiment", the magnitude of a negative value under
    "negative_sentiment", and whichever side does not apply is 0.

    Returns: {
      "positive_sentiment": polarity if > 0, else 0,
      "negative_sentiment": |polarity| if < 0, else 0,
      "subjectivity": score
    }
    """
    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity

    positive_part = 0
    negative_part = 0
    if polarity > 0:
        positive_part = polarity
    elif polarity < 0:
        negative_part = abs(polarity)

    return {
        "positive_sentiment": positive_part,
        "negative_sentiment": negative_part,
        "subjectivity": sentiment.subjectivity,
    }

In [97]:
def preprocessing_text(text):
    """
    Run the classical preprocessing chain over a raw tweet.

    The steps are applied in order, each receiving the previous step's
    output: tweet tokenisation (`twokenize.tokenizeRawTweetText`),
    `to_lower`, `remove_stopwords`, then `lemmatize`. The helpers come from
    the `pre_processing` module; the exact return type is whatever
    `lemmatize` produces (presumably a list of tokens -- confirm there).
    """
    steps = (
        twokenize.tokenizeRawTweetText,
        to_lower,
        remove_stopwords,
        lemmatize,
    )
    processed = text
    for step in steps:
        processed = step(processed)
    return processed

In [98]:
# Feature extractors: each takes one text and returns a dict of
# {feature_name: value}; the dict keys become DataFrame columns.
feature_functions = [
    count_punctuation,
    lambda text: count_profanity_words(text, profanity_words),
    count_all_capital_tokens,
    get_sentiment_and_subjectivity
]


def _apply_feature(texts: pd.Series, func) -> pd.DataFrame:
    """Run one extractor over every text and return its outputs as columns.

    Values are coerced with `str()` first so non-string cells (e.g. NaN) do
    not break the extractors. The result has a fresh 0..n-1 index.
    """
    return pd.DataFrame(texts.apply(lambda x: func(str(x))).tolist())


# Start from the input columns without the target; a fresh 0..n-1 index keeps
# the rows aligned with the per-extractor frames when concatenating by column.
X_train_base = df_train.drop(columns='label').reset_index(drop=True)
X_test_base = df_test.drop(columns='label').reset_index(drop=True)

# Collect the pieces and concatenate once per split instead of re-building
# the frame on every loop iteration.
train_parts = [X_train_base]
test_parts = [X_test_base]
for func in tqdm(feature_functions):
    train_parts.append(_apply_feature(X_train_base['text'], func))
    test_parts.append(_apply_feature(X_test_base['text'], func))

X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)

100%|██████████| 4/4 [00:09<00:00,  2.47s/it]


In [99]:
# Preview the first five rows of the engineered training features.
X_train.head(n=5)

Unnamed: 0,text,!,"""",#,$,%,&,',(,),...,`,{,|,},~,profanity_word_count,all_capital_token_count,positive_sentiment,negative_sentiment,subjectivity
0,"""If Paul Dunne wins the Open tomorrow as an am...",0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.25,0.0,0.46
1,Spreading the word about our newest twilight m...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.875,0.0,0.9
2,Tom Brady playing on Thursday makes the nfl se...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.366667,0.0,0.466667
3,@user kris bryant is the 3rd best defensive 3b...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.5,0.0,0.15
4,"it may be beyonce's bday, but we must not forg...",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [100]:
# Model inputs: every engineered column; the raw text is dropped because it
# is not a numeric feature.
X_train_features = X_train.drop(columns=['text'])
X_test_features = X_test.drop(columns=['text'])

# Select the target as a 1-D Series. Building it with drop(columns=['text'])
# yields a single-column DataFrame, which makes scikit-learn emit a
# DataConversionWarning (column_or_1d) and would silently pick up any extra
# column added to the CSV.
y_train = df_train['label']
y_test = df_test['label']

# Logistic regression trained with SGD; a fixed learning rate and seed keep
# the run reproducible.
model = SGDClassifier(
    loss='log_loss',
    learning_rate='constant',
    eta0=0.01,
    random_state=123
)

model.fit(X_train_features, y_train)
y_pred = model.predict(X_test_features)

  y = column_or_1d(y, warn=True)


In [102]:
# classification_report returns plain text, so print() is the right display.
# target_names follow the sorted label order: 0 -> negative, 1 -> positive.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['negative', 'positive'],
)
print(report_text)

              precision    recall  f1-score   support

    negative       0.81      0.69      0.75      3972
    positive       0.59      0.72      0.65      2375

    accuracy                           0.71      6347
   macro avg       0.70      0.71      0.70      6347
weighted avg       0.73      0.71      0.71      6347

