# Twitter Sentiment Analysis
## Dataset Preprocessing

### Import Libraries

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import spacy

### Load datasets

In [2]:
# Locations of the raw CSV files
train_df_path = "datasets/twitter_training.csv"
val_df_path = "datasets/twitter_validation.csv"

# header=None makes pandas read the first row as data, so the column
# names are supplied explicitly
columns = ['Tweet_ID', 'Entity', 'Sentiment', 'Tweet_Content']
train_df, val_df = (
    pd.read_csv(csv_path, header=None, names=columns)
    for csv_path in (train_df_path, val_df_path)
)

# Report the size of each dataset as loaded
print(f"Original shape of training dataset: {train_df.shape}")
print(f"Original shape of validation dataset: {val_df.shape}")

Original shape of training dataset: (74682, 4)
Original shape of validation dataset: (1000, 4)


### Clean and Preprocess data

In [3]:
# For BOTH the training and the validation dataset:
#   1. drop rows that have no content in 'Tweet_Content'
#   2. fold the 'Irrelevant' sentiment into 'Neutral'
# The column is rewritten with .assign() so every step yields a new frame
# instead of assigning into the frame returned by dropna().
train_df = (
    train_df
    .dropna(subset=['Tweet_Content'])
    .assign(Sentiment=lambda frame: frame['Sentiment'].replace('Irrelevant', 'Neutral'))
)
val_df = (
    val_df
    .dropna(subset=['Tweet_Content'])
    .assign(Sentiment=lambda frame: frame['Sentiment'].replace('Irrelevant', 'Neutral'))
)

# Print dataset shapes after dropping null rows and merging 'Irrelevant' with 'Neutral'
print(f"Training dataset - Drop NULL and Merge: {train_df.shape}")
print(f"Validation dataset - Drop NULL and Merge: {val_df.shape}")

Training dataset - Drop NULL and Merge: (73996, 4)
Validation dataset - Drop NULL and Merge: (1000, 4)


In [4]:
# De-duplicate both datasets: two rows count as duplicates when they share
# the same tweet text, entity and sentiment (Tweet_ID is not compared)
train_df, val_df = (
    frame.drop_duplicates(subset=['Tweet_Content', 'Entity', 'Sentiment'])
    for frame in (train_df, val_df)
)

# Report the remaining row counts
print(f"Training dataset - Drop duplicates: {train_df.shape}")
print(f"Validation dataset - Drop duplicates: {val_df.shape}")

Training dataset - Drop duplicates: (70887, 4)
Validation dataset - Drop duplicates: (1000, 4)


### Setup SpaCy

In [5]:
# Load the en_core_web_sm pipeline with the dependency parser and the
# named-entity recognizer switched off; the cleaning step below only reads
# token flags and lemmas
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [8]:
# Negation words are flagged as stop words but flip the sentiment of a
# sentence, so they are exempt from stop-word removal.
NEGATION_WORDS = frozenset({'no', 'not', 'never', 'neither', 'nor'})

# Contracted negation: spaCy's English tokenizer splits e.g. "don't" into
# "do" + "n't" (straight or curly apostrophe), and that suffix is a stop word.
NEGATION_CONTRACTIONS = frozenset({"n't", "n’t"})


def clean_text_spacy(doc):
    """Reduce a processed document to a space-joined string of lowercase lemmas.

    Args:
        doc: An iterable of spaCy-like tokens exposing ``text``, ``lemma_``,
            ``is_stop``, ``is_punct``, ``like_url``, ``like_email`` and
            ``is_space``.

    Returns:
        str: The kept tokens' lemmas, lowercased and joined by single spaces.
        Punctuation, URLs, e-mail addresses, whitespace tokens and stop words
        are dropped. Negation words are kept even though they are stop words,
        and the contracted form "n't" is kept as "not" so that "don't like"
        does not collapse to "like". Returns "" when nothing is kept.
    """
    tokens = []
    for token in doc:
        # Noise tokens never contribute to the cleaned text
        if token.is_punct or token.like_url or token.like_email or token.is_space:
            continue

        lowered = token.text.lower()

        # Contracted negation is normalised to the full word
        if lowered in NEGATION_CONTRACTIONS:
            tokens.append('not')
            continue

        # Remaining stop words are dropped unless they are negation words
        if token.is_stop and lowered not in NEGATION_WORDS:
            continue

        tokens.append(token.lemma_.lower())
    return " ".join(tokens)

In [9]:
def process_batch(df, text_col):
    """Clean every entry of ``df[text_col]`` and return the results as a list.

    The column is cast to ``str`` and streamed through the module-level
    ``nlp`` pipeline in batches of 1000. Each document the pipeline yields is
    passed to ``clean_text_spacy``, and the cleaned strings are collected in
    the order they are yielded.
    """
    raw_texts = df[text_col].astype(str).tolist()
    docs = nlp.pipe(raw_texts, batch_size=1000)
    return [clean_text_spacy(doc) for doc in docs]

In [11]:
# Add the cleaned text as a 'clean_text' column on both datasets.
# .assign() returns a new frame, so nothing is written into the frames
# produced by the earlier filtering steps.
print("Cleaning training data...")
train_df = train_df.assign(clean_text=process_batch(train_df, 'Tweet_Content'))

print("Cleaning validation data...")
val_df = val_df.assign(clean_text=process_batch(val_df, 'Tweet_Content'))

Cleaning training data...
Cleaning validation data...


### Label Encoding

In [18]:
# Encode the sentiment labels as integers. The resulting mapping is
# Negative -> 0, Neutral -> 1, Positive -> 2 (see the printed mapping below).
le = LabelEncoder()

# Fit on training data to learn the mapping
train_df = train_df.assign(label_encoded=le.fit_transform(train_df['Sentiment']))

# Transform validation data using the same mapping
val_df = val_df.assign(label_encoded=le.transform(val_df['Sentiment']))

# Cast to built-in str/int so the printed mapping shows plain values
# rather than NumPy scalar reprs such as np.int64(0)
label_mapping = {
    str(label): int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(f"Label Mapping: {label_mapping}")

Label Mapping: {'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


### Split training data into Train/Test

In [22]:
# Hold out 20% of the training rows as a test set. Stratifying on
# 'Sentiment' is intended to keep class proportions alike in both parts,
# and random_state pins the shuffle.
# NOTE(review): the split is row-level — if several rows can share a
# Tweet_ID, confirm that related tweets landing on both sides is acceptable.
train_split, test_split = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df['Sentiment']
)

# Write each preprocessed frame to CSV without the index column.
# NOTE(review): a row whose clean_text is an empty string is read back as a
# missing value by pandas.read_csv defaults — confirm downstream loading
# handles that.
for csv_path, frame in (
    ('datasets/train_preprocessed.csv', train_split),
    ('datasets/test_preprocessed.csv', test_split),
    ('datasets/validation_preprocessed.csv', val_df),
):
    frame.to_csv(csv_path, index=False)

# Summarise what was written
print("Processing complete. Files saved.\n")
print(f"train_preprocessed.csv = {train_split.shape}")
print(f"test_preprocessed.csv = {test_split.shape}")
print(f"validation_preprocessed.csv = {val_df.shape}")


Processing complete. Files saved.

train_preprocessed.csv = (56709, 6)
test_preprocessed.csv = (14178, 6)
validation_preprocessed.csv = (1000, 6)
