In [18]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [19]:

# 📌 Load
# Read the raw comment dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../Data/raw_dataset.csv')
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
df.info()


   id  user_id             comment                        date  platform  \
0   1      328  You are so stupid!  2025-05-17 16:56:30.613829   Twitter   
1   2      380   Nobody likes you!  2025-02-23 16:56:30.613829  Facebook   
2   3      115       Just shut up!  2025-05-03 16:56:30.613829   Twitter   
3   4      347   You're worthless!  2024-10-07 16:56:30.613829   Twitter   
4   5      303       Such a loser!  2024-12-09 16:56:30.613829   Twitter   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3000 non-null   int64 
 1   user_id   3000 non-null   int64 
 2   comment   3000 non-null   object
 3   date      3000 non-null   object
 4   platform  3000 non-null   object
 5   label     3000 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 140.8+ KB
None


In [20]:
# 📌 Drop NULLs + Duplicates, then normalise platform names
# Rows without a comment or a label are discarded, only the first row for each
# distinct raw comment string is kept, and platform names are lower-cased and
# stripped of surrounding whitespace so spelling variants collapse together.
# NOTE(review): the saved outputs show 3000 rows loaded but only 16 reaching the
# split; presumably the de-duplication accounts for that -- confirm it is intended.
df = (
    df.dropna(subset=['comment', 'label'])
      .drop_duplicates(subset=['comment'])
      .assign(platform=lambda frame: frame['platform'].str.lower().str.strip())
)


In [21]:
# 📌 Clean text
# Patterns used by clean_text, compiled once rather than looked up on every call.
# URLs must start with an explicit scheme or a "www." host; a bare "www"/"http"
# substring is not enough (otherwise "awwww" would be truncated to "a").
_URL_RE = re.compile(r"https?://\S+|\bwww\.\S+")
_APOSTROPHE_RE = re.compile(r"['’`]")
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise a raw comment into lower-case, space-separated ASCII words.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs (``http://``, ``https://`` or ``www.`` prefixed tokens);
      3. delete apostrophes so contractions stay one token ("you're" -> "youre");
      4. turn every other character that is not ``a-z`` or whitespace into a
         space, so words separated only by punctuation or digits are not glued
         together ("stupid...idiot" -> "stupid idiot");
      5. collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    text : object
        The raw comment. Anything that is not a ``str`` (``None``, NaN,
        numbers, ...) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; may be the empty string.
    """
    # Missing values from pandas arrive as None/float NaN; do not crash on them.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _URL_RE.sub(" ", text)
    text = _APOSTROPHE_RE.sub("", text)
    text = _NON_LETTER_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Run every raw comment through clean_text and store the result in a new column.
df['clean_comment'] = df['comment'].map(clean_text)


In [22]:
# 📌 Length outliers
# Inclusive bounds on the cleaned-comment length in characters. They keep
# exactly the rows that the former `> 3` and `< 500` filters kept.
MIN_COMMENT_LENGTH = 4
MAX_COMMENT_LENGTH = 499

df = df.assign(comment_length=df['clean_comment'].str.len())
df = df[df['comment_length'].between(MIN_COMMENT_LENGTH, MAX_COMMENT_LENGTH)]

# Comments that differed only in case, punctuation or URLs become identical
# once cleaned. Keep the first of each so the same text cannot end up in both
# the train and the test split.
df = df.drop_duplicates(subset=['clean_comment'])

# 📌 Encode platform
# Keep the fitted encoder: platform_encoder.classes_ holds the code -> platform
# mapping, which was lost when a throwaway LabelEncoder() instance was used.
platform_encoder = LabelEncoder()
df = df.assign(platform_encoded=platform_encoder.fit_transform(df['platform']))

# Only the model inputs and the target are carried forward.
df_final = df[['clean_comment', 'platform_encoded', 'label']]



In [24]:
# 📌 Split
# Features: cleaned text plus the integer platform code. Target: the label column.
X = df_final.loc[:, ['clean_comment', 'platform_encoded']]
y = df_final.loc[:, 'label']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Write each partition to its own CSV, leaving the index out of the files.
for _part, _dest in (
    (X_train, '../Data/X_train.csv'),
    (X_test, '../Data/X_test.csv'),
    (y_train, '../Data/y_train.csv'),
    (y_test, '../Data/y_test.csv'),
):
    _part.to_csv(_dest, index=False)

print("✅ Preprocessing complete & saved.")
print("Train:", X_train.shape, "Test:", X_test.shape)

✅ Preprocessing complete & saved.
Train: (12, 2) Test: (4, 2)
