In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
scraped_data = pd.read_csv("data/filled_10k.csv")

In [3]:
missing_tweets_percentage = scraped_data['tweet_text'].isna().mean() * 100

offensive_missing_percentage = scraped_data[scraped_data['offense'] == 1]['tweet_text'].isna().mean() * 100

non_offensive_missing_percentage = scraped_data[scraped_data['offense'] == 0]['tweet_text'].isna().mean() * 100

print(f"Percentage of missing tweets: {missing_tweets_percentage:.2f}%")
print(f"Percentage of missing offensive tweets: {offensive_missing_percentage:.2f}%")
print(f"Percentage of missing non-offensive tweets: {non_offensive_missing_percentage:.2f}%")

Percentage of missing tweets: 14.88%
Percentage of missing offensive tweets: 16.70%
Percentage of missing non-offensive tweets: 14.07%


## Preprocess Data

In [4]:
def mask_usernames(tweet):
    text = re.sub('@[^\s]+', '@user', tweet)
    return text.strip()

In [5]:
clean_data = scraped_data[~scraped_data["tweet_text"].isna()]
clean_data.loc[:,"tweet_text"] = clean_data.loc[:,"tweet_text"].apply(mask_usernames)

# clean_data["tweet_text"].values.tolist() # Sanity Check

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Split Data
stratified by 'offense'

In [6]:
train_df, temp_df = train_test_split(clean_data, test_size=0.2, random_state=14, stratify=clean_data["offense"])

In [7]:
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=14, stratify=temp_df["offense"])

In [8]:
len_train = len(train_df)
len_test = len(test_df)
len_val = len(val_df)

(len_train, len_test, len_val)

(6809, 851, 852)

In [9]:
# Counts for each class
print(train_df["offense"].value_counts().to_dict())
print(val_df["offense"].value_counts().to_dict())
print(test_df["offense"].value_counts().to_dict())

{0: 4762, 1: 2047}
{0: 596, 1: 256}
{0: 595, 1: 256}


## Export

In [10]:
train_df.to_csv("data/splits/train.csv", index=False)
test_df.to_csv("data/splits/test.csv", index=False)
val_df.to_csv("data/splits/val.csv", index=False)