In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the two previously cleaned CSV splits from the working directory.
train_df = pd.read_csv("train_clean.csv")
val_df = pd.read_csv("val_clean.csv")

# Stack both splits into one frame with a fresh 0..n-1 index.
combined_df = pd.concat((train_df, val_df), ignore_index=True)

# Columns to remove when present (order is kept because the list is printed).
columns_to_drop = [
    "bisexual",
    "heterosexual",
    "homosexual_gay_or_lesbian",
    "intellectual_or_learning_disability",
    "other_disability",
    "other_sexual_orientation",
    "physical_disability",
    "psychiatric_or_mental_illness",
    "publication_id",
    "parent_id",
    "article_id",
    "rating",
    "funny",
    "wow",
    "sad",
    "likes",
    "disagree",
    "worker_x",
    "worker_y",
    "sexual_orientation",
    "disability",
]

# Only ask pandas to drop the names that actually exist, so a missing column
# never raises a KeyError.
existing_to_drop = [name for name in columns_to_drop if name in combined_df.columns]
combined_df = combined_df.drop(columns=existing_to_drop)

# Report which columns were really removed.
print(f"🗑️ Dropped columns: {existing_to_drop}")

🗑️ Dropped columns: ['bisexual', 'heterosexual', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'other_disability', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'worker_x', 'worker_y', 'sexual_orientation', 'disability']


In [16]:
# Plain-text preview of the first ten rows after the column drop.
print(combined_df.head(n=10))

        id    target                                       comment_text  \
0  5193512  0.166667  People would rather waste money on parks. The ...   
1  5089113  0.000000  When JR opens her mouth her face gets contorte...   
2  5854622  0.000000  The more the antifa show up and cause chaos at...   
3  6240221  0.166667  400 years of colonialism?\n\nMy daughter's eat...   
4  5171466  0.000000  The Globe knows the fate of Charlie  Hebdo is ...   
5  5907740  0.000000  So you think justice is punishing people for t...   
6   461324  0.000000  Frankly, I couldn't care less who North Korea ...   
7   787161  0.000000  What has your personal issue about an inherita...   
8   536860  0.000000  I think you need to smoke a bowl and chill. Pe...   
9  5748705  0.200000  DLNR is too incompetent to manage harbor impro...   

   severe_toxicity  obscene_x  identity_attack_x  insult_x  threat_x  asian  \
0              0.0        0.0           0.000000  0.166667       0.0    NaN   
1              0

In [17]:
# Rename columns as requested.
# Names that appear with both an `_x` and a `_y` suffix: the `_x` variant gets
# the bare name, the `_y` variant gets a `_label` suffix.
suffixed_columns = ("obscene", "identity_attack", "insult", "threat", "sexual_explicit")
# Names without a suffix that simply gain `_label`.
label_only_columns = ("toxic", "severe_toxic", "gender", "race_or_ethnicity", "religion")

column_renames = {
    **{f"{name}_x": name for name in suffixed_columns},
    **{f"{name}_y": f"{name}_label" for name in suffixed_columns},
    **{name: f"{name}_label" for name in label_only_columns},
}

# Keys that are not present in the frame are silently ignored by rename().
combined_df.rename(columns=column_renames, inplace=True)

In [18]:
# Plain-text preview of the first ten rows to check the renamed headers.
print(combined_df.head(n=10))

        id    target                                       comment_text  \
0  5193512  0.166667  People would rather waste money on parks. The ...   
1  5089113  0.000000  When JR opens her mouth her face gets contorte...   
2  5854622  0.000000  The more the antifa show up and cause chaos at...   
3  6240221  0.166667  400 years of colonialism?\n\nMy daughter's eat...   
4  5171466  0.000000  The Globe knows the fate of Charlie  Hebdo is ...   
5  5907740  0.000000  So you think justice is punishing people for t...   
6   461324  0.000000  Frankly, I couldn't care less who North Korea ...   
7   787161  0.000000  What has your personal issue about an inherita...   
8   536860  0.000000  I think you need to smoke a bowl and chill. Pe...   
9  5748705  0.200000  DLNR is too incompetent to manage harbor impro...   

   severe_toxicity  obscene  identity_attack    insult  threat  asian  \
0              0.0      0.0         0.000000  0.166667     0.0    NaN   
1              0.0      0.0 

In [23]:
# Temporal train/validation split: parse 'created_date', order rows newest-first,
# fill missing values, and hold out the most recent 10% as validation.

# Step 1: Convert 'created_date' safely; unparseable values become NaT.
# utc=True normalises every timestamp to UTC, so inputs with differing UTC
# offsets still produce a proper datetime column (otherwise pandas can return
# an object column and the `.dt` accessor in Step 3 would fail).
combined_df['created_date'] = pd.to_datetime(
    combined_df['created_date'], errors='coerce', utc=True
)

# Step 2: Drop invalid dates and log how many rows were lost
before_drop = len(combined_df)
combined_df = combined_df.dropna(subset=['created_date'])
after_drop = len(combined_df)
dropped_rows = before_drop - after_drop

print(f"🗑️ Dropped {dropped_rows} rows due to invalid 'created_date' parsing.")

# Step 3: Remove timezone + sub-second precision for consistency
combined_df['created_date'] = combined_df['created_date'].dt.tz_localize(None).dt.floor('s')

# Step 4: Sort by date in descending order (latest first).
# Flooring to whole seconds creates ties; a stable sort keeps tied rows in
# their existing relative order, so the rows that land on either side of the
# split boundary are reproducible from run to run.
combined_df = combined_df.sort_values(by='created_date', ascending=False, kind='mergesort')

# Replace NaN in object (string/categorical) columns with 'none'
object_cols = combined_df.select_dtypes(include='object').columns
combined_df[object_cols] = combined_df[object_cols].fillna("none")

# Replace NaN in numeric columns with 0
numeric_cols = combined_df.select_dtypes(include=['number']).columns
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(0)

# Step 5: Split latest 10% for validation (rows are already newest-first)
val_size = int(len(combined_df) * 0.10)
val_split = combined_df.iloc[:val_size]  # latest 10%
train_split = combined_df.iloc[val_size:]  # rest

# Step 7: Print the first 10 rows of both splits to check
print("✅ First 10 rows of validation set:")
print(val_split.head(10))

print("\n✅ First 10 rows of training set:")
print(train_split.head(10))

# Step 8: Summary
print("✅ Split complete (latest first):")
print(f"  ➤ Dropped rows: {dropped_rows}")
print(f"  ➤ Training samples: {len(train_split)}")
print(f"  ➤ Validation samples: {len(val_split)}")

🗑️ Dropped 0 rows due to invalid 'created_date' parsing.
✅ First 10 rows of validation set:
              id    target                                       comment_text  \
560861   6334010  0.000000  Students defined as EBD are legally just as di...   
664042   6334009  0.621212  Anyone who is quoted as having the following e...   
647222   6333982  0.000000  thank you ,,,right or wrong,,, i am following ...   
1552847  6333969  0.000000  What do you call people who STILL think the di...   
1663933  6333967  0.000000  Maybe the tax on "things" would be collected w...   
1631549  6333963  0.200000  If Alberta had given them nothing, you'd STILL...   
757400   6333965  0.000000  cont....GBA:  "here's the summation of that "l...   
315153   6333957  0.300000               There's no whine like Alberta whine!   
491873   6333950  0.200000  Nah, I am too boring to parody.  This guy Camp...   
1396648  6333955  0.166667  Payette's point of view is legitimate and I th...   

         severe_

In [24]:
# Step 6: Save to disk
# Map each output filename to the split it should contain; insertion order
# (train first, then val) drives both the write and the download order.
split_outputs = {"train.csv": train_split, "val.csv": val_split}

for csv_name, split_frame in split_outputs.items():
    # index=False keeps the pandas row index out of the file.
    split_frame.to_csv(csv_name, index=False)

# Colab-only helper that triggers a browser download of a file.
from google.colab import files
for csv_name in split_outputs:
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
# Read the test set from the working directory.
test_df = pd.read_csv("test.csv")

# Columns to remove when present (order is kept because the list is printed).
columns_to_drop = [
    "bisexual",
    "heterosexual",
    "homosexual_gay_or_lesbian",
    "intellectual_or_learning_disability",
    "other_disability",
    "other_sexual_orientation",
    "physical_disability",
    "psychiatric_or_mental_illness",
    "publication_id",
    "parent_id",
    "article_id",
    "rating",
    "funny",
    "wow",
    "sad",
    "likes",
    "disagree",
    "sexual_orientation",
    "disability",
]

# Restrict the drop to names that exist in the frame so nothing raises.
existing_to_drop = [name for name in columns_to_drop if name in test_df.columns]
test_df = test_df.drop(columns=existing_to_drop)

# Report which columns were really removed, then preview the result.
print(f"🗑️ Dropped columns: {existing_to_drop}")

print(test_df.head(10))

🗑️ Dropped columns: ['bisexual', 'heterosexual', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'other_disability', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree']
        id                                       comment_text  \
0  7000000  Jeff Sessions is another one of Trump's Orwell...   
1  7000001  I actually inspected the infrastructure on Gra...   
2  7000002  No it won't . That's just wishful thinking on ...   
3  7000003  Instead of wringing our hands and nibbling the...   
4  7000004  how many of you commenters have garbage piled ...   
5  7000005  Why can't the Globe & Mail provide the symbols...   
6  7000006  That's already been happening, Carl, it's call...   
7  7000007  imagine the costs for security, transportation...   
8  7000008  So they want to put United on the board of thi...   
9  7000009  No, no that can't 

In [5]:

# Fill missing values in the test set so no NaN remains in object or numeric
# columns.

# Replace NaN in object (string/categorical) columns with 'none'
object_cols = test_df.select_dtypes(include='object').columns
test_df[object_cols] = test_df[object_cols].fillna("none")

# Replace NaN in numeric columns with 0
numeric_cols = test_df.select_dtypes(include=['number']).columns
test_df[numeric_cols] = test_df[numeric_cols].fillna(0)

# The frame shown here is the test set; the header previously said "training set".
print("\n First 10 rows of test set:")
print(test_df.head(10))


 First 10 rows of training set:
        id                                       comment_text  \
0  7000000  Jeff Sessions is another one of Trump's Orwell...   
1  7000001  I actually inspected the infrastructure on Gra...   
2  7000002  No it won't . That's just wishful thinking on ...   
3  7000003  Instead of wringing our hands and nibbling the...   
4  7000004  how many of you commenters have garbage piled ...   
5  7000005  Why can't the Globe & Mail provide the symbols...   
6  7000006  That's already been happening, Carl, it's call...   
7  7000007  imagine the costs for security, transportation...   
8  7000008  So they want to put United on the board of thi...   
9  7000009  No, no that can't be.  It's Russia that's the ...   

                    created_date  toxicity  severe_toxicity   obscene  \
0  2017-01-26 07:37:38.422417+00  0.200000              0.0  0.000000   
1  2016-12-03 20:38:21.204649+00  0.000000              0.0  0.000000   
2  2017-05-05 18:07:58.560078+00

In [6]:
# Persist the cleaned test frame; index=False keeps the row index out of the CSV.
test_df.to_csv(path_or_buf="test_clean.csv", index=False)


In [7]:
# Colab-only: `google.colab.files` is not importable outside a Colab runtime.
from google.colab import files
# Trigger a browser download of the CSV written by the previous cell.
files.download('test_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>