DATA CLEANING AND BALANCING 

In [29]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

Step 1: Load the dataset

In [30]:
file_path = 'uncleaned.csv'
# Parse the raw CSV a single time; reading the same file twice only doubles
# the I/O and parsing cost without producing a different result.
data = pd.read_csv(file_path)

# Keep the raw frame reachable under its own name. The next cleaning step
# rebinds `data` to a new frame (drop_duplicates returns a new object), so this
# reference continues to point at the untouched raw rows.
uncleaned_data = data

# Display the top 10 rows of the uncleaned dataset
print(uncleaned_data.head(10))

   label                                            comment       author  \
0      0                                         NC and NH.    Trumpbart   
1      0  You do know west teams play against west teams...    Shbshb906   
2      0  They were underdogs earlier today, but since G...     Creepeth   
3      0  This meme isn't funny none of the "new york ni...    icebrotha   
4      0                    I could use one of those tools.    cush2push   
5      0  I don't pay attention to her, but as long as s...  only7inches   
6      0      Trick or treating in general is just weird...  only7inches   
7      0                    Blade Mastery+Masamune or GTFO!    P0k3rm4s7   
8      0  You don't have to, you have a good build, buy ...   SoupToPots   
9      0                  I would love to see him at lolla.     chihawks   

            subreddit  score  ups  downs     date       created_utc  \
0            politics      2   -1     -1  2016-10  10/16/2016 23:55   
1                 nba

Step 2: Remove duplicate rows

In [31]:
# Keep the first occurrence of every fully identical row (all columns compared).
data = data[~data.duplicated()]

Step 3: Retain only 'label' and 'comment' columns

In [32]:
# Project the frame down to the two columns kept in the cleaned output.
# NOTE(review): duplicates were removed while all columns were still present, so
# rows that differ only in the dropped columns may now be identical
# (label, comment) pairs — confirm whether those should be de-duplicated too.
data = data.loc[:, ['label', 'comment']]

Step 4: Remove rows with missing or empty 'label' or 'comment'

In [33]:
# Discard rows where either column is missing.
data = data.dropna(subset=['label', 'comment'])
# Then discard rows whose comment is empty or whitespace-only.
data = data[data['comment'].str.strip().ne('')]

Step 5: Ensure 'label' column contains only 0s and 1s

In [34]:
# Keep only the rows whose label is one of the two allowed values, 0 or 1.
data = data[data['label'].isin([0, 1])]

Step 6: Balance the number of 0s and 1s in the 'label' column

In [35]:
# Count the rows per class once. A class that is absent is treated as having
# 0 rows instead of failing with an opaque KeyError on the lookup.
class_counts = data['label'].value_counts()
count_0 = int(class_counts.get(0, 0))
count_1 = int(class_counts.get(1, 0))
# Determine the smaller count to balance the dataset
min_count = min(count_0, count_1)
# Balancing against an empty class would silently produce an empty dataset,
# so stop here with a message that names the actual class sizes.
if min_count == 0:
    raise ValueError(
        f"Cannot balance the dataset: label 0 has {count_0} rows "
        f"and label 1 has {count_1} rows."
    )
# Randomly sample min_count rows from each class (fixed seed for reproducibility)
data_0 = data[data['label'] == 0].sample(min_count, random_state=1)
data_1 = data[data['label'] == 1].sample(min_count, random_state=1)
# Concatenate the balanced data
balanced_data = pd.concat([data_0, data_1])
# Shuffle the balanced data so the two classes are interleaved, then renumber rows
balanced_data = balanced_data.sample(frac=1, random_state=1).reset_index(drop=True)

Step 7: Save the cleaned and balanced dataset to a new CSV file

In [36]:
# Write the balanced frame to CSV, leaving the row index out of the file.
cleaned_file_path = 'cleaned_balanced_dataset.csv'
balanced_data.to_csv(path_or_buf=cleaned_file_path, index=False)

Step 8: Display the label counts and preview the cleaned dataset

In [37]:
# Report how many rows each class has after balancing
label_counts = balanced_data['label'].value_counts()
print("Count of 0s and 1s in the 'label' column:", label_counts, sep="\n")

print(f"Cleaned and balanced dataset saved to {cleaned_file_path}")

# Read the saved file back and preview its first 10 rows
cleaned_data = pd.read_csv(cleaned_file_path)
print(cleaned_data.head(n=10))

Count of 0s and 1s in the 'label' column:
label
0    65015
1    65015
Name: count, dtype: int64
Cleaned and balanced dataset saved to cleaned_balanced_dataset.csv
   label                                            comment
0      0                               All time or current?
1      0  I'm also hearing the white noice, i'm not sure...
2      1                          You need 8 cameras though
3      0                           Trump as Frey confirmed.
4      1  I didn't know that Flevoland was bigger in 186...
5      1  The thing you should put at the top of your tr...
6      1  The only thing Ohio has going for it is that i...
7      0                                  Debo, warlock 389
8      1       They probably just don't want to get shot...
9      0                                      You saw that?


SPLITTING THE CLEANED DATASET INTO 80% TRAIN AND 20% TEST DATASETS

In [38]:
# Partition the balanced frame by class so each class is split on its own
data_0 = balanced_data.loc[balanced_data['label'].eq(0)]
data_1 = balanced_data.loc[balanced_data['label'].eq(1)]

# 80% train / 20% test within each class, using the same fixed seed for both
split_options = {'test_size': 0.2, 'random_state': 1}
train_0, test_0 = train_test_split(data_0, **split_options)
train_1, test_1 = train_test_split(data_1, **split_options)

# Stack the per-class pieces back together
train_data = pd.concat([train_0, train_1], axis=0)
test_data = pd.concat([test_0, test_1], axis=0)

# Shuffle each set so the classes are interleaved, then renumber the rows
train_data = (
    train_data
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
test_data = (
    test_data
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Write both sets to CSV without the row index
train_file_path = 'train_dataset.csv'
train_data.to_csv(path_or_buf=train_file_path, index=False)

test_file_path = 'test_dataset.csv'
test_data.to_csv(path_or_buf=test_file_path, index=False)

In [39]:
# Per-class row counts for the two splits
train_label_counts = train_data['label'].value_counts()
test_label_counts = test_data['label'].value_counts()

print(
    "Count of 0s and 1s in the 'label' column of the training dataset:",
    train_label_counts,
    sep="\n",
)
print(
    "Count of 0s and 1s in the 'label' column of the testing dataset:",
    test_label_counts,
    sep="\n",
)

print(f"Training dataset saved to {train_file_path}")
print(f"Testing dataset saved to {test_file_path}")

# Read the saved TRAIN file back and preview its first 10 rows
print("Display the top 10 rows of the cleaned TRAIN dataset:")
train_data = pd.read_csv(train_file_path)
print(train_data.head(n=10))

# Read the saved TEST file back and preview its first 10 rows
print("Display the top 10 rows of the cleaned TEST dataset")
test_data = pd.read_csv(test_file_path)
print(test_data.head(n=10))

Count of 0s and 1s in the 'label' column of the training dataset:
label
1    52012
0    52012
Name: count, dtype: int64
Count of 0s and 1s in the 'label' column of the testing dataset:
label
1    13003
0    13003
Name: count, dtype: int64
Training dataset saved to train_dataset.csv
Testing dataset saved to test_dataset.csv
Display the top 10 rows of the cleaned TRAIN dataset:
   label                                            comment
0      1  Everyone knows point guards average less rebou...
1      1            Was this meant for r/TheModdingOfIsaac?
2      1  I'd say probably overall would be Derm: the lo...
3      1                 Bullshit its got a kvlt album art!
4      1                                  You forgot to add
5      1                  Well, he still has about a month.
6      1  I'm sure the Donald will have something to say...
7      0  If your values lead you to helping hand this c...
8      0  I'm really confused, isn't carnival celebrated...
9      1    Like all 