In [None]:
import pandas as pd
import os

In [None]:
# Location of the labelled CSV files (relative to this notebook)
data_directory = "../Original Reddit Data/Labelled Data"  # Update this to your directory path

# File names to merge into a single frame
csv_files = ["LD DA 1.csv", "LD EL1.csv", "LD PF1.csv", "LD TS 1.csv"]

# Load every file, tagging each row with the file it was read from so the
# origin of a record is still known after concatenation
dataframes = []
for file in csv_files:
    file_path = os.path.join(data_directory, file)
    df = pd.read_csv(file_path).assign(source_file=file)
    print(f"Loaded {file} with shape {df.shape}")
    dataframes.append(df)

# Stack the per-file frames row-wise under a fresh 0..n-1 index
combined_df = pd.concat(dataframes, axis=0, ignore_index=True)

# Remove the 'CAT 1' column (note the space in the name) when it is present
if 'CAT 1' in combined_df.columns:
    combined_df = combined_df.drop(columns=['CAT 1'])
    print("Dropped CAT 1 column")

# Summarise the merged frame: dimensions first, then the column names
print(f"\nCombined DataFrame Shape: {combined_df.shape}")
print("\nCombined DataFrame Columns:")
print(list(combined_df.columns))

# Preview the first rows as the cell's rich output
combined_df.head(5)

Loaded LD DA 1.csv with shape (223, 6)
Loaded LD EL1.csv with shape (200, 7)
Loaded LD PF1.csv with shape (200, 7)
Loaded LD TS 1.csv with shape (200, 7)
Dropped CAT 1 column

Combined DataFrame Shape: (823, 6)

Combined DataFrame Columns:
['score', 'selftext', 'subreddit', 'title', 'Label', 'source_file']


Unnamed: 0,score,selftext,subreddit,title,Label,source_file
0,1.0,Tried to watch this documentary “anxious Ameri...,Anxiety,Do people get over anxiety?,Drug and Alcohol,LD DA 1.csv
1,1.0,"i’m currently laying in bed wide awake, feelin...",Anxiety,does anyone else have this big fear of suddenl...,Drug and Alcohol,LD DA 1.csv
2,2.0,Second time trying weed. First time felt close...,Anxiety,3 hour long panic attack after trying weed,Drug and Alcohol,LD DA 1.csv
3,1.0,"I am not posting this for me, but rather for m...",Anxiety,Please leave in the comments ANYTHING that has...,Drug and Alcohol,LD DA 1.csv
4,1.0,21 year old male been dealing with anxiety eve...,Anxiety,Alcohol induced,Drug and Alcohol,LD DA 1.csv


In [8]:
# Report how many missing entries each column of the merged frame contains
print("\nNull values in each column:")
null_counts = combined_df.isna().sum()
print(null_counts)


Null values in each column:
score          23
selftext       23
subreddit      23
title          23
Label          23
source_file     0
dtype: int64


In [9]:
# Keep only the rows in which every column is populated; the original frame
# is left untouched so this cell can be re-run safely
combined_df_clean = combined_df.dropna(axis=0, how="any")

# Report how many rows the filter discarded and the resulting dimensions
rows_removed = len(combined_df) - len(combined_df_clean)
print(f"\nRemoved {rows_removed} rows with null values")
print(f"Clean DataFrame Shape: {combined_df_clean.shape}")


Removed 23 rows with null values
Clean DataFrame Shape: (800, 6)


In [10]:
# Persist the cleaned dataset without the positional index.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise a fresh checkout without a
# `data/` directory would fail here with an OSError.
output_path = "data/combined_labeled_data_clean.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined_df_clean.to_csv(output_path, index=False)
print("Clean dataset saved to combined_labeled_data_clean.csv")

Clean dataset saved to combined_labeled_data_clean.csv
