# Data Import and Filtering

Import raw data files and extract 500 datapoints from each file with above-average text length.

In [1]:
!pwd

/home/kuper/projects/datich/ml


In [2]:
import pandas as pd
import os
from pathlib import Path

# Define the data directory
data_dir = Path("data/raw")

# Get list of all CSV files
csv_files = sorted(data_dir.glob("*.csv"))
print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  - {f.name}")

# Load all data files
dfs = {}
for file in csv_files:
    dfs[file.stem] = pd.read_csv(file)
    print(f"Loaded {file.stem}: {len(dfs[file.stem])} rows")


Found 5 CSV files:
  - Swfeb22.csv
  - anxifeb22.csv
  - depfeb22.csv
  - lonefeb22.csv
  - mhfeb22.csv
Loaded Swfeb22: 13014 rows
Loaded anxifeb22: 6551 rows
Loaded depfeb22: 13152 rows
Loaded lonefeb22: 4725 rows
Loaded mhfeb22: 8629 rows


In [3]:
# Process each dataset: filter for above-average length text and take 500 samples
filtered_data = {}

for name, df in dfs.items():
    # Calculate text length (using 'selftext' column)
    df['text_length'] = df['selftext'].astype(str).str.len()
    # Calculate average length
    avg_length = df['text_length'].mean()
    print(f"\n{name}:")
    print(f"  Average text length: {avg_length:.2f} characters")
    
    # Filter for above-average length texts
    above_avg = df[df['text_length'] > avg_length//2]
    print(f"  Records above average: {len(above_avg)} / {len(df)}")
    
    # Take up to 500 samples
    sample_size = min(500, len(above_avg))
    filtered_data[name] = above_avg.sample(n=sample_size, random_state=42)
    print(f"  Samples taken: {sample_size}")
    
    # Show statistics
    print(f"  Min length: {filtered_data[name]['text_length'].min()}")
    print(f"  Max length: {filtered_data[name]['text_length'].max()}")
    print(f"  Mean length: {filtered_data[name]['text_length'].mean():.2f}")



Swfeb22:
  Average text length: 616.88 characters
  Records above average: 6499 / 13014
  Samples taken: 500
  Min length: 309.0
  Max length: 13098.0
  Mean length: 1073.30

anxifeb22:
  Average text length: 759.14 characters
  Records above average: 4013 / 6551
  Samples taken: 500
  Min length: 380.0
  Max length: 7801.0
  Mean length: 1111.24

depfeb22:
  Average text length: 683.65 characters
  Records above average: 7087 / 13152
  Samples taken: 500
  Min length: 343
  Max length: 5753
  Mean length: 1092.31

lonefeb22:
  Average text length: 497.07 characters
  Records above average: 2444 / 4725
  Samples taken: 500
  Min length: 252
  Max length: 7321
  Mean length: 871.61

mhfeb22:
  Average text length: 1005.11 characters
  Records above average: 5224 / 8629
  Samples taken: 500
  Min length: 505.0
  Max length: 9830.0
  Mean length: 1530.69


In [4]:
# Combine all filtered datasets
combined_data = pd.concat(filtered_data.values(), ignore_index=True)
print(f"\nTotal combined dataset: {len(combined_data)} samples")
print(f"\nDataset breakdown:")
for name in filtered_data:
    print(f"  {name}: {len(filtered_data[name])} samples")




Total combined dataset: 2500 samples

Dataset breakdown:
  Swfeb22: 500 samples
  anxifeb22: 500 samples
  depfeb22: 500 samples
  lonefeb22: 500 samples
  mhfeb22: 500 samples


In [5]:
combined_data

Unnamed: 0.1,Unnamed: 0,author,created_utc,score,selftext,subreddit,title,timestamp,text_length
0,11187,JynxOnFire,1643959578,1,I'm suicidal. Big shocker. I have no ambitions...,SuicideWatch,I miss the old me lmao,2022-02-04 18:26:18,588.0
1,3377,OkayConnection,1645408291,1,They have crept into my world again. I‚Äôm not ...,SuicideWatch,Ideations,2022-02-21 12:51:31,929.0
2,11775,iColtBrawl,1643851496,1,"17, can't keep working 9-5 type jobs. Just gon...",SuicideWatch,Not rich and I wanna kill myself,2022-02-03 12:24:56,309.0
3,3526,SainikJr,1645390146,1,I‚Äôm not the person who I thought i should be. ...,SuicideWatch,I need to die.,2022-02-21 07:49:06,415.0
4,11454,Jack_Ingoff123,1643923723,1,I can't take it anymore. I just want to rip ou...,SuicideWatch,I'm so miserable all the fucking time,2022-02-04 08:28:43,683.0
...,...,...,...,...,...,...,...,...,...
2495,3527,K-A-Mck,1645063050,1,Hi there. I have a chronic mental health probl...,mentalhealth,High functioning.,2022-02-17 12:57:30,813.0
2496,8080,Throwaway438901993,1643772780,1,For the past 5 months I've been struggling to ...,mentalhealth,I want to give up,2022-02-02 14:33:00,946.0
2497,1922,hey--world--,1645512773,1,I know being seeking advice from random strang...,mentalhealth,Am I depressed? (Serious),2022-02-22 17:52:53,1037.0
2498,2828,blacknwhitejedi,1645282759,1,I recently joined a functional medicine progra...,mentalhealth,I joined a functional medicine program and now...,2022-02-20 01:59:19,998.0


In [6]:
combined_data.columns

Index(['Unnamed: 0', 'author', 'created_utc', 'score', 'selftext', 'subreddit',
       'title', 'timestamp', 'text_length'],
      dtype='str')

In [7]:
combined_data.drop(columns=['text_length', 'Unnamed: 0', 'author', 'created_utc', 'score','title', 'timestamp', 'text_length'], inplace=True)

In [8]:
combined_data['subreddit'].values.unique()

<StringArray>
['SuicideWatch', 'Anxiety', 'depression', 'lonely', 'mentalhealth']
Length: 5, dtype: str

In [9]:
combined_data["selftext"] = combined_data["selftext"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

In [12]:
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
combined_data.to_csv("data/processed/to_be_labelled.csv",index=False)

# NEW TO BE LABELLED

In [2]:
# Create 5000 new samples that do not overlap with existing to_be_labelled.csv
import pandas as pd
from pathlib import Path

data_dir = Path("data/raw")
csv_files = sorted(data_dir.glob("*.csv"))

# Load all raw files
dfs = [pd.read_csv(file) for file in csv_files]
raw_data = pd.concat(dfs, ignore_index=True)

# Clean selftext and drop empties
raw_data["selftext"] = raw_data["selftext"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
raw_data = raw_data[raw_data["selftext"].str.len() > 0].copy()

# Load existing labelled candidates to avoid overlap
existing = pd.read_csv("data/processed/to_be_labelled.csv")
existing_texts = set(existing["selftext"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip())

# Remove any rows already present in existing file
candidates = raw_data[~raw_data["selftext"].isin(existing_texts)].copy()

print(f"Raw rows: {len(raw_data)}")
print(f"Existing rows: {len(existing)}")
print(f"Candidates after de-dup: {len(candidates)}")

# Sample 5000 new rows (or as many as available)
sample_size = min(7000, len(candidates_avg))
new_samples = candidates_avg.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Keep only required columns if present
cols_to_keep = [col for col in ["selftext", "subreddit"] if col in new_samples.columns]
new_samples = new_samples[cols_to_keep]

new_samples


Unnamed: 0,selftext,subreddit
0,Feeling very bad. Managed to call a suicide pr...,SuicideWatch
1,Hello üëãüèΩ I was prescribed 25mg of trazodone to...,depression
2,I'm 21 years old and I've been in a wheelchair...,depression
3,"Sometimes when im trying to sleep, even though...",Anxiety
4,I'm 21. I am in college. Nothing is okay. my p...,SuicideWatch
...,...,...
4995,"Sorry, I don‚Äôt know if i‚Äôm suppose to ask this...",mentalhealth
4996,Most of the time I have suicide on my mind. At...,SuicideWatch
4997,Ive started restricting in the last month or s...,mentalhealth
4998,Hey 15 M I have a whole plan tonight I‚Äôm going...,SuicideWatch


In [None]:
new_samples.to_csv("data/processed/new_samplecsv", index=False)