In [3]:
import pandas as pd
import os

In [4]:
# Input directories to scan for CSV files, one path per line.
# None of the paths contain whitespace, so a plain split() yields the list.
file_prefixes = """
    ../English/Adhd_eng/
    ../English/Anxiety_eng/
    ../English/Asd_eng/
    ../English/Bipolar_eng/
    ../English/Control_eng/
    ../English/Depression_eng/
    ../English/Eating_eng/
    ../English/Ocd_eng/
    ../English/Ptsd_eng/
    ../English/Schizophrenia_eng/
""".split()


In [5]:
# Collect one DataFrame per CSV file found in the input directories
dataframes = []

# Loop through each prefix and read CSV files
for prefix in file_prefixes:
    # os.listdir() returns entries in arbitrary order; sort the names so the row
    # order of the combined frame is the same on every run and machine
    full_paths = [
        os.path.join(prefix, file)
        for file in sorted(os.listdir(prefix))
        if file.endswith('.csv')
    ]
    dataframes.extend(pd.read_csv(file) for file in full_paths)

# pd.concat() on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that names the searched directories
if not dataframes:
    raise ValueError(f"No CSV files found in any of: {file_prefixes}")

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
all_data = pd.concat(dataframes, ignore_index=True)

In [6]:
# Display the concatenated frame as this cell's output
all_data

Unnamed: 0,class,tweet_id,day,time,tweet,tweet_favorite_count,tweet_retweet_count,tweet_source,user_id,user_followers_count,user_friends_count,user_listed_count,user_statuses_count
0,ADHD,1467992264270045193,2021-12-06,23:00:04,"""Miss this HTTPURL""",0,0,Twitter Web App,758856143460458497,565,383,2,2092
1,ADHD,1467990574879981568,2021-12-06,22:53:21,"""The doctor accepted my application and I have...",0,0,Twitter Web App,758856143460458497,565,383,2,2092
2,ADHD,1464801014159228932,2021-11-28,03:39:11,"""Im back again yay HTTPURL""",1,0,Twitter Web App,758856143460458497,565,383,2,2092
3,ADHD,1402116346016632832,2021-06-08,04:12:41,"""Thats great though I hope my mutuals are in r...",1,0,Twitter for iPhone,758856143460458497,565,383,2,2092
4,ADHD,1402116229494710277,2021-06-08,04:12:13,"""Damn I havent come on here since last year an...",1,0,Twitter for iPhone,758856143460458497,565,383,2,2092
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5170488,SCHIZOPHRENIA,1489028693867569156,2022-02-03,00:11:20,"""@USER It's something I definitely have to loo...",0,0,Twitter for Android,1404917788154777605,695,377,6,4997
5170489,SCHIZOPHRENIA,1489013271982227457,2022-02-02,23:10:03,"""I have some PTSD symptoms like flashbacks, av...",1,0,Twitter for Android,1404917788154777605,695,377,6,4997
5170490,SCHIZOPHRENIA,1489012728018710529,2022-02-02,23:07:53,"""The biggest trigger for my OCD compulsions is...",2,0,Twitter for Android,1404917788154777605,695,377,6,4997
5170491,SCHIZOPHRENIA,1489010994995150854,2022-02-02,23:01:00,"""He's a disgusting person. Just thinking about...",2,0,Twitter for Android,1404917788154777605,695,377,6,4997


In [20]:
import re
from concurrent.futures import ProcessPoolExecutor
from rapidfuzz import process, fuzz

In [21]:
import json

# Load the informal-word mapping from JSON. The encoding is given explicitly so the
# file is decoded as UTF-8 regardless of the platform's default text encoding.
with open('informal_mapping.json', 'r', encoding='utf-8') as f:
    informal_mapping = json.load(f)

In [22]:
# The distinct informal spellings, i.e. the keys of informal_mapping, as a set
INFORMAL_WORDS = set(informal_mapping)

def fuzzy_replace(word):
    """Return the replacement for ``word`` from ``informal_mapping``, or ``word`` itself.

    Lookup order:
      1. An exact key match in ``informal_mapping`` returns its value.
      2. Words of length <= 2 are returned unchanged (no fuzzy matching).
      3. Otherwise the best key according to ``fuzz.WRatio`` is used, provided its
         score reaches the cutoff of 93.
      4. If nothing reaches the cutoff, ``word`` is returned unchanged.

    Args:
        word: A single token (string).

    Returns:
        The mapped replacement, or the original ``word``.
    """
    # Exact hit: cheapest path, no fuzzy scoring needed
    if word in informal_mapping:
        return informal_mapping[word]

    # Skip very short words
    if len(word) <= 2:
        return word

    # The choices are the dict's keys in insertion order rather than a set: iteration
    # order of a set of strings can differ between interpreter runs (hash
    # randomization), so which of several equally scored keys is picked may
    # otherwise not be reproducible.
    matched = process.extractOne(
        word,
        informal_mapping.keys(),
        scorer=fuzz.WRatio,
        score_cutoff=93,  # high threshold to favour precision
    )

    # extractOne yields None when no choice reaches score_cutoff
    if matched is None:
        return word

    # matched[0] is the matching key
    return informal_mapping[matched[0]]


In [23]:
URL_PATTERN = re.compile(r'https?://\S+')
USER_PATTERN = re.compile(r'@\w+')
SPECIAL_CHARS = re.compile(r'[^a-zA-Z0-9\s]')
# Letters only: with \w, digit runs were collapsed as well, turning "1000" into "10"
REPEAT_CHARS = re.compile(r'([^\W\d_])\1{2,}')
REPEAT_NON_WORD = re.compile(r'(\W)\1{2,}')
EXTRA_SPACES = re.compile(r'\s+')

def clean_tweet(tweet):
    """Normalise one tweet, or return None when it should be discarded.

    Steps, in order:
      1. Pull out URLs and @mentions so they survive the cleaning untouched.
      2. Lowercase the remaining text and strip everything except ASCII letters,
         digits and whitespace.
      3. Collapse any letter repeated three or more times to a single letter
         (digits are left alone so numbers are not altered).
      4. Discard the tweet if fewer than three words remain.
      5. Pass every word through ``fuzzy_replace``.
      6. Return the extracted mentions and URLs followed by the cleaned text.

    Args:
        tweet: Raw tweet text. Any non-string value is treated as a missing tweet.

    Returns:
        The cleaned string, or None for non-string input, for tweets with fewer
        than three words after cleaning, or when the cleaned text is empty.

    Errors raised while processing a string (for example inside ``fuzzy_replace``)
    propagate to the caller instead of being silently converted to None, so a broken
    pipeline cannot masquerade as "every tweet was rejected".
    """
    # Explicit guard for missing / non-text values instead of a blanket except
    if not isinstance(tweet, str):
        return None

    # Extract and preserve URLs and user mentions
    urls = URL_PATTERN.findall(tweet)
    users = USER_PATTERN.findall(tweet)

    # Remove URLs and user mentions from the text that gets normalised
    tweet = URL_PATTERN.sub('', tweet)
    tweet = USER_PATTERN.sub('', tweet)

    # Convert to lowercase
    tweet = tweet.lower()

    # Remove special characters, then collapse elongated letters ("sooo" -> "so")
    tweet = SPECIAL_CHARS.sub('', tweet)
    tweet = REPEAT_CHARS.sub(r'\1', tweet)
    # After SPECIAL_CHARS only whitespace can still match \W here
    tweet = REPEAT_NON_WORD.sub(r'\1', tweet)

    # Split into words
    words = tweet.split()

    # Check if tweet is too short
    if len(words) < 3:
        return None

    # Replace each word individually via the informal-word mapping
    words = [fuzzy_replace(word) for word in words]

    # Join words and clean up spaces (replacements may themselves contain spaces)
    tweet = EXTRA_SPACES.sub(' ', ' '.join(words)).strip()

    # Combine with preserved tokens: mentions first, then URLs, then the text
    preserved_tokens = users + urls
    return ' '.join(preserved_tokens + [tweet]) if tweet else None

In [24]:
def process_tweets(tweets, num_workers=None):
    """Clean every tweet with ``clean_tweet`` in a pool of worker processes.

    Args:
        tweets: Iterable of raw tweets (list, pandas Series, generator, ...).
        num_workers: Number of worker processes. When None, uses
            ``min(32, cpu_count + 4)``, treating an undeterminable CPU count as 1.

    Returns:
        A list with one entry per input tweet, in input order; each entry is
        whatever ``clean_tweet`` returned for that tweet.
    """
    # Materialise the input so len() works for any iterable, generators included
    tweets = list(tweets)

    if num_workers is None:
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to 1 instead of failing on "None + 4"
        num_workers = min(32, (os.cpu_count() or 1) + 4)

    # Hand out work in large chunks (at least 1000 tweets each) to keep
    # inter-process overhead low
    chunk_size = max(1000, len(tweets) // (num_workers * 2))

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        cleaned_tweets = list(executor.map(clean_tweet, tweets, chunksize=chunk_size))

    return cleaned_tweets

In [25]:
# Peek at the raw tweets with index labels 1000 through 1020 (.loc slices include both ends)
all_data["tweet"].loc[1000:1020].tolist()

['"my one (1) feeling"',
 '"anyways Im being actively ignored its its really hurting my feeling"',
 '"Ive decided that Im oversharing on my main acc and its unacceptable so Ill be making the attempt to move my sadness here"',
 '"look at u  doing that HTTPURL"',
 '"anyway I\'ve changed my mind"',
 '"moving on then"',
 '"I\'m too childish for a relationship I have just come to this conclusion"',
 '"I have not been sad in a while!!! feels weird but good!!"',
 '"It\'s too early!!!!"',
 '"I skipped school yesterday bc I didn\'t feel well and needed some sleep and I didn\'t do my assignments 😍"',
 '"henlo"',
 '"I\'m just hear to find out if I have more characters  I found out I do not"',
 '"Relationship pending"',
 '"He\'s so cute I haven\'t been sad in while"',
 '"I like boy"',
 '"@USER What u say"',
 '"I wish I stayed home"',
 '"he\'s funny HTTPURL"',
 '"I would be here for this I\'m not even gonna pretend I wouldn\'t be HTTPURL"',
 '"Henlo i am lone ly"',
 '"call me dramatic but I would a

In [26]:
# Spot-check the cleaning pipeline on the tweets with index labels 1000 through 1020
process_tweets(all_data["tweet"].loc[1000:1020])

['my one 1 feeling',
 'anyways i am being actively ignored it is it is really hurting my feeling',
 'i have decided that i am oversharing on my main acc and it is unacceptable so i will be making the attempt to move my sadness here',
 'look at you doing that httpurl',
 'anyway i have changed my mind',
 'moving on then',
 'i am too childish for a relationship i have just come to this conclusion',
 'i have not been sad in a while feels weird but good',
 'it is too early',
 'i skipped school yesterday because i did not feel well and needed some sleep and i did not do my assignments',
 None,
 'i am just hear to find out if i have more characters i found out i do not',
 None,
 'hes so cute i have not been sad in while',
 'i like boy',
 '@USER what you say',
 'i wish i stayed home',
 'hes funny httpurl',
 'i would be here for this i am not even gonna pretend i would not be httpurl',
 'henlo i am lone ly',
 'call me dramatic but i would absolutely rather be emotionally distant than have a mel

In [27]:
# Run the cleaning pipeline over every tweet in the corpus
cleaned_tweets = process_tweets(all_data['tweet'])

# Pair each cleaned tweet with its class label ...
labelled = pd.DataFrame({'tweet': cleaned_tweets, 'class': all_data['class']})

# ... and drop every row that has a missing value in either column
final_data = labelled.dropna()

# Write the result to disk without the index column
final_data.to_csv('cleaned_tweets.csv', index=False)

In [53]:
# Display the cleaned tweet/class frame as this cell's output
final_data

Unnamed: 0,tweet,class
0,miss this httpurl,ADHD
1,the doctor accepted my application and i have ...,ADHD
2,i am back again yay httpurl,ADHD
3,thats great though i hope my mutuals are in re...,ADHD
4,damn i havent come on here since last year and...,ADHD
...,...,...
5170488,user it is something i definitely have to look...,SCHIZOPHRENIA
5170489,i have some ptsd symptoms like flashbacks avoi...,SCHIZOPHRENIA
5170490,the biggest trigger for my ocd compulsions is ...,SCHIZOPHRENIA
5170491,hes a disgusting person just thinking about hi...,SCHIZOPHRENIA
