In [1]:
import pandas as pd
import os

In [2]:
# Relative directory prefixes to scan for CSV files: one "<Name>_eng" folder
# under ../English per entry below.
file_prefixes = [
    f"../English/{folder}_eng/"
    for folder in (
        "Adhd",
        "Anxiety",
        "Asd",
        "Bipolar",
        "Control",
        "Depression",
        "Eating",
        "Ocd",
        "Ptsd",
        "Schizophrenia",
    )
]


In [3]:
# Accumulates one DataFrame per CSV file found under the prefix directories
dataframes = []

for prefix in file_prefixes:
    # os.listdir makes no promise about ordering, so sort the paths: the
    # concatenated row order (and the integer index that later cells slice
    # with .loc) is then the same on every run and every machine.
    full_paths = sorted(
        os.path.join(prefix, name)
        for name in os.listdir(prefix)
        if name.endswith('.csv')
    )
    dataframes.extend(pd.read_csv(path) for path in full_paths)

# pd.concat fails with a generic "No objects to concatenate" on an empty list;
# raise the same exception type with a message that points at the real cause.
if not dataframes:
    raise ValueError(f"No .csv files found under any of: {file_prefixes}")

# Stack everything into one frame with a fresh 0..n-1 index
all_data = pd.concat(dataframes, ignore_index=True)

In [4]:
# Preview the combined frame (the notebook rendering shows only the first and last rows)
all_data

Unnamed: 0,class,tweet_id,day,time,tweet,tweet_favorite_count,tweet_retweet_count,tweet_source,user_id,user_followers_count,user_friends_count,user_listed_count,user_statuses_count
0,ADHD,1467992264270045193,2021-12-06,23:00:04,"""Miss this HTTPURL""",0,0,Twitter Web App,758856143460458497,565,383,2,2092
1,ADHD,1467990574879981568,2021-12-06,22:53:21,"""The doctor accepted my application and I have...",0,0,Twitter Web App,758856143460458497,565,383,2,2092
2,ADHD,1464801014159228932,2021-11-28,03:39:11,"""Im back again yay HTTPURL""",1,0,Twitter Web App,758856143460458497,565,383,2,2092
3,ADHD,1402116346016632832,2021-06-08,04:12:41,"""Thats great though I hope my mutuals are in r...",1,0,Twitter for iPhone,758856143460458497,565,383,2,2092
4,ADHD,1402116229494710277,2021-06-08,04:12:13,"""Damn I havent come on here since last year an...",1,0,Twitter for iPhone,758856143460458497,565,383,2,2092
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5170488,SCHIZOPHRENIA,1489028693867569156,2022-02-03,00:11:20,"""@USER It's something I definitely have to loo...",0,0,Twitter for Android,1404917788154777605,695,377,6,4997
5170489,SCHIZOPHRENIA,1489013271982227457,2022-02-02,23:10:03,"""I have some PTSD symptoms like flashbacks, av...",1,0,Twitter for Android,1404917788154777605,695,377,6,4997
5170490,SCHIZOPHRENIA,1489012728018710529,2022-02-02,23:07:53,"""The biggest trigger for my OCD compulsions is...",2,0,Twitter for Android,1404917788154777605,695,377,6,4997
5170491,SCHIZOPHRENIA,1489010994995150854,2022-02-02,23:01:00,"""He's a disgusting person. Just thinking about...",2,0,Twitter for Android,1404917788154777605,695,377,6,4997


In [5]:
import re
from rapidfuzz import process
from multiprocessing import Pool

In [35]:

# Lookup table used by clean_tweet: informal token -> expanded phrase.
#
# Each key appears exactly once (a repeated key in a dict literal silently
# overwrites the earlier entry). Values are lowercase and free of punctuation
# so that a substituted phrase looks like the rest of the cleaned tweet, which
# has already been lowercased and stripped of special characters.
# Insertion order is kept as in the original table.
informal_mapping = {
    # Abbreviations and chat slang
    "u": "you",
    "im": "i am",
    "ur": "your",
    "bc": "because",
    "urself": "yourself",
    "lmao": "laughing my ass off",
    "brb": "be right back",
    "gtg": "got to go",
    "ttyl": "talk to you later",
    "omg": "oh my god",
    "idk": "i do not know",
    "bff": "best friends forever",
    "fyi": "for your information",
    "imo": "in my opinion",
    "tbh": "to be honest",
    "smh": "shaking my head",
    "lol": "laugh out loud",
    "rofl": "rolling on the floor laughing",
    "wtf": "what the fuck",
    "np": "no problem",
    "thx": "thanks",
    "pls": "please",
    "xoxo": "hugs and kisses",
    "cya": "see you",
    "lmk": "let me know",
    "wyd": "what are you doing",
    "bday": "birthday",
    "hbd": "happy birthday",
    "sry": "sorry",
    "k": "okay",
    "jk": "just kidding",
    "ik": "i know",
    "tmi": "too much information",
    "b4": "before",
    "gr8": "great",
    "bf": "boyfriend",
    "tl": "timeline",
    "lmfao": "laughing my ass off",
    # Contractions, with and without the apostrophe
    "its": "it is",
    "i've": "i have",
    "ive": "i have",
    "i'll": "i will",
    "ill": "i will",
    "i'd": "i would",
    "id": "i would",
    "didnt": "did not",
    "dont": "do not",
    "doesnt": "does not",
}

def clean_tweet(tweet, score_cutoff=93):
    """Normalise a raw tweet into lowercase, punctuation-free text.

    Steps, in order: lowercase the text; drop every character that is not an
    ASCII letter, digit or whitespace; collapse any letter repeated three or
    more times in a row to a single letter; replace words that match a key of
    ``informal_mapping`` with that key's expansion; join the words with
    single spaces.

    Args:
        tweet: Raw tweet text. Any non-string value (for example the float
            NaN that pandas uses for a missing cell) is treated as an empty
            tweet instead of raising on ``.lower()``.
        score_cutoff: Minimum rapidfuzz score a fuzzy match must reach before
            the word is replaced. Defaults to 93.

    Returns:
        The cleaned tweet as a string; ``''`` when nothing is left.
    """
    if not isinstance(tweet, str):
        return ''

    tweet = tweet.lower()

    # Remove special characters (the text is already lowercase, so a-z suffices)
    tweet = re.sub(r'[^a-z0-9\s]', '', tweet)

    # Collapse elongated letters ("soooo" -> "so"). Only letters are matched:
    # \w would also match digits and turn a number such as "1000" into "10".
    # Repeated punctuation needs no separate pass because all punctuation was
    # removed above, and repeated whitespace is absorbed by split() below.
    tweet = re.sub(r'([a-z])\1{2,}', r'\1', tweet)

    words = tweet.split()

    for i, word in enumerate(words):
        # An exact key match is the best possible fuzzy score, so a dict
        # lookup gives the same answer without scoring the word against
        # every key.
        expansion = informal_mapping.get(word)
        if expansion is None:
            # With score_cutoff set, extractOne returns None unless some key
            # scores at least that high; otherwise (key, score, index).
            matched = process.extractOne(
                word, informal_mapping.keys(), score_cutoff=score_cutoff
            )
            if matched:
                expansion = informal_mapping[matched[0]]
        if expansion is not None:
            words[i] = expansion

    return ' '.join(words)

def process_tweets(tweets):
    """Clean every tweet in ``tweets`` using a pool of worker processes.

    Each element of ``tweets`` is handed to ``clean_tweet`` through
    ``Pool.map``; the cleaned strings are returned, in input order, as a new
    DataFrame with a single ``tweet`` column.
    """
    with Pool() as workers:
        # Pool.map blocks until every tweet has been cleaned, so the result is
        # complete before the pool is shut down on leaving the with-block.
        return pd.DataFrame({'tweet': workers.map(clean_tweet, tweets)})

In [36]:
# Raw tweets at index labels 1000 through 1010 (.loc slices include both ends)
all_data.loc[1000:1010, "tweet"].tolist()

['"my one (1) feeling"',
 '"anyways Im being actively ignored its its really hurting my feeling"',
 '"Ive decided that Im oversharing on my main acc and its unacceptable so Ill be making the attempt to move my sadness here"',
 '"look at u  doing that HTTPURL"',
 '"anyway I\'ve changed my mind"',
 '"moving on then"',
 '"I\'m too childish for a relationship I have just come to this conclusion"',
 '"I have not been sad in a while!!! feels weird but good!!"',
 '"It\'s too early!!!!"',
 '"I skipped school yesterday bc I didn\'t feel well and needed some sleep and I didn\'t do my assignments 😍"',
 '"henlo"']

In [37]:
# Run the cleaning pipeline on the tweets at index labels 1000 through 1010
process_tweets(all_data.loc[1000:1010, "tweet"])

Unnamed: 0,tweet
0,my one 1 feeling
1,anyways i am being actively ignored it is it i...
2,i have decided that i am oversharing on my mai...
3,look at you doing that httpurl
4,anyway i have changed my mind
5,moving on then
6,i am too childish for a relationship i have ju...
7,i have not been sad in a while feels weird but...
8,it is too early
9,i skipped school yesterday because i did not f...
