In [5]:
import os
import re
import pandas as pd
import seaborn as sns
from pathlib import Path
from nltk.corpus import stopwords


# Constants & Configuration
METAPROJECT_NAME = 'TopicModelling_META'
SUBPROJECT_NAME = 'TopMod_pipeline'
DATASET_NAME = "SensoryTool_CombinedData.csv"
HIGH_SENSORY = True #set to False to get DeepListening experience (no stroboscope)
CONDITION = 'highsensory' if HIGH_SENSORY else 'deeplistening'

PROJDIR = os.path.expanduser(f"~/projects/{METAPROJECT_NAME}")
DATADIR = os.path.join(PROJDIR, f'DATA/{DATASET_NAME}')
CODEDIR = os.path.join(PROJDIR, f'{SUBPROJECT_NAME}')

print(f'Condition : "{CONDITION}"')

Condition : "highsensory"


In [6]:
df = pd.read_csv(DATADIR)
dataset = df[df['meta_HighSensory'] == HIGH_SENSORY]['reflection_answer']
reports = dataset[dataset.notna() & (dataset != '')].reset_index(drop=True)
reports = pd.DataFrame(reports)

print('N={} reports (HighSensory = {})'.format(len(reports), HIGH_SENSORY))
print(reports.head())
print(reports.info())
print(reports.describe())

N=336 reports (HighSensory = True)
                                   reflection_answer
0  Intense chaos. And then my mind checked out an...
1                                      good stuff.\n
2  a pattern of red and white lights thatflashed ...
3  i wentback to many hard and mostly beautiful m...
4                                              Hello
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 1 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   reflection_answer  336 non-null    object
dtypes: object(1)
memory usage: 2.8+ KB
None
       reflection_answer
count                336
unique               334
top              devtest
freq                   2


  df = pd.read_csv(DATADIR)


In [7]:
reports

Unnamed: 0,reflection_answer
0,Intense chaos. And then my mind checked out an...
1,good stuff.\n
2,a pattern of red and white lights thatflashed ...
3,i wentback to many hard and mostly beautiful m...
4,Hello
...,...
331,Changing temperature of my body with the light...
332,pleasure & intrigue
333,travelling through space\n
334,i thought alot about rands journey into rhudia...


In [9]:
# Required Libraries
import pandas as pd
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from spacy import load

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = load("en_core_web_sm")
nltk.download("stopwords")

# Handling typos
def correct_typos(text):
    blob = TextBlob(text)
    corrected_text = blob.correct() #automatically corrects the spelling of words in the text using a built-in method of TextBlob
    return str(corrected_text)

# Separating attached words
def separate_attached_words(text):
    return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", text)) #re.findall method uses a regex pattern to separate words based on capitalization rules.

# Removing single or 2 word eows (Assuming data is in a DataFrame)
def remove_single_word_rows(df, column_name,nwords=2):
    df = df[df[column_name].str.split().str.len() > nwords]
    return df

# Handling "/n" values
def handle_newlines(text):
    return text.replace("/n", " ")  # Replacing with a space, change " " to "\n" if you want actual newlines.

# Text Cleaning using NLTK and spaCy
def text_cleaning(text,stemming=False):
    # Tokenization using spaCy
    doc = nlp(text)
    tokens = [token.text for token in doc] #breaks the text into individual words or tokens

    # Removing Stop Words using NLTK
    # nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming using NLTK
    if stemming:
        stemmer = PorterStemmer() #PorterStemmer reduce wordfs to their base or root form (e.g., "running" -> "run")
        tokens = [stemmer.stem(word) for word in tokens]

    return " ".join(tokens)

# Full Cleaning Pipeline
def full_cleaning_pipeline(df, column_name,
                           correct_typos_flag=True,
                           sep_words_flag=True,
                           rmv_single_flag=True,
                           new_lines_flag=True,
                           text_clean_flag=True):

    if correct_typos_flag:
        df[column_name] = df[column_name].apply(correct_typos)
    if sep_words_flag:
        df[column_name] = df[column_name].apply(separate_attached_words)
    if rmv_single_flag:
        df = remove_single_word_rows(df, column_name)
    if new_lines_flag:
        df[column_name] = df[column_name].apply(handle_newlines)
    if text_clean_flag:
        df[column_name] = df[column_name].apply(text_cleaning)

    return df



[nltk_data] Downloading package stopwords to /Users/rb666/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Sanity check for functions

def correct_typos(text):
    blob = TextBlob(text)
    print(blob)
    corrected_text = blob.correct() #automatically corrects the spelling of words in the text using a built-in method of TextBlob
    return str(corrected_text)


print(correct_typos("conputer"))


# Separating attached words
def separate_attached_words(text):
    return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", text)) #re.findall method uses a regex pattern to separate words based on capitalization rules.

print(separate_attached_words("litteraturereview"))


conputer
computer
litteraturereview


In [12]:
df_clean = full_cleaning_pipeline(reports,'reflection_answer')


Intense chaos. And then my mind checked out and my subconscious took over and started talking. What I imagine it's like looking back on life before you die.
good stuff.

a pattern of red and white lights thatflashed andbecame more intensewhen the lights flashed intensely. i alsofelt sleep for abit and i was thinkingof my partner who i could visualise
i wentback to many hard and mostly beautiful memories without prompting them at all. 
Hello
Hope as a colour
dreaming while awake- flashes of random places i have been. maybe this is what its like to be dead.

Being: immersed; calm; and thrilled.
life after retired

i dreamt myself as a harbinger of the new planet, shaping the world with life and equity, i wish to belong there with someone i really loved and missed...
relaxation, calm and curiosity

Persolly it was hard for me to find peace with the intensity of the bright light.  My eyes wouldnt stop watering so that kept my mind occupied. Still a really cool experience!

was like taking 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].apply(handle_newlines)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].apply(text_cleaning)


Unnamed: 0,reflection_answer
0,Intense chaos And mind checked unconscious too...
2,pattern red white lights thatflashed andbecame...
3,wentback many hard mostly beautiful memories w...
5,Hope colour
6,dreaming awake flashes random places maybe lik...
...,...
330,collection keyhole could see paintingthrough a...
331,Changing temperature body lights Changes inten...
333,travelling space
334,thought clot hands journey rhudian experiencin...


Compare cleaned text with original text

In [13]:
df_clean

Unnamed: 0,reflection_answer
0,Intense chaos And mind checked unconscious too...
2,pattern red white lights thatflashed andbecame...
3,wentback many hard mostly beautiful memories w...
5,Hope colour
6,dreaming awake flashes random places maybe lik...
...,...
330,collection keyhole could see paintingthrough a...
331,Changing temperature body lights Changes inten...
333,travelling space
334,thought clot hands journey rhudian experiencin...


In [14]:
reports

Unnamed: 0,reflection_answer
0,Intense chaos And then my mind checked out and...
1,good stuff
2,a pattern of red and white lights thatflashed ...
3,i wentback to many hard and mostly beautiful m...
4,Hello
...,...
331,Changing temperature of my body with the light...
332,pleasure intrigue
333,travelling through space
334,i thought clot about hands journey into rhudia...


In [16]:


# Assuming the dataframe is named df and the original file path is 'file_path'

base_name, ext = os.path.splitext(DATASET_NAME)
new_path = f"{base_name}_preprocessed{ext}"
print(new_path)

preproc_path = os.path.join(PROJDIR, f'DATA/preprocessed/{new_path}')
print(preproc_path)

# Save the cleaned DATASET column to the new CSV file
df_clean.to_csv(preproc_path, index=False)


SensoryTool_CombinedData_preprocessed.csv
/Users/rb666/projects/TopicModelling_META/DATA/preprocessed/SensoryTool_CombinedData_preprocessed.csv
