In [52]:
#  -- PART 0: Import Relevant Modules 
import pandas as pd 
import string
 
from wordsegment import load, segment 

#from nltk.tokenize import sent_tokenize
#from nltk.tokenize import word_tokenize
#from nltk.corpus import stopwords


In [None]:
# -- PART 1: Read Files
# Load the train and test splits from the CSVs in the working directory.
training_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_preprocessed.csv", "test_preprocessed.csv")
)

# Row counts of each split, as a quick sanity check on the load.
print(training_df.shape[0], test_df.shape[0])

Unnamed: 0,Topic,ID,Segment,IdeaUnit,label,NoteText
0,ComputerScience,6260226,1,Declarative knowledge is a factual statement.,1,BASICS OF COMPUTER SCIENCE Declarative Knowled...
1,ComputerScience,6260226,1,Imperative knowledge is solving a problem or a...,1,BASICS OF COMPUTER SCIENCE Declarative Knowled...
2,ComputerScience,6260226,1,Algorithms are instructions with steps to comp...,1,BASICS OF COMPUTER SCIENCE Declarative Knowled...


In [8]:
# Preview the first three rows of the raw test split.
test_df.head(n=3)

Unnamed: 0,Experiment,Topic,ID,Segment,IdeaUnit,label,NoteText
0,2,ComputerScience,6260230,1,Declarative knowledge is a factual statement.,1.0,Declarative- factual statementsdeclarative say...
1,2,ComputerScience,6260230,1,Imperative knowledge is solving a problem or a...,1.0,Declarative- factual statementsdeclarative say...
2,2,ComputerScience,6260230,1,Algorithms are instructions with steps to comp...,0.0,Declarative- factual statementsdeclarative say...


In [56]:
# -- PART 2: Pre-processing and other requirements

# Load wordsegment so that segment() is ready to use.
load()

def get_cleaned_df(df, experiment_exists="No"):
    """Return a pre-processed copy of ``df``; the input frame is not modified.

    Cleaning steps, in order:
      (a) drop rows whose ``label`` is missing;
      (b) cast ``label`` to ``int`` (the test data carries it as a float,
          presumably because of the missing labels removed in (a));
      (c) lowercase ``IdeaUnit`` and ``NoteText``;
      (d) strip every character in ``string.punctuation`` from both columns;
      (e) word segmentation of merged words -- currently disabled, see below;
      (f) collapse runs of whitespace to a single space and trim the ends;
      (g) optionally drop the ``Experiment`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``label``, ``IdeaUnit`` and ``NoteText``.
    experiment_exists : str or bool, default "No"
        Pass ``"Yes"`` (or ``True``) to drop the ``Experiment`` column from
        the result. Any other value leaves the columns untouched. A missing
        ``Experiment`` column is tolerated, so the function can safely be
        re-applied to an already-cleaned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned rows.
    """
    # (a) + (b): filter first, then copy, so the frame we assign into below
    # is an independent object (copying *before* dropna would be wasted work
    # and would not protect the filtered result).
    cleaned_df = df.dropna(subset=["label"]).copy()
    cleaned_df["label"] = cleaned_df["label"].astype(int)

    # Deletion table removing all ASCII punctuation characters.
    punctuation_table = str.maketrans("", "", string.punctuation)

    for text_column in ("IdeaUnit", "NoteText"):
        cleaned_df[text_column] = (
            cleaned_df[text_column]
            .str.lower()                       # (c)
            .str.translate(punctuation_table)  # (d)
            # (e) disabled: splitting merged words ("thisis" -> "this is") via
            #     .apply(lambda x: " ".join(segment(x))) would go here, but it
            #     is the runtime bottleneck (20+ minutes on the test data).
            .str.split()                       # (f) split on any whitespace...
            .str.join(" ")                     # ...and re-join with one space
        )

    # (g) drop the Experiment column when the caller asks for it.
    drop_experiment = experiment_exists is True or experiment_exists == "Yes"
    if drop_experiment:
        cleaned_df = cleaned_df.drop(columns=["Experiment"], errors="ignore")

    return cleaned_df


In [57]:
# [TK - step (e), segment, is the bottleneck: this takes ~1 minute with it enabled]
get_cleaned_df(df=training_df).head(n=3)

Unnamed: 0,Topic,ID,Segment,IdeaUnit,label,NoteText
0,ComputerScience,6260226,1,declarative knowledge is a factual statement,1,basics of computer science declarative knowled...
1,ComputerScience,6260226,1,imperative knowledge is solving a problem or a...,1,basics of computer science declarative knowled...
2,ComputerScience,6260226,1,algorithms are instructions with steps to comp...,1,basics of computer science declarative knowled...


In [58]:
# [TK - step (e), segment, is the bottleneck: this takes 20+ minutes with it
#  enabled and the run had to be stopped]
get_cleaned_df(df=test_df, experiment_exists="Yes").head(n=3)

Unnamed: 0,Topic,ID,Segment,IdeaUnit,label,NoteText
0,ComputerScience,6260230,1,declarative knowledge is a factual statement,1,declarative factual statementsdeclarative says...
1,ComputerScience,6260230,1,imperative knowledge is solving a problem or a...,1,declarative factual statementsdeclarative says...
2,ComputerScience,6260230,1,algorithms are instructions with steps to comp...,0,declarative factual statementsdeclarative says...


In [None]:
# export cleaned files to csv
# The cleaned frames must be written here, not the raw inputs: get_cleaned_df
# returns a new frame and leaves training_df / test_df unmodified, so exporting
# those directly would save the uncleaned data under a "cleaned" filename.
get_cleaned_df(training_df).to_csv("cleaned_training_data.csv", index=False)
get_cleaned_df(test_df, "Yes").to_csv("cleaned_test_data.csv", index=False)