In [1]:
from pathlib import Path

import pandas as pd

from src.preprocess import transform

# Preprocess bigdata2023classification datasets 

**Preprocessing Steps:**
* **HTML Tag Removal**: Strips HTML tags from the text.
* **URL Removal**: Cleans out URLs, removing both "http://" and "www" prefixed links to avoid their interference with text analysis.
* **Number Removal**: Deletes numerical values from the text, focusing the analysis on textual content.
* **Punctuation Removal**: Eliminates punctuation marks, leaving only words and whitespace, which helps in standardizing the text for tokenization.
* **Case Normalization**: Converts all text to lowercase to ensure consistency and to prevent the same words in different cases from being counted as distinct.
* **Stopwords Removal**: Filters out common English stopwords (e.g., "the", "is", "in"), which are generally considered irrelevant for many analysis tasks.
* **Lemmatization**: Transforms words into their base or dictionary form (lemmas), accounting for their part-of-speech. This step aims to consolidate different inflected forms of a word, treating them as a single item.
* **Removal of Single-Character Words**: Deletes words that consist of a single character, as they typically do not carry meaningful information for analysis.

Set the paths to the raw train and test datasets, and the paths where the preprocessed train and test datasets will be saved.

In [None]:
# Training split: raw input CSV and the pickle file its preprocessed
# DataFrame is written to.
PATH_TO_TRAIN_DATA = "bigdata2023classification/data/train.csv"
PATH_TO_PREPROCESSED_TRAIN_DATA = "bigdata2023classification/preprocessed_data/preprocessed_train_df.pkl"

# Test split: raw input CSV and the pickle file its preprocessed
# DataFrame is written to.
PATH_TO_TEST_DATA = "bigdata2023classification/data/test_without_labels.csv"
PATH_TO_PREPROCESSED_TEST_DATA = "bigdata2023classification/preprocessed_data/preprocessed_test_df.pkl"

In [3]:
# Load the raw training CSV and run `transform` (from src.preprocess) over the
# "Title" and "Content" text columns.
df = pd.read_csv(PATH_TO_TRAIN_DATA)
df = transform(df, ["Title", "Content"])

# DataFrame.to_pickle does not create missing directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails only after the
# preprocessing above has already been paid for.
Path(PATH_TO_PREPROCESSED_TRAIN_DATA).parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(PATH_TO_PREPROCESSED_TRAIN_DATA)

HTML clean done
Numbers clean done
URLs clean done
Punctation clean done
Uppercase clean done
Tokenize done
Stopwords clean done
Lemmatize done
Clean single char words done


In [4]:
# Load the raw (unlabelled) test CSV and apply the same `transform` call used
# for the training data, over the "Title" and "Content" text columns.
test_df = pd.read_csv(PATH_TO_TEST_DATA)
test_df = transform(test_df, ["Title", "Content"])

# DataFrame.to_pickle does not create missing directories, so make sure the
# output folder exists before writing the preprocessed frame.
Path(PATH_TO_PREPROCESSED_TEST_DATA).parent.mkdir(parents=True, exist_ok=True)
test_df.to_pickle(PATH_TO_PREPROCESSED_TEST_DATA)

HTML clean done
Numbers clean done
URLs clean done
Punctation clean done
Uppercase clean done


Tokenize done
Stopwords clean done
Lemmatize done
Clean single char words done
