In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK data packages used by the preprocessing step further down.
nltk.download('punkt')      # tokenizer data; presumably what word_tokenize needs -- confirm
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('wordnet')    # presumably the lexical database behind WordNetLemmatizer -- confirm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rickc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rickc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rickc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame.

    The file is parsed with pandas' default ``read_csv`` options.
    """
    frame = pd.read_csv(file_path)
    return frame

In [4]:
def _clean_text(text, stop_words, lemmatizer):
    """Normalise a single document and return it as a space-joined string of tokens."""
    # Missing cells (None/NaN) become an empty document; any other non-string
    # value is stringified so the text operations below cannot fail on it.
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)            # HTML tags
    text = re.sub(r'http[s]?://\S+', '', text)   # URLs
    text = re.sub(r'\S*@\S*\s?', '', text)       # email addresses
    # Runs after lowercasing, so only a-z, digits and whitespace survive.
    text = re.sub(r'[^a-z0-9\s]', '', text)

    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(tokens)


def preprocess_data(data):
    """Clean the ``text`` column of ``data`` and return the result as a new DataFrame.

    Each document is lowercased, stripped of HTML tags, URLs, email addresses
    and every character that is not ``a-z``, ``0-9`` or whitespace, then
    tokenized, filtered against the English stopword list, lemmatized and
    joined back into one space-separated string.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column. Missing values are treated as empty
        documents and non-string values are converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned ``text`` column; all other columns are
        carried over unchanged. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``data`` has no ``text`` column.
    """
    # Build these once rather than per row.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned = data['text'].apply(
        lambda document: _clean_text(document, stop_words, lemmatizer)
    )
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    return data.assign(text=cleaned)

In [5]:

def split_data(data, test_size=0.2, val_size=0.25, random_state=42):
    """Split ``data`` into train, validation and test subsets.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        The dataset to split; anything ``train_test_split`` accepts.
    test_size : float, default 0.2
        Fraction of the *whole* dataset held out as the test set.
    val_size : float, default 0.25
        Fraction of the *remaining* (non-test) data used for validation.
        With the defaults this is 0.25 * 0.8 = 0.2 of the whole dataset,
        i.e. a 60/20/20 train/validation/test split.
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(train_data, validation_data, test_data)``.
    """
    # Carve off the test set first so `test_size` really is the test fraction.
    train_val_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    # Then take the validation set out of what is left, so `val_size`
    # sizes the validation set rather than the test set.
    train_data, validation_data = train_test_split(
        train_val_data, test_size=val_size, random_state=random_state
    )
    return train_data, validation_data, test_data

In [9]:
def store_splits(train_data, validation_data, test_data, dir_path=r'C:\CMI\Applied ML\ASS_1\data\dataset'):
    """Write the three splits to ``train.csv``, ``validation.csv`` and ``test.csv``.

    Parameters
    ----------
    train_data, validation_data, test_data : pandas.DataFrame
        The splits to persist; each is written without its index.
    dir_path : str
        Directory that receives the files. It is created (including parents)
        if it does not exist yet. The default is a raw string so the Windows
        backslashes are not parsed as escape sequences.
    """
    # An empty dir_path means "current directory"; makedirs('') would raise.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    splits = (
        ('train.csv', train_data),
        ('validation.csv', validation_data),
        ('test.csv', test_data),
    )
    for file_name, frame in splits:
        frame.to_csv(os.path.join(dir_path, file_name), index=False)


In [7]:
file_path = "C:\CMI\Applied ML\ASS_1\data\dataset\emails.csv"

In [10]:
if __name__ == "__main__":
    # Pipeline: read the CSV, clean its text column, split, then write the splits.
    data = preprocess_data(load_data(file_path))
    train_data, validation_data, test_data = split_data(data)
    store_splits(
        train_data=train_data,
        validation_data=validation_data,
        test_data=test_data,
    )
