# 0. Imports

In [1]:
import os

# for data handling
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# for data splitting
from sklearn.model_selection import train_test_split

# for data pre-processing
from sklearn.feature_extraction.text import TfidfVectorizer

# for saving train, valid, test data
from scipy.sparse import save_npz

# 1. Loading the data

In [2]:
def load_data(file_path):
    df = pd.read_csv(file_path, delimiter='\t', header=None, names=['label', 'text'])
    # convert labels to binary int 0/1
    df['label'] = df['label'].map({'ham':0, 'spam':1})
    return df

In [3]:
data_path = os.path.join('data', 'raw', 'smsspamcollection', 'SMSSpamCollection')
print(data_path)

data/raw/smsspamcollection/SMSSpamCollection


In [5]:
data = load_data(data_path)

In [5]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4825,4516,"Sorry, I'll call later",30
1,747,653,Please call our customer service representativ...,4


# 2. Pre-processing

In [7]:
def preprocess_data(df):
    # convert a collection of raw documents to a matrix of TF-IDF features
    
    # extract features and labels
    features = df['text'].copy()
    labels = df['label'].copy()

    # initialize the vectorizer
    TfVectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

    # transform the features
    features = TfVectorizer.fit_transform(features)
    
    # convert from scipy sparse matrix to pandas dataframe
    features = pd.DataFrame.sparse.from_spmatrix(features)

    return features, labels

In [8]:
features, labels = preprocess_data(data)

print("Features Shape:", features.shape)
print("Labels Shape:", labels.shape)

Features Shape: (5572, 8444)
Labels Shape: (5572,)


# 3. Splitting Data

In [9]:
def train_val_test_split(features, labels, random_state=None):
    # splitting into train, val, test
    
    # split into test and non-test
    X_non_test, X_test, y_non_test, y_test = train_test_split(features, labels, test_size=0.15, random_state=random_state)

    # split into train and val
    X_train, X_val, y_train, y_val = train_test_split(X_non_test, y_non_test, test_size=0.2, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [10]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(features, labels, random_state=42)

print("Training Data Shape:", X_train.shape)
print("Validation Data Shape:", X_val.shape)
print("Test Data Shape:", X_test.shape)

Training Data Shape: (3788, 8444)
Validation Data Shape: (948, 8444)
Test Data Shape: (836, 8444)


# 4. Saving Data

In [11]:
data_train_save_path = os.path.join('data','train.csv')
data_val_save_path = os.path.join('data','val.csv')
data_test_save_path = os.path.join('data','test.csv')

train_val_test_save_paths = [data_train_save_path, data_val_save_path, data_test_save_path]

train_val_test_save_paths

['data/train.csv', 'data/val.csv', 'data/test.csv']

In [12]:
def save_train_val_test_data(features, labels, train_val_test_save_paths, random_state):
    # extract train, test, val
    X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(features, labels, random_state=random_state)
    
    # save train, val, test data as .csv files
    train_data = pd.concat([X_train, y_train], axis=1)
    val_data = pd.concat([X_val, y_val], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    # save as .csv files to the savepaths
    train_data.to_csv(train_val_test_save_paths[0], index=False)
    val_data.to_csv(train_val_test_save_paths[1], index=False)
    test_data.to_csv(train_val_test_save_paths[2], index=False)

    print("Train, Val, Test data saved to:\n", train_val_test_save_paths)
    
    return None

In [13]:
save_train_val_test_data(features, labels, train_val_test_save_paths, random_state=42)

  train_data.to_csv(train_val_test_save_paths[0], index=False)
  val_data.to_csv(train_val_test_save_paths[1], index=False)
  test_data.to_csv(train_val_test_save_paths[2], index=False)


Train, Val, Test data saved to:
 ['data/train.csv', 'data/val.csv', 'data/test.csv']
