In [1]:
import os
import pandas as pd
import urllib.request
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive
drive.mount('/content/drive')

# Project layout: every dataset artifact is written under <project_path>/data
project_path = "/content/drive/MyDrive/dvc_project"
data_path = os.path.join(project_path, "data")
if not os.path.isdir(data_path):
    # Create the data directory (and any missing parents) on first run
    os.makedirs(data_path)

# Download the SMS Spam Collection dataset
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
dataset_path = os.path.join(data_path, "smsspamcollection.zip")

if not os.path.exists(dataset_path):
    print("Downloading dataset...")
    # Fetch into a sibling ".part" file and rename it only after the transfer
    # succeeds. Writing straight to dataset_path would leave a truncated archive
    # behind on an interrupted download, and the exists-check above would then
    # treat that broken file as a finished download on every later run.
    partial_path = dataset_path + ".part"
    try:
        urllib.request.urlretrieve(dataset_url, partial_path)
        os.replace(partial_path, dataset_path)
    finally:
        # After a successful rename the temp file is gone; after a failure
        # this removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete.")
else:
    print("Dataset already exists.")

# Unpack every member of the downloaded archive into the data directory
import zipfile

archive = zipfile.ZipFile(dataset_path, 'r')
try:
    archive.extractall(path=data_path)
finally:
    # Always release the file handle, even if extraction fails
    archive.close()

# Convert dataset to CSV format
import csv

txt_file_path = os.path.join(data_path, "SMSSpamCollection")
raw_data_path = os.path.join(data_path, "raw_data.csv")

if os.path.exists(txt_file_path):
    print("Converting to CSV format...")
    # The input is raw tab-separated text (label <TAB> message), not quoted CSV.
    # Messages can contain literal double quotes; with pandas' default quote
    # handling an unbalanced '"' makes the parser merge the following lines into
    # one record, silently dropping rows. QUOTE_NONE treats quotes as ordinary
    # characters so each input line becomes exactly one row.
    # NOTE(review): encoding='latin-1' never raises a decode error, but it would
    # mis-render non-ASCII characters if the file is actually UTF-8 — confirm.
    df = pd.read_csv(
        txt_file_path,
        sep='\t',
        names=["label", "text"],
        encoding='latin-1',
        quoting=csv.QUOTE_NONE,
    )

    # Convert labels to binary (spam = 1, ham = 0)
    df["label"] = df["label"].map({"ham": 0, "spam": 1})

    # Save as CSV (to_csv quotes fields as needed, so the result reads back
    # correctly with a plain pd.read_csv)
    df.to_csv(raw_data_path, index=False)
    print(f"Dataset saved as: {raw_data_path}")
else:
    print("SMSSpamCollection file not found!")


Mounted at /content/drive
Downloading dataset...
Download complete.
Converting to CSV format...
Dataset saved as: /content/drive/MyDrive/dvc_project/data/raw_data.csv


In [2]:
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv(raw_data_path)

# Train-validation-test split: 60% train, then the remaining 40% is halved
# into validation and test (20% each).
# Both splits are stratified on the label so that train, validation and test
# keep the same spam/ham proportion as the full dataset; an unstratified random
# split can skew the class ratio between splits when one class is rare.
train, temp = train_test_split(
    df, test_size=0.4, random_state=42, stratify=df["label"]
)
valid, test = train_test_split(
    temp, test_size=0.5, random_state=42, stratify=temp["label"]
)

# Every row must land in exactly one of the three splits
assert len(train) + len(valid) + len(test) == len(df)

# Save data splits
train.to_csv(os.path.join(data_path, "train.csv"), index=False)
valid.to_csv(os.path.join(data_path, "validation.csv"), index=False)
test.to_csv(os.path.join(data_path, "test.csv"), index=False)

print("Data split and saved.")

# Track with DVC
!pip install dvc[gdrive] -q
%cd /content/drive/MyDrive/dvc_project
!dvc init
!dvc add data/raw_data.csv data/train.csv data/validation.csv data/test.csv
!git add data/*.dvc .gitignore
!git commit -m "Tracked dataset versions with DVC"
!dvc push


Data split and saved.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.2/77.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.0/426.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.3/201.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m 

In [3]:
import os

# List the entries of the project's data directory
data_dir = "/content/drive/MyDrive/dvc_project/data"
data_files = os.listdir(data_dir)
print("Files in data folder:", data_files)


Files in data folder: ['raw_data.csv.dvc', 'raw_data.csv', 'smsspamcollection.zip', 'SMSSpamCollection', 'readme', 'train.csv', 'validation.csv', 'test.csv', 'train.csv.dvc', 'validation.csv.dvc', 'test.csv.dvc']
