# 0. Imports

In [2]:
# load the appropriate pip env
! source ~/Desktop/Coding/environments/pip_envs/base_env/bin/activate

In [12]:
# install dvc
! pip install dvc



In [14]:
# initialise git and dvc
! git init
! dvc init

Initialized empty Git repository in /Users/sampadk04/Desktop/Coding/Active_Projects/CMI_Coding_Assignments/Applied_ML_Assignments/assign2_sms_spam_detection/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mht

### Storing Data Remotely in Google Drive using `dvc`

We configure dvc to store the data remotely at google drive folder. Here is the [link](https://drive.google.com/drive/folders/1w3-1WhdOSNtvwucAqwmLAG4ZGegI2Y89?usp=sharing) to this folder. The folder id of this gdrive folder is:

`1w3-1WhdOSNtvwucAqwmLAG4ZGegI2Y89`

In [53]:
# for google drive support
! pip install 'dvc[gdrive]'

Collecting dvc-gdrive==2.19.1
  Downloading dvc_gdrive-2.19.1-py3-none-any.whl (11 kB)
Collecting pydrive2[fsspec]>=1.15.0
  Downloading PyDrive2-1.15.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting oauth2client>=4.0.0
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyOpenSSL>=19.1.0
  Downloading pyOpenSSL-23.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.3/57.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting google-api-python-client>=1.12.5
  Downloading google_api_python_client-2.79.0-py2.py3-none-any.whl (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01

In [48]:
! dvc remote add -d storage gdrive://1w3-1WhdOSNtvwucAqwmLAG4ZGegI2Y89
! git add .dvc/config
! git commit -m "Configure remote storage to google drive"

Setting 'storage' as a default remote.
[0mOn branch main
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.gitignore[m
	[31mdatabase/[m
	[31mprepare.ipynb[m
	[31mtrain.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)


In [54]:
# push changes to config files
! dvc push

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

Authentication successful.
  0% Transferring|                                   |0/3 [00:00<?,     ?file/s]
![A
  0%|          |/Users/sampadk04/Desktop/Coding/Act0.00/? [00:00<?,        ?B/s][A
  0%|          |/Users/sampadk04/Desktop/Coding0.00/6.00k [00:00<?,        ?B/s][A
 33% Transferring|██████████▎                    |1/3 [00:03<00:06,  3.28s/file][A
                                                                                [A
![A
  0%|          |/Users/sampadk04/Desktop/Coding/Act0.00/? [00:00<?,        ?B/s][A
  0%|          |/Users/sampadk04/Desktop/Coding0.00/5.73k [00:00<?,        

In [4]:
import os

# for data handling
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# for data splitting
from sklearn.model_selection import train_test_split

# for data pre-processing
from sklearn.feature_extraction.text import TfidfVectorizer

# for saving train, valid, test data
from scipy.sparse import save_npz

# 1. Loading the data

In [5]:
def load_data(file_path):
    df = pd.read_csv(file_path, delimiter='\t', header=None, names=['label', 'text'])
    # convert labels to binary int 0/1
    df['label'] = df['label'].map({'ham':0, 'spam':1})
    return df

In [15]:
# check data folder
! ls data

[34mraw[m[m       test.csv  train.csv val.csv


In [18]:
# start tracking the raw data folder using dvc

! dvc add data/raw
! git add data/raw.dvc

! git add data/.gitignore

! git commit -m "started tracking raw data"

[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
![A
Building data objects from data/raw                   |0.00 [00:00,      ?obj/s][A
                                                                                [A
![A
Building data objects from data/raw                   |0.00 [00:00,      ?obj/s][A
                                                                                [A
![A
  0% Checking cache in '/Users/sampadk04/Desktop/Coding/Active_Projects/CMI_Codi[A
                                                                                [A
![A
Building data objects from data/raw                   |0.00 [00:00,      ?obj/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 50.19file/s][A

To track the changes with git, run:

	git add data/raw.dvc

To enable auto staging, run:

	dvc config core.autostage tru

In [6]:
data_path = os.path.join('data', 'raw', 'smsspamcollection', 'SMSSpamCollection')
print(data_path)

data/raw/smsspamcollection/SMSSpamCollection


In [7]:
data = load_data(data_path)

In [8]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
data.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4825,4516,"Sorry, I'll call later",30
1,747,653,Please call our customer service representativ...,4


# 2. Pre-processing

In [20]:
def preprocess_data(df):
    # convert a collection of raw documents to a matrix of TF-IDF features
    
    # extract features and labels
    features = df['text'].copy()
    labels = df['label'].copy()

    # initialize the vectorizer
    TfVectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

    # transform the features
    features = TfVectorizer.fit_transform(features)
    
    # convert from scipy sparse matrix to pandas dataframe
    features = pd.DataFrame.sparse.from_spmatrix(features)

    return features, labels

In [21]:
features, labels = preprocess_data(data)

print("Features Shape:", features.shape)
print("Labels Shape:", labels.shape)

Features Shape: (5572, 8444)
Labels Shape: (5572,)


# 3. Splitting Data

In [22]:
def train_val_test_split(features, labels, random_state=None):
    # splitting into train, val, test
    
    # split into test and non-test
    X_non_test, X_test, y_non_test, y_test = train_test_split(features, labels, test_size=0.15, random_state=random_state)

    # split into train and val
    X_train, X_val, y_train, y_val = train_test_split(X_non_test, y_non_test, test_size=0.2, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

# 4. Saving Data

In [24]:
data_train_save_path = os.path.join('data','train.csv')
data_val_save_path = os.path.join('data','val.csv')
data_test_save_path = os.path.join('data','test.csv')

train_val_test_save_paths = [data_train_save_path, data_val_save_path, data_test_save_path]

train_val_test_save_paths

['data/train.csv', 'data/val.csv', 'data/test.csv']

In [25]:
def save_train_val_test_data(features, labels, train_val_test_save_paths, random_state):
    # extract train, test, val
    X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(features, labels, random_state=random_state)
    
    # save train, val, test data as .csv files
    train_data = pd.concat([X_train, y_train], axis=1)
    val_data = pd.concat([X_val, y_val], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    # save as .csv files to the savepaths
    train_data.to_csv(train_val_test_save_paths[0], index=False)
    val_data.to_csv(train_val_test_save_paths[1], index=False)
    test_data.to_csv(train_val_test_save_paths[2], index=False)

    print("Train, Val, Test data saved to:\n", train_val_test_save_paths)
    
    return None

## 4.1 Tracking changes with `random_state=42`

In [26]:
save_train_val_test_data(features, labels, train_val_test_save_paths, random_state=42)

  train_data.to_csv(train_val_test_save_paths[0], index=False)
  val_data.to_csv(train_val_test_save_paths[1], index=False)
  test_data.to_csv(train_val_test_save_paths[2], index=False)


Train, Val, Test data saved to:
 ['data/train.csv', 'data/val.csv', 'data/test.csv']


In [41]:
# check the data_profile before changing the seeds

random_state = 42

X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(features, labels, random_state=random_state)

print("Data Profile for Random State =", random_state)
print("")

print('-'*40)
print("Training Data Details:")
print("Training Data Shape:", X_train.shape)
print(y_train.value_counts())

print('-'*40)
print("Validation Data Details:")
print("Validation Data Shape:", X_val.shape)
print(y_val.value_counts())

print('-'*40)
print("Test Data Details:")
print("Test Data Shape:", X_test.shape)
print(y_test.value_counts())


Data Profile for Random State = 42

----------------------------------------
Training Data Details:
Training Data Shape: (3788, 8444)
0    3265
1     523
Name: label, dtype: int64
----------------------------------------
Validation Data Details:
Validation Data Shape: (948, 8444)
0    831
1    117
Name: label, dtype: int64
----------------------------------------
Test Data Details:
Test Data Shape: (836, 8444)
0    729
1    107
Name: label, dtype: int64


In [49]:
# track the data on dvc

# track train, val, test .csv files
! dvc add data/train.csv
! dvc add data/val.csv
! dvc add data/test.csv

! git add data/train.csv.dvc
! git add data/val.csv.dvc
! git add data/test.csv.dvc

! git add data/.gitignore

! git commit -m "Started tracking .csv files"

[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
![A
  0% Checking cache in '/Users/sampadk04/Desktop/Coding/Active_Projects/CMI_Codi[A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 53.61file/s][A

To track the changes with git, run:

	git add data/train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[?25l[32m⠋[0m Checking graph                                                 
Adding...                                                                       
![A
  0% Checking cache in '/Users/sampadk04/Desktop/Coding/Active_Projects/CMI_Codi[A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 68.87file/s][A

To track the changes with git, run:

	git add data/val.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[?25l                                           

## 4.2 Tracking changes with `random_state=420`

In [55]:
save_train_val_test_data(features, labels, train_val_test_save_paths, random_state=420)

  train_data.to_csv(train_val_test_save_paths[0], index=False)
  val_data.to_csv(train_val_test_save_paths[1], index=False)
  test_data.to_csv(train_val_test_save_paths[2], index=False)


Train, Val, Test data saved to:
 ['data/train.csv', 'data/val.csv', 'data/test.csv']


In [56]:
# check the data_profile before changing the seeds

random_state = 420

X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(features, labels, random_state=random_state)

print("Data Profile for Random State =", random_state)
print("")

print('-'*40)
print("Training Data Details:")
print("Training Data Shape:", X_train.shape)
print(y_train.value_counts())

print('-'*40)
print("Validation Data Details:")
print("Validation Data Shape:", X_val.shape)
print(y_val.value_counts())

print('-'*40)
print("Test Data Details:")
print("Test Data Shape:", X_test.shape)
print(y_test.value_counts())


Data Profile for Random State = 420

----------------------------------------
Training Data Details:
Training Data Shape: (3788, 8444)
0    3286
1     502
Name: label, dtype: int64
----------------------------------------
Validation Data Details:
Validation Data Shape: (948, 8444)
0    818
1    130
Name: label, dtype: int64
----------------------------------------
Test Data Details:
Test Data Shape: (836, 8444)
0    721
1    115
Name: label, dtype: int64


In [57]:
# track the data on dvc

# track train, val, test .csv files
! dvc add data/train.csv
! dvc add data/val.csv
! dvc add data/test.csv

! git add data/train.csv.dvc
! git add data/val.csv.dvc
! git add data/test.csv.dvc

! git add data/.gitignore

! git commit -m "Started tracking .csv files"

[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
![A
  0% Checking cache in '/Users/sampadk04/Desktop/Coding/Active_Projects/CMI_Codi[A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  4.68file/s][A

To track the changes with git, run:

	git add data/train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[?25l[32m⠋[0m Checking graph                                                 
Adding...                                                                       
![A
  0% Checking cache in '/Users/sampadk04/Desktop/Coding/Active_Projects/CMI_Codi[A
   

In [58]:
! dvc push

  0% Transferring|                                   |0/3 [00:00<?,     ?file/s]
![A
  0%|          |/Users/sampadk04/Desktop/Coding/Act0.00/? [00:00<?,        ?B/s][A
  0%|          |/Users/sampadk04/Desktop/Coding0.00/61.5M [00:00<?,        ?B/s][A
  0%|          |/Users/sampadk04/Desktop8.00k/61.5M [00:01<4:22:08,    4.10kB/s][A
  1%|          |/Users/sampadk04/Desktop/Co440k/61.5M [00:02<04:22,     244kB/s][A
  1%|          |/Users/sampadk04/Desktop/Co584k/61.5M [00:02<03:12,     333kB/s][A
  1%|          |/Users/sampadk04/Desktop/Co768k/61.5M [00:02<02:13,     477kB/s][A
  2%|▏         |/Users/sampadk04/Desktop/Co952k/61.5M [00:02<01:39,     642kB/s][A
  2%|▏         |/Users/sampadk04/Desktop/C1.12M/61.5M [00:02<01:16,     831kB/s][A
  2%|▏         |/Users/sampadk04/Desktop/C1.30M/61.5M [00:02<01:01,    1.02MB/s][A
  2%|▏         |/Users/sampadk04/Desktop/C1.48M/61.5M [00:03<00:53,    1.18MB/s][A
  3%|▎         |/Users/sampadk04/Desktop/C1.67M/61.5M [00:03<00:46,    1.3

In [59]:
! git status

On branch main
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.gitignore[m
	[31mdatabase/[m
	[31mprepare.ipynb[m
	[31mtrain.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)


In [61]:
! git add .gitignore
! git add database/
! git add prepare.ipynb
! git add train.ipynb

! git commit -m "Added Python Notebooks"

[main 4d19257] Added Python Notebooks
 4 files changed, 4056 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 database/mlflow.db
 create mode 100644 prepare.ipynb
 create mode 100644 train.ipynb
