# Applied Machine Learning - Assignment 2
##### Submitted by 
- Anusha R
- MDS202212
- anushar@cmi.ac.in

In [34]:
# Importing necessary libraries

import pandas as pd
import re
import numpy as np
from tabulate import tabulate

import os
import warnings


from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [30]:
# Suppress every Python warning for the rest of the kernel session. This keeps cell
# output compact, but it also hides any deprecation warnings raised by the imported libraries.
warnings.filterwarnings("ignore")

In [4]:
# Working directory of the kernel; the dataset path used when loading the data is built from it.
current_directory = os.getcwd()

In [5]:
# load the data from a given file path

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : path or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(file_path)
    return frame

In [6]:
# preprocess the data

def preprocess_text(text):
    """Lower-case ``text`` and replace every non-letter character with a space.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None``/``NaN``/``pd.NA`` (what pandas produces for an empty
        CSV cell) is treated as an empty string instead of raising
        ``AttributeError``; any other non-string value is converted with
        ``str()`` before cleaning.

    Returns
    -------
    str
        The lower-cased text in which each character outside ``a-z`` has been
        replaced by one space. Runs of spaces are not collapsed.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); map them to an empty document.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text_cleaned = re.sub(r'[^a-zA-Z]', ' ', text)
    return text_cleaned

def preprocess_data(df):
    """Return a copy of ``df`` whose ``'text'`` column has been cleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``; each ``'text'`` value has
        been passed through ``preprocess_text``. ``df`` itself is left
        untouched, so the cell that calls this can be re-run safely and the
        raw data stays available.
    """
    # Work on a copy: assigning into the caller's frame would silently
    # overwrite the raw text the caller still holds.
    cleaned = df.copy()
    cleaned['text'] = cleaned['text'].apply(preprocess_text)
    return cleaned

In [20]:
# split the data into train/validation/test 

def split_data(df, test_size=0.2, validation_size=0.25, random_state=42, stratify_col=None):
    """Split ``df`` into train, validation and test frames.

    The split is done in two steps: the test set is carved out of ``df``
    first, then the validation set is carved out of what remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    test_size : float, default 0.2
        Fraction of ``df`` held out as the test set.
    validation_size : float, default 0.25
        Fraction of the rows remaining after the test split that become the
        validation set. With the defaults this is 0.25 * 0.8 = 20% of ``df``,
        i.e. a 60/20/20 train/validation/test split.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.
    stratify_col : str or None, default None
        Name of a column whose class proportions should be preserved in every
        split. ``None`` keeps the plain random split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validation, test)``.
    """
    def _labels(frame):
        # Labels to stratify on, or None for an unstratified split.
        return None if stratify_col is None else frame[stratify_col]

    remainder, test = train_test_split(df, test_size=test_size,
                                       random_state=random_state,
                                       stratify=_labels(df))

    train, validation = train_test_split(remainder, test_size=validation_size,
                                         random_state=random_state,
                                         stratify=_labels(remainder))

    return train, validation, test

In [8]:
# store the splits at train.csv/validation.csv/test.csv

def save_data(train, validation, test, train_path='train.csv', validation_path='validation.csv', test_path='test.csv'):
    """Write the three splits to CSV files without the index column.

    Parameters
    ----------
    train, validation, test : pandas.DataFrame
        Frames to persist.
    train_path, validation_path, test_path : str
        Destination of the corresponding frame; existing files are overwritten.
    """
    destinations = (
        (train, train_path),
        (validation, validation_path),
        (test, test_path),
    )
    for frame, path in destinations:
        frame.to_csv(path, index=False)

# Preprocess the Data


In [9]:
# Load the data

# Pass each path component separately so os.path.join inserts the platform's
# own separator instead of relying on a hard-coded '/'.
file_path = os.path.join(current_directory, 'Dataset', 'emails.csv')
data = load_data(file_path)

In [10]:
# Clean the 'text' column of the loaded emails.
preprocessed_data = preprocess_data(data)

# Split the cleaned data into train/validation/test using split_data's defaults.
train_data, validation_data, test_data = split_data(preprocessed_data)

In [11]:
# Write the three splits to save_data's default paths: train.csv, validation.csv, test.csv.
save_data(train_data, validation_data, test_data) # Save data

## Initializing DVC and Git

In [12]:
!dvc init --no-scm --f
!git init

Initialized DVC repository.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>
Initialized empty Git repository in D:/CMI DS/Sem 4/AML/Assignment2/.git/


In [13]:
# Track data files with DVC

!git add "D:\CMI DS\Sem 4\AML\Assignment2\Dataset\emails.csv" train.csv validation.csv test.csv  



In [14]:
# First commit: the raw emails plus the splits produced with split_data's default random_state (42).
!git commit -m "Added raw data, train, test and validation data after splitting"

[master (root-commit) c3b587f] Added raw data, train, test and validation data after splitting
 4 files changed, 11460 insertions(+)
 create mode 100644 Dataset/emails.csv
 create mode 100644 test.csv
 create mode 100644 train.csv
 create mode 100644 validation.csv


In [15]:
# Show what is still untracked or modified after the commit.
!git status

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/
	.dvcignore
	.ipynb_checkpoints/
	prepare.ipynb
	train.ipynb

nothing added to commit but untracked files present (use "git add" to track)


## Split the data again with a different random seed

In [21]:
# Same split proportions as before; only the shuffling seed changes (42 -> 1).
train_data, validation_data, test_data = split_data(preprocessed_data, random_state=1) # Split the data as train/validation/test

In [22]:
# Overwrite train.csv / validation.csv / test.csv with the new split.
save_data(train_data, validation_data, test_data) # Save data

In [23]:
# The three split files should now be reported as modified relative to the first commit.
!git status

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   test.csv
	modified:   train.csv
	modified:   validation.csv

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/
	.dvcignore
	.ipynb_checkpoints/
	prepare.ipynb
	train.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


In [24]:
!git add "D:\CMI DS\Sem 4\AML\Assignment2\Dataset\emails.csv" train.csv validation.csv test.csv  

In [25]:
# Second commit: the splits produced with random_state=1.
!git commit -m "Updated dataset for random state 1"

[master fe49dbf] Updated dataset for random state 1
 3 files changed, 5596 insertions(+), 5596 deletions(-)


In [26]:
# List the commits; the checkout cells further down move between these two versions.
!git log --oneline --all

fe49dbf Updated dataset for random state 1
c3b587f Added raw data, train, test and validation data after splitting


## Distribution of the target variable for the initial split

In [31]:
!git checkout c3b587f

HEAD is now at c3b587f Added raw data, train, test and validation data after splitting


In [32]:
# Reload the three split files as they exist in the currently checked-out commit.
train_data, validation_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [35]:
# Count the rows with spam == 0 and spam == 1 in each split.
# The table rows get their own name so the `data` DataFrame loaded earlier is not
# replaced by a list, and the vectorised Series.sum() replaces the builtin sum().
splits = {
    "Training data": train_data,
    "Validation data": validation_data,
    "Test data": test_data,
}
class_counts = [
    [label, (frame['spam'] == 0).sum(), (frame['spam'] == 1).sum()]
    for label, frame in splits.items()
]

print(tabulate(class_counts, headers=["Dataset", "0s", "1s"], tablefmt="grid"))

+-----------------+------+------+
| Dataset         |   0s |   1s |
| Training data   | 2616 |  820 |
+-----------------+------+------+
| Validation data |  872 |  274 |
+-----------------+------+------+
| Test data       |  872 |  274 |
+-----------------+------+------+


In [36]:
!git checkout fe49dbf 

Previous HEAD position was c3b587f Added raw data, train, test and validation data after splitting
HEAD is now at fe49dbf Updated dataset for random state 1


In [37]:
# Reload the three split files as they exist in the currently checked-out commit.
train_data, validation_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [38]:
# Count the rows with spam == 0 and spam == 1 in each split.
# The table rows get their own name so the `data` DataFrame loaded earlier is not
# replaced by a list, and the vectorised Series.sum() replaces the builtin sum().
splits = {
    "Training data": train_data,
    "Validation data": validation_data,
    "Test data": test_data,
}
class_counts = [
    [label, (frame['spam'] == 0).sum(), (frame['spam'] == 1).sum()]
    for label, frame in splits.items()
]

print(tabulate(class_counts, headers=["Dataset", "0s", "1s"], tablefmt="grid"))

+-----------------+------+------+
| Dataset         |   0s |   1s |
| Training data   | 2624 |  812 |
+-----------------+------+------+
| Validation data |  860 |  286 |
+-----------------+------+------+
| Test data       |  876 |  270 |
+-----------------+------+------+
