In [1]:
#Download the fraud dataset, preprocess it and save three files train_a.csv, train_b.csv and test.csv
#Train a model with train_a.csv, test it on test.csv and save the model and metrics
#Train a model with train_b.csv, test it on test.csv and save the model and metrics
#Switch back to the model created with train_a.csv, and verify that its metrics on test match the computed metrics

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('credit_filtered.csv')

In [4]:
df = df.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'], axis=1)

In [5]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,1,CASH_OUT,344464.4,C793293778,0.0,0.0,C766572210,1133312.56,0.0,0,0
3,1,CASH_OUT,57279.11,C1800649922,0.0,0.0,C824009085,127206.9,64106.18,0,0
4,1,CASH_OUT,71991.42,C990679445,0.0,0.0,C557041912,81682.58,557537.26,0,0


In [6]:
length = len(df)
split_size = length // 3
dfs = [df.iloc[i * split_size:(i + 1) * split_size] for i in range(3)]
for i, df_split in enumerate(dfs, start=1):
    df_split.to_csv(f'split_{i}.csv', index=False)

In [7]:
#Read split_1.csv, build a simple linear regression model and test on split_3.csv
data = pd.read_csv('split_1.csv')

In [8]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']),
        ('cat', OneHotEncoder(), ['type'])
    ])
preprocessor.fit(data.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))

In [9]:
model_input = preprocessor.transform(data.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))

In [10]:
target = data['isFraud']

In [11]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')
reg.fit(model_input, target)

In [12]:
test = pd.read_csv('split_3.csv')

In [13]:
test_input = preprocessor.transform(test.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))

In [14]:
test_target = test['isFraud']

In [15]:
test_predict = reg.predict(test_input)

In [16]:
from sklearn.metrics import accuracy_score
print('Accuracy with split_1 : ', accuracy_score(test_target, test_predict))

Accuracy with split_1 :  0.9177638587072308


In [17]:
import pickle 
with open('model.p', 'wb') as f:
    pickle.dump(reg, f)

In [18]:
#Commit the data and model to DVC
!git init
!dvc init
!dvc add split_1.csv

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /Users/prithvin/Documents/USFCA/MSDS626/DV/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m

In [19]:
!dvc add model.p split_3.csv

[?25l                                                                          [32m⠋[0m Checking graph
  0% Adding...|                                      |0/2 [00:00<?,     ?file/s]
![A
  0% Checking cache in '/Users/prithvin/Documents/USFCA/MSDS626/DV/.dvc/cache'| [A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/Users/prithvin/Documents/USFCA/MSDS626/DV/.dvc/cache'| [A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
100% Adding...|██████████████████████

In [20]:
!git add split_1.csv.dvc split_3.csv.dvc model.p.dvc .gitignore

In [21]:
!git commit -m "First model. Trained using split_1.csv"

[master (root-commit) 6147ea0] First model. Trained using split_1.csv
 Committer: Prithvi Nuthanakalva <prithvin@Prithvis-MacBook-Air.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 7 files changed, 21 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 model.p.dvc
 create mode 100644 split_1.csv.dvc
 create mode 100644 split_3.csv.dvc


In [22]:
!git tag -a "v1.0" -m "First model. Trained using split_1.csv"

In [23]:
#Repeat the same with the second model

In [24]:
data = pd.read_csv('split_2.csv')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']),
        ('cat', OneHotEncoder(), ['type'])
    ])
preprocessor.fit(data.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))
model_input = preprocessor.transform(data.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))
target = data['isFraud']
reg = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')
reg.fit(model_input, target)
test = pd.read_csv('split_3.csv')
test_input = preprocessor.transform(test.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))
test_target = test['isFraud']
test_predict = reg.predict(test_input)
print('Accuracy with split_2 : ', accuracy_score(test_target, test_predict))

Accuracy with split_2 :  0.9063871490050706


In [25]:
with open('model.p', 'wb') as f:
    pickle.dump(reg, f)

In [26]:
!dvc add split_2.csv model.p

[?25l                                                                          [32m⠋[0m Checking graph
  0% Adding...|                                      |0/2 [00:00<?,     ?file/s]
![A
  0% Checking cache in '/Users/prithvin/Documents/USFCA/MSDS626/DV/.dvc/cache'| [A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/Users/prithvin/Documents/USFCA/MSDS626/DV/.dvc/cache'| [A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
100% Adding...|██████████████████████

In [27]:
!git add split_2.csv.dvc model.p.dvc .gitignore

In [28]:
!git commit -m "Second model. Trained using split_2.csv"

[master fa853c0] Second model. Trained using split_2.csv
 Committer: Prithvi Nuthanakalva <prithvin@Prithvis-MacBook-Air.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 3 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 split_2.csv.dvc


In [29]:
!git checkout v1.0

Note: switching to 'v1.0'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 6147ea0 First model. Trained using split_1.csv


In [30]:
!dvc checkout

[33mM[0m       model.p                                                        
[31mD[0m       split_2.csv
[0m

In [31]:
with open('model.p', 'rb') as f:
    reg = pickle.load(f)

In [33]:
test = pd.read_csv('split_3.csv')
test_input = preprocessor.transform(test.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']))
test_target = test['isFraud']
test_predict = reg.predict(test_input)
print('Accuracy: ', accuracy_score(test_target, test_predict))

Accuracy:  0.9308023349950999
