In [1]:
from mysklearn import myclassifiers, myevaluation, mypytable, myutils
import importlib

In [2]:
importlib.reload(mypytable)
# first we are going to import the dataset into a mypytable object
mytable = mypytable.MyPyTable()
mytable.load_from_file("input_data/Fraud_chop.csv")

print(mytable.column_names)

print(mytable.data[0])

# we know from my datachoping notebook what each column is and does
# mytable.drop_cols(['step','type','nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud'])
mytable.drop_cols(['step','nameOrig', 'nameDest', 'isFlaggedFraud']) # we dont need step, nameOrig, nameDest, isFlaggedFraud
# this is because:
# step: this is just the time from the start of the data collection
# nameOrig: this is the name of the person who sent the money
# nameDest: this is the name of the person who received the money
# isFlaggedFraud: this is the classification results for the AI that the group that collected the data created

print(mytable.data[0])
print(mytable.column_names)

['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
[241.0, 'CASH_OUT', 325470.07, 'C570536992', 325470.07, 0.0, 'C437423112', 19771.15, 345241.22, 1.0, 0.0]
['CASH_OUT', 325470.07, 325470.07, 0.0, 19771.15, 345241.22, 1.0]
['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud']


In [3]:
# the values of type are strings, so we will convert them to ints to be able to be used in the classifiers
mytable.convert_col_to_int('type')

['TRANSFER', 'PAYMENT', 'DEBIT', 'CASH_OUT', 'CASH_IN']
['CASH_OUT', 'CASH_IN', 'TRANSFER', 'DEBIT', 'PAYMENT']


In [4]:
data = mytable.data
headers = mytable.column_names

# we also will make x and y
X = []
y = []
for row in data:
    X.append(row[0:len(row)-1])
    y.append(row[-1])

In [5]:
X_train_folds_indexes, X_test_folds_indexes = myevaluation.kfold_cross_validation(X,13)

X_test_folds,X_train_folds,y_test_folds,y_train_folds = myutils.indexes_to_fold(X_test_folds_indexes, X_train_folds_indexes, X, y)
X_test,X_train,y_test,y_train = myutils.folds_to_train_test(X_test_folds,X_train_folds,y_test_folds,y_train_folds)

In [6]:
dummy_clf = myclassifiers.MyDummyClassifier()
dummy_clf.fit(X_train, y_train)
dummy_Y_predicted = dummy_clf.predict(X_test)

dummy_accuracy = myevaluation.accuracy_score(y_test, dummy_Y_predicted)
dummy_BinaryF1 = myevaluation.binary_f1_score(y_test, dummy_Y_predicted)
dummy_Binary_precision = myevaluation.binary_precision_score(y_test, dummy_Y_predicted)
dummy_Binary_recall = myevaluation.binary_recall_score(y_test, dummy_Y_predicted)

print("Dummy accuracy:", dummy_accuracy)
print("Dummy Binary F1:", dummy_BinaryF1)
print("Dummy Binary precision:", dummy_Binary_precision)
print("Dummy Binary recall:", dummy_Binary_recall)

Dummy accuracy: 0.5
Dummy Binary F1: 0
Dummy Binary precision: 0
Dummy Binary recall: 0


In [7]:
NaiveBayes_clf = myclassifiers.MyNaiveBayesClassifier()
NaiveBayes_clf.fit(X_train, y_train)
NaiveBayes_Y_predicted = NaiveBayes_clf.predict(X_test)

NB_accuracy = myevaluation.accuracy_score(y_test, NaiveBayes_Y_predicted, normalize=True)
NB_BinaryF1 = myevaluation.binary_f1_score(y_test, NaiveBayes_Y_predicted)
NB_Binary_precision = myevaluation.binary_precision_score(y_test, NaiveBayes_Y_predicted)
NB_Binary_recall = myevaluation.binary_recall_score(y_test, NaiveBayes_Y_predicted)

print("Naive Bayes accuracy:", NB_accuracy)
print("Naive Bayes Binary F1:", NB_BinaryF1)
print("Naive Bayes Binary precision:", NB_Binary_precision)
print("Naive Bayes Binary recall:", NB_Binary_recall)

Naive Bayes accuracy: 0.5
Naive Bayes Binary F1: 0
Naive Bayes Binary precision: 0
Naive Bayes Binary recall: 0


In [8]:
reg_clf = myclassifiers.MySimpleLinearRegressor()
reg_clf.fit(X_train, y_train)
reg_y_predicted = reg_clf.predict(X_test)

reg_y_predicted_rounded = []
for val in reg_y_predicted:
    reg_y_predicted_rounded.append(round(val))

reg_accuracy = myevaluation.accuracy_score(y_test, reg_y_predicted_rounded)
reg_BinaryF1 = myevaluation.binary_f1_score(y_test, reg_y_predicted_rounded)
reg_Binary_precision = myevaluation.binary_precision_score(y_test, reg_y_predicted_rounded)
reg_Binary_recall = myevaluation.binary_recall_score(y_test, reg_y_predicted_rounded)

print("Linear Regressor accuracy:", reg_accuracy)
print("Linear Regressor Binary F1:", reg_BinaryF1)
print("Linear Regressor Binary precision:", reg_Binary_precision)
print("Linear Regressor Binary recall:", reg_Binary_recall)

Linear Regressor accuracy: 0.47320341047503045
Linear Regressor Binary F1: 0.4444444444444444
Linear Regressor Binary precision: 0.4701086956521739
Linear Regressor Binary recall: 0.42143727161997563


## Results:
* Out of all the results the Linear Regressor had highest score in all 4 categories:
    1. Accuracy
    1. Binary F1
    1. Binary Precision
    1. Binary Recall
* because of this, we will be using the Linear Regressor for the Heroku App

In [9]:
n = 1000
m = 15
f = 4

forest_clf = myclassifiers.MyRandomForestClassifier(random_state=100)
forest_clf.fit(X, y, n, m, f)
y_predicted = forest_clf.predict(X_test)
accuracy = myevaluation.accuracy_score(y_test, y_predicted)
print("Accuracy:", accuracy)

Accuracy: 0.7813641900121803
