## Combine the multiple files into one big CSV, since the full file was too large to upload to GitHub.

In [None]:
# Shell escape: runs the combine.sh script from the notebook's working directory.
# NOTE(review): presumably this produces CreditCardFraud.csv, which the next cell reads - confirm.
! ./combine.sh

### Read in a set of data and examine it

In [None]:
import pandas as pd
# Load the combined transactions file; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('CreditCardFraud.csv')


In [None]:
# Fractions of the full dataset handed to train_test_split() in the cells below.
train_size, test_size = .3, .1

# First look at the raw data: dimensions, column names and the first rows.
print(df.shape, df.columns)
display(df.head())

# Class balance of the label, then the mix of transaction types.
print(df['isFraud'].value_counts())
print(df['type'].value_counts())


### Keep the columns we want and encode `type` as integer category codes instead of strings

In [None]:
# Feature columns followed by the label. isFraud must stay last: the Naive Bayes
# split below takes "every column except the last" as the feature set.
columns = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud', 'isFraud']
# .copy() makes df an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df[columns].copy()
# Replace the transaction-type strings with integer category codes.
df['type'] = pd.Categorical(df['type']).codes
print(df.shape, df.columns)
display(df.head())


### Prepare train & test sets with desired columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
# dfNB is another name for df (no copy is made).
dfNB = df
trainNB_X, testNB_X, trainNB_Y, testNB_Y = train_test_split(
    dfNB.iloc[:, :-1],      # every column except the last one (isFraud) is a feature
    dfNB['isFraud'],
    train_size = train_size,
    test_size = test_size,
    random_state = 1,
)
# Raw test-label counts, then the class proportions in the train and test sets.
print(testNB_Y.value_counts())
print(trainNB_Y.value_counts() / trainNB_Y.count())
print(testNB_Y.value_counts() / testNB_Y.count())
display(trainNB_X.head(10))

## Create a Naive Bayes model

In [None]:
from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training can be chained.
modelNB = GaussianNB().fit(trainNB_X, trainNB_Y)
# Last expression of the cell: shows the fitted estimator, as before.
modelNB

### Examine the results of Naive Bayes

In [None]:
def evaluate_predictions(test, pred, show_percent = True):
    """Print a comparison of true labels against predicted labels.

    Parameters
    ----------
    test : pandas.Series
        True class labels (``value_counts()`` is called on it).
    pred : array-like
        Predicted class labels, one per element of ``test``.
    show_percent : bool, default True
        When True, also print a 2x2 table of percentages of the test set:
        percent correct (PC), false positives (FP), false negatives (FN)
        and percent wrong (PW).

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    The TN/FP/FN/TP legend and the percentage table assume a binary problem
    whose positive class sorts last (e.g. labels 0 and 1).
    """
    from sklearn.metrics import confusion_matrix
    length = len(test)
    print(f'Test length = {length}')
    print('\nTest Values')
    print(test.value_counts())
    print('\nPredicted Values')
    # The top-level pd.value_counts() is deprecated; go through a Series instead.
    print(pd.Series(pred).value_counts())
    # confusion_matrix() puts true labels on the rows and predictions on the columns,
    # in sorted label order, so for 0/1 labels the layout is [[TN, FP], [FN, TP]].
    print('\n TN FP\n FN TP')
    cm = confusion_matrix(test, pred)
    print(cm)

    if show_percent:
        import numpy as np
        correct = cm[0][0] + cm[1][1]
        wrong = cm[1][0] + cm[0][1]
        print('\n PC FP\n FN PW')
        print(np.array([[100 * correct / length, 100 * cm[0][1] / length],
                        [100 * cm[1][0] / length, 100 * wrong / length]]))

          

# Predict labels for the held-out test features and print the comparison report.
predNB_Y = modelNB.predict(testNB_X)
evaluate_predictions(testNB_Y, predNB_Y)


## Save a trained model

In [None]:
from joblib import dump, load
# Write the fitted Naive Bayes model to modelNB.joblib in the working directory.
dump(modelNB, 'modelNB.joblib') 


## Load a saved model

In [None]:
# Read the model back from disk. joblib files are pickle-based, so only load files
# you created yourself or otherwise trust.
modelNB2 = load('modelNB.joblib')
# Overwrites predNB_Y with the reloaded model's predictions on the same test set.
predNB_Y = modelNB2.predict(testNB_X)

evaluate_predictions(testNB_Y, predNB_Y)


## LAB 1: ## 

### Do a similar set of steps as Naive Bayes but this time use a Decision Tree algorithm

#### 1. Import the correct model to do DecisionTree
#### 2. Create an instance of the model
#### 3. Train the model using the training sets
#### 4. Explore the results

<br>
<details><summary>Click for <b>hint</b></summary>
<p>
<b>dir</b> sklearn.tree package to find the right name of the model class.
<br>
<b>help</b> the class name to explore the parameters. We can pass none in this case.
<br>
<b>fit</b> the empty model to train it.
<br>
Use the helper function to analyze the results. Which model did a better job?
<br>
<br>
</p>
</details>


<details><summary>Click for <b>code</b></summary>
<p>

```python
from sklearn.tree import DecisionTreeClassifier
dfDT = df

trainDT_X, testDT_X, trainDT_Y, testDT_Y = trainNB_X, testNB_X, trainNB_Y, testNB_Y

modelDT = DecisionTreeClassifier()
modelDT.fit(trainDT_X, trainDT_Y)
predDT_Y = modelDT.predict(testDT_X)
evaluate_predictions(testDT_Y, predDT_Y)

```
</p>
</details>

## Train the Decision Tree model

In [None]:
# LAB placeholder: replace ??? with the decision-tree classifier class from sklearn.tree.
# This cell does not run until that is done. Later cells expect a fitted model named modelDT.
from sklearn.tree import ???

# Bind the same datasets to the DT names. These are additional references to the
# same objects, not independent copies.
dfDT, trainDT_X, testDT_X, trainDT_Y, testDT_Y = df, trainNB_X, testNB_X, trainNB_Y, testNB_Y



## Decision Trees can also report which features influence their decisions the most. The following helper function makes that easier to view.

In [None]:
def important_features(model, columns):
    """Return the model's feature importances as a table, largest first.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing a ``feature_importances_`` attribute.
    columns : sequence
        Feature names, in the order the model was trained on; used as the row index.

    Returns
    -------
    pandas.DataFrame
        One row per feature and a single 'Importance' column, sorted descending.
    """
    table = pd.DataFrame(model.feature_importances_, index = columns, columns = ['Importance'])
    return table.sort_values(by = 'Importance', ascending = False)
 
# Requires modelDT, the fitted decision tree created in the lab cell above.
print(important_features(modelDT, trainDT_X.columns))


## Prepare the data
### Logistic Regression requires categorical data be dummy encoded

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp

def dummy_code(data, columns, drop_first = True):
    """Replace each listed column with its dummy (one-hot) columns, in the same position.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to encode. It is not modified; a new frame
        is built for every encoded column.
    columns : iterable of str
        Names of the columns to encode. Each must be present in ``data``.
    drop_first : bool, default True
        Passed to ``pd.get_dummies``: when True the first level of each column is dropped.

    Returns
    -------
    pandas.DataFrame
        The frame with every listed column replaced by ``<column>_<level>`` columns.
    """
    for name in columns:
        position = data.columns.tolist().index(name)
        # Everything left and right of the column being replaced.
        left, right = data.iloc[:, :position], data.iloc[:, position + 1:]
        encoded = pd.get_dummies(data[name], prefix = name, drop_first = drop_first)
        data = pd.concat([left, encoded, right], axis = 1)
    return data

# Dummy-encode 'type', dropping its first level.
dfLR = dummy_code(df, ['type'], drop_first = True)
trainLR_X, testLR_X, trainLR_Y, testLR_Y = train_test_split(
    dfLR.loc[:, dfLR.columns != 'isFraud'],     # all columns except the label
    dfLR['isFraud'],
    train_size = train_size,
    test_size = test_size,
    random_state = 1,
)

print(testLR_X.columns)
display(testLR_X.head())


## Create a Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# multi_class is deprecated in recent scikit-learn releases, and 'auto' was already
# the default behaviour, so it is omitted. The solver is kept explicit.
modelLR = LogisticRegression(solver='lbfgs')
modelLR.fit(trainLR_X, trainLR_Y)
# One coefficient per feature column of trainLR_X.
print(modelLR.coef_)

## Examine the results of Logistic Regression

In [None]:
%matplotlib inline
import numpy as np
# Hard class predictions from the model's default decision rule.
predLR_Y = modelLR.predict(testLR_X)

# Mean squared difference between predicted and true labels
# (assumes the labels are 0/1, in which case this is the error rate - confirm).
mse = ((predLR_Y - testLR_Y) ** 2).mean()
# Mean accuracy on the test set.
score = modelLR.score(testLR_X, testLR_Y)
print(score, mse, '\n')

evaluate_predictions(testLR_Y, predLR_Y)


## Logistic regression also offers predict_proba(), which returns class probabilities so that you can apply a custom threshold rather than the default.

In [None]:
# One row per test sample and one column per class. Column 1 is used below as the
# score for the positive class (assumes the classes are 0/1 in that order).
predLR_Y1 = modelLR.predict_proba(testLR_X)
display(predLR_Y1)

from sklearn.metrics import roc_auc_score, roc_curve
# Compute the AUC from the same class-1 probabilities the curve is drawn from, so the
# value in the legend is the area under the plotted curve. Scoring the hard 0/1
# predictions instead would describe a different, single-threshold curve.
roc = roc_auc_score(testLR_Y, predLR_Y1[:,1])
fpr, tpr, x = roc_curve(testLR_Y, predLR_Y1[:,1])

import matplotlib.pyplot as plt
plt.plot(fpr, tpr, label = 'AUC = ' + str(roc))
plt.legend(loc=4)
plt.show()


## Try Logistic Regression with different probability thresholds to change ratio of false negatives and positives

In [None]:
# The probabilities get their own name so that predLR_Y keeps holding the hard 0/1
# predictions from modelLR.predict(). Rebinding it to a 2-D probability array would
# break any earlier cell that is re-run afterwards.
probaLR_Y = modelLR.predict_proba(testLR_X)
print(probaLR_Y[:10])
print('Score', modelLR.score(testLR_X, testLR_Y))

# Sweep the decision threshold from 10% to 90% in steps of 10.
for threshold in range(10, 91, 10):
    # Predict class 0 only when P(class 0) reaches the threshold, otherwise class 1.
    predLR_Y1 = np.where(probaLR_Y[:,0] >= threshold/100, 0, 1)
    mse = np.mean((predLR_Y1 - testLR_Y)**2)
    print ('\nTHRESHOLD', threshold, 'MSE', mse)

    evaluate_predictions(testLR_Y, predLR_Y1, show_percent = False)



## Prepare the data for a Neural Network
### This time you should not drop the first column when dummy encoding. Additionally, data works better if it is rescaled.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
# Dummy-encode 'type', keeping every level (no column dropped).
dfNN = dummy_code(df, ['type'], drop_first = False)
print(dfNN.columns)

# rescale the data: divide each monetary column by its own maximum
money_columns = ['amount',  'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
dfNN[money_columns] = dfNN[money_columns] / dfNN[money_columns].max()

trainNN_X, testNN_X, trainNN_Y, testNN_Y = train_test_split(
    dfNN.loc[:, dfNN.columns != 'isFraud'],     # all columns except the label
    dfNN['isFraud'],
    train_size = train_size,
    test_size = test_size,
    random_state = 1,
)



## Create a Neural Network model
This runs very slowly here, so you may want to skip it (the prediction cell below needs `modelNN`, so skip that one too).

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with three hidden layers of 5, 3 and 2 units and logistic
# (sigmoid) activations. No random_state is set, so results can differ between runs.
modelNN = MLPClassifier(hidden_layer_sizes = (5, 3, 2), activation = 'logistic')
modelNN.fit(trainNN_X, trainNN_Y)

## Examine the results of Neural Network predictions

In [None]:
# Requires modelNN from the training cell above.
predNN_Y = modelNN.predict(testNN_X)

# Counts and confusion matrix only; the percentage table is switched off.
evaluate_predictions(testNN_Y, predNN_Y, show_percent = False)



## Create a SVM model

In [None]:
from sklearn import svm
# The SVM section works on a much smaller sample (3% train / 1% test). The sizes have
# their own names so the notebook-wide train_size / test_size are not overwritten,
# which would silently change the earlier splits if those cells were re-run.
svm_train_size = .03
svm_test_size = .01
# dfSVM is another name for the rescaled, fully dummy-encoded frame (no copy is made).
dfSVM = dfNN
# random_state = 1 matches the other splits in this notebook and makes the sample reproducible.
trainSVM_X, testSVM_X, trainSVM_Y, testSVM_Y = train_test_split(dfSVM.iloc[:,dfSVM.columns != 'isFraud'], dfSVM.isFraud, train_size = svm_train_size, test_size = svm_test_size, random_state = 1)

def do_SVM(kernel, gamma):
    """Train an SVC with the given kernel and gamma, then report its test results.

    Fits on the notebook-level trainSVM_X / trainSVM_Y, prints the mean accuracy on
    testSVM_X / testSVM_Y, and prints the evaluate_predictions() report without the
    percentage table.

    Parameters
    ----------
    kernel : str
        Kernel name passed to ``svm.SVC``.
    gamma : str or number
        Gamma value passed to ``svm.SVC``.

    Returns
    -------
    None
    """
    print ("\nKernel:", kernel, "Gamma:", gamma)
    classifier = svm.SVC(kernel = kernel, gamma = gamma)
    classifier.fit(trainSVM_X, trainSVM_Y)
    accuracy = classifier.score(testSVM_X, testSVM_Y)
    print (accuracy)

    evaluate_predictions(testSVM_Y, classifier.predict(testSVM_X), show_percent = False)
    
# Baseline: linear kernel.
do_SVM('linear', gamma='auto')

# Try every other kernel / gamma pairing except poly with gamma 100.
# NOTE(review): presumably that combination is skipped because it is too slow - confirm.
for kernel in ['rbf', 'poly', 'sigmoid']:
    for gamma in ['auto', 10, 100]:
        if kernel == 'poly' and gamma == 100:
            continue
        do_SVM(kernel, gamma)



## Ensemble Learning

## Create and train a Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 10 trees. No random_state is set, so results can differ between runs.
modelRF = RandomForestClassifier(n_estimators=10)
# Reuse the splits bound to the DT names in the Decision Tree cell.
trainRF_X, trainRF_Y, testRF_X, testRF_Y = trainDT_X, trainDT_Y, testDT_X, testDT_Y
modelRF.fit(trainRF_X, trainRF_Y)

## Test the accuracy of the predictions and examine important features

In [None]:
predRF_Y = modelRF.predict(testRF_X)
from sklearn import metrics
print ("Accuracy:",metrics.accuracy_score(testRF_Y, predRF_Y))

# confusion_matrix is only imported inside evaluate_predictions(), so it is not a
# notebook-level name; call it through the metrics module to avoid a NameError.
cm = metrics.confusion_matrix(testRF_Y, predRF_Y)
print (cm)

import pandas as pd
# Importance of each training column, largest first.
feature_imp = pd.Series(modelRF.feature_importances_,index=trainRF_X.columns).sort_values(ascending=False)
print (feature_imp)

## Visualize important features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Horizontal bar plot: one bar per feature, bar length = importance score.
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# No plt.legend() here: the bars carry no labels, so the call would only emit a
# "no artists with labels" warning and draw nothing.
plt.show()


## Try removing less important features and retrain it

In [None]:
# Features kept for the reduced model; the same subset is taken from train and test.
kept_features = ['newbalanceDest', 'oldbalanceOrg', 'amount', 'oldbalanceDest']
newTrainRF_X, newTestRF_X = trainRF_X[kept_features], testRF_X[kept_features]
from sklearn.ensemble import RandomForestClassifier
# Rebinds modelRF: from here on it refers to the forest trained on the reduced feature set.
modelRF = RandomForestClassifier(n_estimators=10)
modelRF.fit(newTrainRF_X, trainRF_Y)

### In this case the accuracy did not go up, but in many cases it does

In [None]:
newpredRF_Y = modelRF.predict(newTestRF_X)
from sklearn import metrics
print ("Accuracy:",metrics.accuracy_score(testRF_Y, newpredRF_Y))
# confusion_matrix is only imported inside evaluate_predictions(), so it is not a
# notebook-level name; call it through the metrics module to avoid a NameError.
cm = metrics.confusion_matrix(testRF_Y, newpredRF_Y)
print (cm)

import pandas as pd
# Importance of each remaining column, largest first.
feature_imp = pd.Series(modelRF.feature_importances_,index=newTrainRF_X.columns).sort_values(ascending=False)
print (feature_imp)

### Voting Classifier will run all the specified models and choose the result based on voting among the models

In [None]:
from sklearn.ensemble import VotingClassifier
# Hard (majority-label) voting between the decision tree and the Naive Bayes model.
# Requires modelDT from the lab cell and modelNB from the Naive Bayes cell.
modelVC = VotingClassifier(estimators=[('dt', modelDT), ('nb', modelNB)], voting='hard')
modelVC.fit(trainDT_X, trainDT_Y)


In [None]:
# Mean accuracy of the voting ensemble on the test set, then the detailed report
# (counts and confusion matrix, without the percentage table).
print(modelVC.score(testDT_X, testDT_Y))
predVC_Y = modelVC.predict(testDT_X)
evaluate_predictions(testDT_Y, predVC_Y, show_percent = False)
