<a href="https://colab.research.google.com/github/roitraining/PythonML/blob/Development/Ch07-ClassificationAnalysis/07-01-ClassificationAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Combine the multiple data files into one big CSV, since the full file was too large to upload to GitHub.

In [1]:
# Shell escape: run the combine.sh script located in the current working directory.
! ./combine.sh

### Read in a set of data and examine it

In [2]:
import pandas as pd

# Load the combined transaction file into a DataFrame.
df = pd.read_csv('CreditCardFraud.csv')

# Overview: dimensions and column names.
print(df.shape, df.columns)

# Fractions of the data set passed to train_test_split in the cells below.
train_size, test_size = .3, .1

# First rows, then the frequency of each fraud label and each transaction type.
print(df.head())
print(df['isFraud'].value_counts())
print(df['type'].value_counts())


(6362620, 11) Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.

### Keep the columns we want and replace the transaction type with numeric category codes

In [0]:
# Columns kept for modelling. 'isFraud' is deliberately last: the split cell
# below takes every column except the last one as the feature set.
columns = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud', 'isFraud']

# Take an explicit copy so the assignment below writes to an independent frame
# instead of a selection derived from the original one (this is what triggers
# pandas' SettingWithCopyWarning).
df = df[columns].copy()

# Replace the transaction-type strings with integer category codes.
# Bracket assignment is used because attribute-style assignment is fragile.
df['type'] = pd.Categorical(df['type']).codes
print(df.shape, df.columns)
print(df.head())


### Prepare train & test sets with desired columns

In [0]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp

# Features are every column except the last one; isFraud is the label.
# random_state makes the split repeatable from run to run, and stratify keeps
# the fraud / non-fraud ratio the same in the train and test partitions, which
# matters because the fraud class is very rare.
trainX, testX, trainY, testY = train_test_split(
    df[df.columns[:-1]], df.isFraud,
    train_size = train_size, test_size = test_size,
    random_state = 42, stratify = df.isFraud)

# Class counts in the test set, then class proportions in each partition.
print(testY.value_counts())
print(trainY.value_counts()/trainY.count())
print(testY.value_counts()/testY.count())
print(trainX[:10])

## Create a Naive Bayes model

In [0]:
from sklearn.naive_bayes import GaussianNB

# Build a Gaussian Naive Bayes classifier and fit it on the training split.
modelNB = GaussianNB()
modelNB.fit(X = trainX, y = trainY)

### Examine the results of Naive Bayes

In [0]:
# Predict labels for the test split with the Naive Bayes model.
predY = modelNB.predict(X = testX)

from sklearn.metrics import confusion_matrix

# Tabulate actual labels against predicted labels.
cm = confusion_matrix(y_true = testY, y_pred = predY)
print(cm)

# helper function to print confusion matrix as percentages
def cm_percent(cm, length, legend = True):
    """Summarise a 2x2 confusion matrix as percentages of ``length``.

    The returned 2x2 float array is laid out like the printed legend:

        [[PC, FP],
         [FN, PW]]

    PC is the diagonal total (cm[0][0] + cm[1][1]), FP is cm[0][1],
    FN is cm[1][0] and PW is the off-diagonal total (cm[0][1] + cm[1][0]),
    each multiplied by 100 and divided by ``length``.

    Parameters
    ----------
    cm : 2x2 array-like of counts, indexable as ``cm[row][col]``.
    length : number the counts are divided by (callers pass ``len(testY)``).
    legend : when True, print the ' PC FP / FN PW' legend before returning.

    Returns
    -------
    numpy.ndarray of shape (2, 2) and dtype float64.
    """
    import numpy as np
    if legend:
        print (' PC', 'FP\n', 'FN', 'PW')
    correct = cm[0][0] + cm[1][1]
    wrong = cm[1][0] + cm[0][1]
    # Build the array from the values themselves. Wrapping np.ndarray around a
    # raw buffer assumes the buffer holds float64 and fails (or misreads the
    # bytes) when the counts come in any other dtype, e.g. float32.
    return np.array([[100 * correct / length, 100 * cm[0][1] / length],
                     [100 * cm[1][0] / length, 100 * wrong / length]], dtype = float)

# Percentage view of the confusion matrix, followed by the test-set class
# counts and the test-set size for reference.
cmp = cm_percent(cm, length = len(testY))
print(cmp)
print(testY.value_counts())
print(len(testY))


## Save a trained model

In [0]:
from joblib import dump, load

# Serialise the fitted Naive Bayes model to 'modelNB.joblib' (relative path).
dump(value = modelNB, filename = 'modelNB.joblib')


## Load a saved model

In [0]:
# Read the saved model back from disk under a new name.
modelNB2 = load(filename = 'modelNB.joblib')

# Score the reloaded model on the same test split.
predY = modelNB2.predict(X = testX)
cm = confusion_matrix(y_true = testY, y_pred = predY)
print(cm)
cmp = cm_percent(cm, length = len(testY))
print(cmp)

## Train the Decision Tree model

In [0]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree repeatable: the classifier
# permutes the candidate features at each split, so an unseeded fit can give
# a different tree (and different importances) on every run.
modelDT = DecisionTreeClassifier(random_state = 42)
modelDT.fit(trainX, trainY)


## Examine the results of the Decision Tree

In [0]:
def important_features(model, columns):
    """Return the model's feature importances as a one-column DataFrame.

    The frame is indexed by ``columns``, holds ``model.feature_importances_``
    in a single 'Importance' column, and is sorted in descending order of
    importance.
    """
    importance_table = pd.DataFrame(
        data = model.feature_importances_,
        index = columns,
        columns = ['Importance'],
    )
    return importance_table.sort_values(by = ['Importance'], ascending = False)
 
# Predict labels for the test split with the decision tree.
predY = modelDT.predict(X = testX)

from sklearn.metrics import confusion_matrix

# Raw confusion matrix, its percentage form, and the test-set class counts/size.
cm = confusion_matrix(y_true = testY, y_pred = predY)
print(cm)
print(cm_percent(cm, length = len(testY)))
print(testY.value_counts(), len(testY))

# Training columns ranked by the tree's feature importances.
print(important_features(model = modelDT, columns = trainX.columns))



## Create and train a Random Forest Classifier

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Ten trees. random_state fixes the bootstrap samples and feature sub-sampling
# so the accuracy and importances reported below are the same on every run.
modelRF = RandomForestClassifier(n_estimators=10, random_state=42)
modelRF.fit(trainX, trainY)

## Test the accuracy of the predictions and examine important features

In [0]:
# Predict labels for the test split with the random forest.
predY = modelRF.predict(X = testX)

from sklearn import metrics

# Overall accuracy followed by the confusion matrix.
print("Accuracy:", metrics.accuracy_score(y_true = testY, y_pred = predY))
cm = confusion_matrix(y_true = testY, y_pred = predY)
print(cm)

import pandas as pd

# Importances labelled with the training column names, largest first.
feature_imp = pd.Series(data = modelRF.feature_importances_, index = trainX.columns)
feature_imp = feature_imp.sort_values(ascending = False)
print(feature_imp)

## Visualize important features

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()


## Try removing less important features and retrain it

In [0]:
from sklearn.ensemble import RandomForestClassifier

# The reduced feature set, listed once so the train and test frames are
# guaranteed to use the same columns in the same order.
selectedFeatures = ['newbalanceDest', 'oldbalanceOrg', 'amount', 'oldbalanceDest']
newTrainX = trainX[selectedFeatures]
newTestX = testX[selectedFeatures]

# Refit the forest on the reduced columns (this replaces the earlier modelRF).
# random_state keeps the comparison with the full-feature model repeatable.
modelRF = RandomForestClassifier(n_estimators=10, random_state=42)
modelRF.fit(newTrainX, trainY)

### In this case the accuracy did not go up, but in many cases it does

In [0]:
# Predict with the forest trained on the reduced feature set.
predY = modelRF.predict(X = newTestX)

from sklearn import metrics

# Overall accuracy followed by the confusion matrix.
print("Accuracy:", metrics.accuracy_score(y_true = testY, y_pred = predY))
cm = confusion_matrix(y_true = testY, y_pred = predY)
print(cm)

import pandas as pd

# Importances of the remaining columns, largest first.
feature_imp = pd.Series(data = modelRF.feature_importances_, index = newTrainX.columns)
feature_imp = feature_imp.sort_values(ascending = False)
print(feature_imp)

## Prepare the data
### Logistic Regression requires categorical data be dummy encoded

In [0]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
def dummy_code(data, columns, drop_first = True):
    """Replace each listed column with its dummy (indicator) columns.

    For every name in ``columns`` the indicator columns produced by
    ``pd.get_dummies`` (prefixed with the column name) are spliced in at the
    position the original column occupied, and the original column is dropped.
    ``drop_first`` is forwarded to ``pd.get_dummies``. A new frame is built by
    concatenation and returned; ``data`` itself is not modified.
    """
    encoded = data
    for name in columns:
        indicator_columns = pd.get_dummies(encoded[name], prefix = name, drop_first = drop_first)
        # Position of the column being replaced, so its neighbours keep their order.
        position = list(encoded.columns).index(name)
        left_part = encoded.iloc[:, :position]
        right_part = encoded.iloc[:, position + 1:]
        encoded = pd.concat([left_part, indicator_columns, right_part], axis = 1)
    return encoded

# Dummy-encode the transaction type, dropping the first level.
df2 = dummy_code(df, ['type'], drop_first = True)

# Features are every column except isFraud. random_state makes the split
# repeatable and stratify keeps the fraud ratio equal in both partitions.
trainX, testX, trainY, testY = train_test_split(
    df2.iloc[:,df2.columns != 'isFraud'], df2.isFraud,
    train_size = train_size, test_size = test_size,
    random_state = 42, stratify = df2.isFraud)

print(testX.columns)
print(testX.head())


## Create a Logistic Regression model

In [0]:
from sklearn.linear_model import LogisticRegression

# multi_class is no longer passed: the parameter is deprecated in current
# scikit-learn releases and passing it raises a FutureWarning, while 'auto'
# was already the default behaviour. The solver is kept explicit.
modelLR = LogisticRegression(solver='lbfgs')
modelLR.fit(trainX, trainY)

# One coefficient per feature column.
print(modelLR.coef_)

## Examine the results of Logistic Regression

In [0]:
%matplotlib inline
import numpy as np
predY = modelLR.predict(testX)
from sklearn.metrics import confusion_matrix
score = modelLR.score(testX, testY)
mse = np.mean((predY - testY)**2)
print (score, mse)

cm = confusion_matrix(testY, predY)
print (cm)
cmp = cm_percent(cm, len(testY))
print (cmp)

predY1 = modelLR.predict_proba(testX)

from sklearn.metrics import roc_auc_score, roc_curve
roc = roc_auc_score(testY, predY)
fpr, tpr, x = roc_curve(testY, predY1[:,1])

import matplotlib.pyplot as plt
plt.plot(fpr, tpr, label = 'AUC = ' + str(roc))
plt.legend(loc=4)
plt.show()

#import scikitplot.metrics as skplt
#import matplotlib.pyplot as plt
#skplt.plot_roc(testY, predY1)
#plt.show()


## Try Logistic Regression with different probability thresholds to change ratio of false negatives and positives

In [0]:
# Class probabilities for the test rows (predY now holds probabilities, not labels).
predY = modelLR.predict_proba(X = testX)
print(predY[:10])
print('Score', modelLR.score(testX, testY))

# Re-label the test rows at cut-offs 0.30, 0.40, ..., 0.90 applied to the
# second probability column and report the errors at each cut-off.
for threshold in range(30, 91, 10):
    predY1 = (predY[:,1] >= threshold/100).astype(int)
    mse = ((predY1 - testY)**2).mean()
    cm = confusion_matrix(y_true = testY, y_pred = predY1)
    print('\nTHRESHOLD', threshold, 'MSE', mse)
    print(cm)
    print(cm_percent(cm, length = len(testY), legend = False))


## Prepare the data for a Neural Network
### This time you should not drop the first column when dummy encoding. Additionally, data works better if it is rescaled.

In [0]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
# Dummy-encode the transaction type, keeping every level this time.
df2 = dummy_code(df, ['type'], drop_first = False)
print(df2.columns)

# rescale the data: divide each amount/balance column by its own maximum.
# NOTE: the maxima are taken over the whole frame, i.e. before the split below.
scaledColumns = ['amount',  'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
df2[scaledColumns] = df2[scaledColumns] / df2[scaledColumns].max()

# random_state makes the split repeatable; stratify keeps the fraud ratio
# equal in the train and test partitions.
trainX, testX, trainY, testY = train_test_split(
    df2.iloc[:,df2.columns != 'isFraud'], df2.isFraud,
    train_size = train_size, test_size = test_size,
    random_state = 42, stratify = df2.isFraud)



## Create a Neural Network model

In [0]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers (5, 3 and 2 units) with logistic activation.
# random_state fixes the weight initialisation and batch shuffling so the
# trained network, and therefore the results below, are repeatable.
modelNN = MLPClassifier(hidden_layer_sizes = (5, 3, 2), activation = 'logistic', random_state = 42)
modelNN.fit(trainX, trainY)

## Examine the results of Neural Network predictions

In [0]:
# Predict labels for the test split with the neural network, then show the
# confusion matrix as counts and as percentages.
predY = modelNN.predict(X = testX)
cm = confusion_matrix(y_true = testY, y_pred = predY)
print(cm)
cmp = cm_percent(cm, length = len(testY))
print(cmp)


## Create an SVM model

In [0]:
from sklearn import svm

# Use a split ten times smaller than in the earlier cells (3% train, 1% test).
# This overwrites the notebook-level train_size / test_size values.
train_size = .03
test_size = .01

# random_state makes the split repeatable; stratify keeps the fraud ratio
# equal in both partitions, which matters even more for a sample this small.
trainX, testX, trainY, testY = train_test_split(
    df2.iloc[:,df2.columns != 'isFraud'], df2.isFraud,
    train_size = train_size, test_size = test_size,
    random_state = 42, stratify = df2.isFraud)

def do_SVM(kernel, gamma):
    """Fit an SVC with the given kernel and gamma and print its test results.

    Trains on the notebook-level trainX / trainY, then prints the kernel and
    gamma used, the score on testX / testY, and the confusion matrix of the
    test predictions. Nothing is returned.
    """
    print("\nKernel:", kernel, "Gamma:", gamma)
    classifier = svm.SVC(kernel = kernel, gamma = gamma)
    classifier.fit(trainX, trainY)
    print(classifier.score(testX, testY))

    predictions = classifier.predict(testX)
    print(confusion_matrix(testY, predictions))

# Linear kernel first, with gamma left on 'auto'.
do_SVM(kernel = 'linear', gamma = 'auto')

# Then every kernel / gamma pairing below, skipping poly with gamma 100.
for kernel in ['rbf', 'poly', 'sigmoid']:
    for gamma in ['auto', 10, 100]:
        if kernel == 'poly' and gamma == 100:
            continue
        do_SVM(kernel, gamma)



In [0]:
# SVC with gamma 10 and the library's default kernel; print its test score.
modelSVM = svm.SVC(gamma = 10)
modelSVM.fit(X = trainX, y = trainY)
print(modelSVM.score(X = testX, y = testY))


In [0]:
# SVC with gamma 100 and the library's default kernel; print its test score.
modelSVM = svm.SVC(gamma = 100)
modelSVM.fit(X = trainX, y = trainY)
print(modelSVM.score(X = testX, y = testY))


# End of notebook