In [None]:
#Credit Card Fraud Detection 
#The datasets contains transactions made by credit cards in September 2013 by european cardholders. This dataset present 
#transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced,
#the positive class (frauds) account for 0.172% of all transactions.

#It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality
#issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are 
#the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.
#Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 
#'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is 
#the response variable and it takes value 1 in case of fraud and 0 otherwise.

#Given the class imbalance ratio, we recommend measuring the accuracy using the Area Under the Precision-Recall Curve (AUPRC). 
#Confusion matrix accuracy is not meaningful for unbalanced classification.

#Key Learning Point: How to deal with imbalanced data?
#Learning Resource: http://contrib.scikit-learn.org/imbalanced-learn/index.html

In [1]:
from pandas import *
from numpy import *
from sklearn import *
import os



In [2]:
path = "C:/Users/rinlin/Desktop/DataScience"
os.chdir(path)

In [15]:
#read in data
credit_card = DataFrame.from_csv("creditcard.csv")

In [4]:
credit_card.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,1.768627e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,1.08885,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,-24.58826,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,-0.5354257,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,-0.09291738,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,0.4539234,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,23.74514,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [16]:
#Balancing Data
#Reference: https://www.quora.com/In-classification-how-do-you-handle-an-unbalanced-training-set
fraud = credit_card[credit_card['Class'] == 1]
normal = credit_card[credit_card['Class'] == 0]

# Undersample the normal transactions
percentage = len(fraud)*100.0/float(len(normal))
normal = normal.sample(frac=percentage)
credit_card = fraud.append(normal)

In [19]:
Attribute_Name = credit_card.columns.values
Input_Attribute = credit_card.as_matrix(columns=Attribute_Name[0:len(Attribute_Name)-1])
Output_Attribute = array(credit_card["Class"])
print Input_Attribute[1]
print Output_Attribute[1]

[ -3.04354062e+00  -3.15730712e+00   1.08846278e+00   2.28864362e+00
   1.35980513e+00  -1.06482252e+00   3.25574266e-01  -6.77936532e-02
  -2.70952836e-01  -8.38586565e-01  -4.14575448e-01  -5.03140860e-01
   6.76501545e-01  -1.69202893e+00   2.00063484e+00   6.66779696e-01
   5.99717414e-01   1.72532101e+00   2.83344830e-01   2.10233879e+00
   6.61695925e-01   4.35477209e-01   1.37596574e+00  -2.93803153e-01
   2.79798032e-01  -1.45361715e-01  -2.52773123e-01   3.57642252e-02
   5.29000000e+02]
1


In [20]:
#Split Trainset and Testset
from sklearn.cross_validation import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(Input_Attribute,Output_Attribute,test_size=0.4,
                                                                 random_state=0)
print X_train.shape,Y_train.shape
print X_test.shape,Y_test.shape

(29815L, 29L) (29815L,)
(19877L, 29L) (19877L,)


In [21]:
#Approach 1: Logistic Regression
lb = linear_model.LogisticRegression()
lb.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
Y_pred_LR = lb.predict(X_test)
print metrics.classification_report(Y_test,Y_pred_LR)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     19694
          1       0.94      0.82      0.87       183

avg / total       1.00      1.00      1.00     19877



In [23]:
#Approach 2: SVM
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
Y_pred = clf.predict(X_test)
print metrics.classification_report(Y_test,Y_pred)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     19694
          1       0.92      0.47      0.62       183

avg / total       0.99      0.99      0.99     19877

