# 1.) Import the Credit Card Fraud Data From CCLE

In [1]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np

In [2]:
drive.mount('/content/gdrive/', force_remount = True)

Mounted at /content/gdrive/


In [3]:
df = pd.read_csv("/content/gdrive/MyDrive/fraudTest.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


# 2.) Select four columns to use as features (one just be trans_date_trans)

In [5]:
data=df[["trans_date_trans_time","amt","gender","city_pop","is_fraud"]]

In [6]:
data.head()

Unnamed: 0,trans_date_trans_time,amt,gender,city_pop,is_fraud
0,2020-06-21 12:14:25,2.86,M,333497,0
1,2020-06-21 12:14:33,29.84,F,302,0
2,2020-06-21 12:14:53,41.28,F,34496,0
3,2020-06-21 12:15:15,60.05,M,54767,0
4,2020-06-21 12:15:17,3.19,M,1126,0


# 3.) Create a unique variable out of trans_date.

In [7]:
df["trans_date_trans_time"]=pd.to_datetime(df["trans_date_trans_time"])

In [8]:
data["time_var"]=[i.minute for i in df["trans_date_trans_time"]]
data["time_var"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["time_var"]=[i.minute for i in df["trans_date_trans_time"]]


0    14
1    14
2    14
3    15
4    15
Name: time_var, dtype: int64

In [9]:
data.pop("trans_date_trans_time")
dummies=pd.get_dummies(data,["gender"])
X=pd.concat([dummies,data[["amt","city_pop","time_var"]]],axis=1)
y=data["is_fraud"]

In [10]:
data.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,time_var
0,2.86,M,333497,0,14
1,29.84,F,302,0,14
2,41.28,F,34496,0,14
3,60.05,M,54767,0,15
4,3.19,M,1126,0,15


# 4.) Oversample the data (this will be your training data).

In [11]:
#xx

# 5.) Train a Logistic regression.

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
clf = LogisticRegression()
log_reg=clf.fit(X_train, y_train)

# 6.) The company you are working for wants to target at a False Positive rate of 5% what threshold should you use? (Use oversampled data)

In [14]:
from sklearn.metrics import confusion_matrix 

In [15]:
y_prob=clf.predict_proba(X_test)

In [16]:
target_fn_percentage=5
threshold=np.percentile(y_prob[:1],100-target_fn_percentage)

In [17]:
y_pred=(y_prob[:,1]>threshold).astype(int)

In [18]:
confusion_matrix(y_test,y_pred)

array([[138361,     21],
       [   548,      0]])

In [19]:
threshold

0.9472766284105507

# 7.) If the company makes .02*amt on True transactions and loses -amt on False (Use original data)

In [20]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tpr=tp/(tp+fn)
fpr=fp/(fp+tn)

In [21]:
df_temp=data.copy()
df_temp

Unnamed: 0,amt,gender,city_pop,is_fraud,time_var
0,2.86,M,333497,0,14
1,29.84,F,302,0,14
2,41.28,F,34496,0,14
3,60.05,M,54767,0,15
4,3.19,M,1126,0,15
...,...,...,...,...,...
555714,43.77,M,519,0,59
555715,111.84,M,28739,0,59
555716,86.88,F,3684,0,59
555717,7.99,M,129,0,59


In [22]:
df_temp["pred"]=log_reg.predict(X)

In [30]:
df_temp=df_temp[["pred","is_fraud","amt"]]
df_temp.head()

Unnamed: 0,pred,is_fraud,amt
0,0,0,2.86
1,0,0,29.84
2,0,0,41.28
3,0,0,60.05
4,0,0,3.19


In [54]:
df_temp.loc[(df['is_fraud'] == 1) & (df_temp['pred'] == 1), 'profit'] = 0.02*df['amt']
df_temp.loc[(df['is_fraud'] == 0) & (df_temp['pred'] == 0), 'profit'] = 0.02*df['amt']
df_temp.loc[(df['is_fraud'] == 1) & (df_temp['pred'] == 0), 'profit'] = -df['amt']
df_temp.loc[(df['is_fraud'] == 0) & (df_temp['pred'] == 1), 'profit'] = -df['amt']

In [55]:
df_temp

Unnamed: 0,pred,is_fraud,amt,profit
0,0,0,2.86,0.0572
1,0,0,29.84,0.5968
2,0,0,41.28,0.8256
3,0,0,60.05,1.2010
4,0,0,3.19,0.0638
...,...,...,...,...
555714,0,0,43.77,0.8754
555715,0,0,111.84,2.2368
555716,0,0,86.88,1.7376
555717,0,0,7.99,0.1598


In [56]:
total_profit=df_temp['profit'].sum()
total_profit

-1323827.0377999998

# 8.) Using Logistic Regression Lasso to inform you. Would you use the selected features in a trusted prediction model?

In [46]:
LR=LogisticRegression("l1",solver="liblinear")
LR.fit(X_train,y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [50]:
y_pred_lasso=LR.predict(X_test)
y_pred_lasso.sum()

548

In [53]:
y_prob_lasso=LR.predict_proba(X_test)
target_fn_percentage = 5
threshold = np.percentile(y_prob_lasso[:,1], 100-target_fn_percentage)
y_pred_lasso = (y_prob_lasso[:,1] > threshold).astype(int)
confusion_matrix(y_test, y_pred_lasso)

array([[131983,   6399],
       [     0,    548]])

Most variables are not 0, so I will use selected features in the prediction model.

In [None]:
#See if the variables all got 0 -> input data is terribel