In [None]:
# Example of suppressing warnings, e.g. NumPy version-out-of-range notices (optional)
import warnings
warnings.filterwarnings("ignore", category=Warning)

#Some recommended libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import DecisionBoundaryDisplay

#Some recommended libraries for the text processing tasks
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

import pickle

from dask import dataframe as dd

In [None]:
# Let pandas render up to 100 rows before it truncates a displayed frame.
pd.options.display.max_rows = 100

In [None]:
# Load the card reference table from a CSV given by a relative path.
cards_csv_path = 'sd254_cards.csv'
df_cards = pd.read_csv(cards_csv_path)

In [None]:
# Only the first N_TRANSACTIONS rows are used by this notebook. Passing
# nrows lets pandas stop parsing after that many rows instead of loading the
# whole CSV into memory and slicing it afterwards.
N_TRANSACTIONS = 20000
df_trans = pd.read_csv('credit_card_transactions-ibm_v2.csv', nrows=N_TRANSACTIONS)

In [None]:
# df_trans.head()

In [None]:
# Calendar year that "years since ..." features are measured against.
REFERENCE_YEAR = 2019

# Drop potentially sensitive and unhelpful features. Rebinding with
# errors='ignore' (instead of inplace=True) keeps this cell re-runnable: a
# second execution no longer raises because the columns are already gone.
df_cards = df_cards.drop(columns=['Card Number', 'CVV', 'Expires'], errors='ignore')

# One-hot encode the categorical card attributes.
df_cards_encoded = pd.get_dummies(df_cards, columns=['Card Type', 'Card Brand'])

# Turn the yes/no flags into 1/0 (case-insensitive match on 'yes').
for flag_col in ['Has Chip', 'Card on Dark Web']:
    df_cards_encoded[flag_col] = df_cards_encoded[flag_col].str.lower().eq('yes').astype(int)

# Strip the currency symbol. regex=False forces '$' to be treated as a literal
# character; as a regular expression it is the end-of-string anchor, and the
# default of `regex` differs between pandas versions.
df_cards_encoded['Credit Limit'] = (
    df_cards_encoded['Credit Limit'].str.replace('$', '', regex=False).astype(float)
)

# Handle date columns: split the account opening date into numeric parts.
acct_open_date = pd.to_datetime(df_cards_encoded['Acct Open Date'])
df_cards_encoded['Year'] = acct_open_date.dt.year
df_cards_encoded['Month'] = acct_open_date.dt.month
df_cards_encoded['Day'] = acct_open_date.dt.day
# Account age in whole years. The previous expression subtracted
# day-of-month / 365 from 2019, which yields a value of roughly 2019 for every
# card rather than an age.
df_cards_encoded['Acct Age (Years)'] = REFERENCE_YEAR - acct_open_date.dt.year
df_cards_encoded = df_cards_encoded.drop(columns='Acct Open Date')

df_cards_encoded['Years Since PIN Last Changed'] = (
    REFERENCE_YEAR - df_cards_encoded['Year PIN last Changed']
)

# Scale features to zero mean / unit variance.
cards_scaled_cols = ['Credit Limit', 'Years Since PIN Last Changed']
scaler = StandardScaler()
df_cards_encoded[cards_scaled_cols] = scaler.fit_transform(df_cards_encoded[cards_scaled_cols])

# Create ID column.
# NOTE(review): adding User and CARD INDEX does not identify a card uniquely
# (for numeric values, (0, 3) and (1, 2) both give 3), so this column is not
# a safe join key on its own.
df_cards_encoded['ID'] = df_cards_encoded['User'] + df_cards_encoded['CARD INDEX']

# detect outliers
# plot = sns.scatterplot(x='Credit Limit', y='Cards Issued', data=df_cards_encoded)
# plt.show()

In [None]:
# df_cards_encoded

In [None]:
# Derive the working transaction frame from df_trans (which is left
# untouched): remove the merchant name column, then substitute the
# placeholder string 'None' for every missing value.
df_trans_encoded = (
    df_trans
    .drop(columns='Merchant Name')
    .fillna('None')
)

In [None]:
# df_trans_encoded.head()

In [None]:
# One-hot encode the categorical transaction columns.
trans_categorical_cols = ['Use Chip', 'Merchant City', 'Merchant State', 'Errors?']
df_trans_encoded = pd.get_dummies(df_trans_encoded, columns=trans_categorical_cols)

In [None]:
# Strip the currency symbol from Amount and convert to float. regex=False
# forces '$' to be matched literally; interpreted as a regular expression it
# is the end-of-string anchor, and the default of `regex` has changed between
# pandas versions.
df_trans_encoded['Amount'] = (
    df_trans_encoded['Amount'].str.replace('$', '', regex=False).astype(float)
)

# Parse Time once and express it as fractional hours (hour + minute / 60).
trans_time = pd.to_datetime(df_trans_encoded['Time'])
df_trans_encoded['Time'] = trans_time.dt.hour + trans_time.dt.minute / 60


In [None]:
# Combined ID column: User + Card.
# NOTE(review): for numeric values this sum is not unique per (User, Card)
# pair -- (0, 3) and (1, 2) both give 3 -- so it is not a safe join key by
# itself.
df_trans_encoded['ID'] = df_trans_encoded['User'].add(df_trans_encoded['Card'])

In [None]:
# Binary target: 1 when the label reads 'yes' (any letter case), otherwise 0.
df_trans_encoded['Is Fraud?'] = df_trans_encoded['Is Fraud?'].map(
    lambda label: int(label.lower() == 'yes')
)

In [None]:
# Standardise the numeric transaction columns with a freshly fitted scaler.
# This rebinds the module-level name `scaler`.
trans_scaled_cols = ['Amount', 'Time']
scaler = StandardScaler()
df_trans_encoded[trans_scaled_cols] = scaler.fit_transform(
    df_trans_encoded[trans_scaled_cols]
)

In [None]:
# Swap any remaining 'None' placeholder values for the number 0.
df_trans_encoded = df_trans_encoded.replace('None', 0)

In [None]:
# ddf_trans_encoded = dd.from_pandas(df_trans_encoded, npartitions=4)
# ddf_cards = dd.from_pandas(df_cards, npartitions=4)

# ddf_all = ddf_trans_encoded.join(ddf_cards.set_index('CARD INDEX'), on='Card', lsuffix='_trans', rsuffix='_card')

# df_all = ddf_all.compute()
# Join each transaction to its card on the real composite key
# (User, card index). The summed 'ID' column is not unique per card -- e.g.
# user 0 / card 3 and user 1 / card 2 both produce 3 -- so merging on it
# attaches other users' cards to a transaction and duplicates rows.
# Both frames keep their own 'ID' column; pandas suffixes them _x / _y.
# validate='many_to_one' makes pandas raise if the card table ever contains
# a duplicated (User, CARD INDEX) pair instead of silently multiplying rows.
df_all = df_trans_encoded.merge(
    df_cards_encoded,
    left_on=['User', 'Card'],
    right_on=['User', 'CARD INDEX'],
    validate='many_to_one',
)

In [None]:
# # print(df_all[df_all['Cards Issued'] > 1]['Is Fraud?'].value_counts())
# # print(df_all[df_all['Cards Issued'] == 1]['Is Fraud?'].value_counts())

# plt.scatter(df_all['Amount'], df_all['Is Fraud?'])
# plt.xlabel('Amount')
# plt.ylabel('Is Fraud?')
# plt.show()


# plt.scatter(df_all['Years Since PIN Last Changed'], df_all['Is Fraud?'])
# plt.xlabel('Years Since PIN Last Changed')
# plt.ylabel('Is Fraud?')
# plt.show()

# # for year in range(1,16):
# #     print(year)
# #     print(df_all[df_all['Years Since PIN Last Changed'] == year]['Is Fraud?'].value_counts())

# result = pd.crosstab(df_all['Years Since PIN Last Changed'], df_all['Is Fraud?'])
# result = result.rename(columns={0: 'Not Fraud', 1: 'Is Fraud'})
# result['Percent Fraud'] = result['Is Fraud']/result.sum(axis=1)*100
# print(result)
# plt.scatter(df_all['Acct Age (Years)'], df_all['Is Fraud?'])
# plt.xlabel('Account Age (Years)')
# plt.ylabel('Is Fraud?')
# plt.show

# greater_than_15 = pd.crosstab(df_all['Acct Age (Years)'] >= 15, df_all['Is Fraud?']);
# greater_than_15 = greater_than_15.rename(columns={0: 'Not Fraud', 1: 'Is Fraud'})
# greater_than_15['Percent Fraud'] = greater_than_15['Is Fraud']/len(df_all.index)*100
# print(greater_than_15)

# less_than_15 = pd.crosstab(df_all['Acct Age (Years)'] < 15, df_all['Is Fraud?']);
# less_than_15 = less_than_15.rename(columns={0: 'Not Fraud', 1: 'Is Fraud'})
# less_than_15['Percent Fraud'] = less_than_15['Is Fraud']/len(df_all.index)*100
# print(less_than_15)

In [None]:
# # plt.matshow(df_all.corr())
# # plt.show()
# # cb = plt.colorbar()
# # cb.ax.tick_params(labelsize=14)
# # plt.title('Correlation Matrix', fontsize=16);

# corrlation = df_all.corrwith(df_all['Is Fraud?'])
# corrlation.sort_values(ascending=False, inplace=True)
# print(corrlation)


In [None]:
# Project the merged frame onto its principal components, keeping the fitted
# PCA object available as `pca`.
# NOTE(review): the input still contains the 'Is Fraud?' column that is used
# as the target further down, so label information is present in df_pca.
# Removing it changes the number of components and would require refitting
# any previously saved model -- confirm before changing.
pca_input = df_all
pca = PCA()
df_pca = pca.fit_transform(pca_input)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
fraud_labels = df_all['Is Fraud?']
X_train, X_test, y_train, y_test = train_test_split(
    df_pca,
    fraud_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# parameters = {'C': np.logspace(-3,3,7), 'penalty': ["l1","l2"]}

# model = GridSearchCV(LogisticRegression(), parameters, verbose=3, cv=2)

# model.fit(X_train, y_train)

# print(model.best_params_)

# y_pred = model.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# pickle.dump(model, open('logreg_model.pkl', 'wb'))

In [None]:
# param_grid = {
#     'C': [0.001, 0.01, 0.1, 1],
#     'kernel': ['linear', 'rbf'],
#     'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
#     'class_weight': ['balanced', None]
# }

# svm = RandomizedSearchCV(SVC(), param_grid, verbose=3, cv=2)
# svm.fit(X_train, y_train)

# print(svm.best_params_)

# y_pred = svm.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# pickle.dump(svm, open('svm_model.pkl', 'wb'))

In [None]:
# Load the saved search object and pull out the best estimator.
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the
# file -- only load 'svm_model.pkl' if it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call
# left open.
with open('svm_model.pkl', 'rb') as model_file:
    svm_cv = pickle.load(model_file)

best_params = svm_cv.best_params_
print(best_params)
svm = svm_cv.best_estimator_

In [None]:
# Attach the merged frame's column labels to the estimator for later lookup.
# NOTE(review): X_test is built from df_pca, so the estimator's inputs are
# principal components rather than these columns -- confirm that any use of
# feature_names accounts for that.
merged_columns = df_all.columns
svm.feature_names = merged_columns

In [None]:
# Score the loaded estimator on the held-out split.
y_pred = svm.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
# precision_score is the metric reported as "Precision" below.
# average_precision_score summarises a precision-recall curve and expects
# continuous scores, so feeding it hard 0/1 predictions gave a different
# quantity under the same label.
precision = metrics.precision_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)
print("Recall:", recall)

# coef_ only exists for a linear kernel; report that instead of raising
# AttributeError for other kernels.
if getattr(svm, 'kernel', None) == 'linear':
    # The estimator was fed PCA components, so coef_[0] holds one weight per
    # component. Multiplying by pca.components_ (components x original
    # features) maps those weights back onto the original columns, which is
    # what the names in svm.feature_names refer to.
    feature_coefficients = svm.coef_[0] @ pca.components_

    # Feature with the largest (most positive) weight.
    top_idx = int(np.argmax(feature_coefficients))
    print(svm.feature_names[top_idx])
    print(feature_coefficients[top_idx])

    # Every feature whose weight magnitude reaches the 0.001 threshold.
    for feature_name, coefficient in zip(svm.feature_names, feature_coefficients):
        if abs(coefficient) >= 0.001:
            print(f"{feature_name}: {coefficient}")
else:
    print("Feature coefficients are only available for a linear kernel.")

sns.heatmap(df_all.corr(), annot=True)
# Call plt.show(); the bare attribute reference `plt.show` did nothing.
plt.show()