In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv(r'C:\Users\sakir\OneDrive\Masaüstü\career\akbank\globalaihub2\data\processed\fraud_detection2.csv')
df

Unnamed: 0,category,amt,gender,city,state,city_pop,job,is_fraud,hour,age,is_night_transaction,transaction_speed,distance_to_merchant
0,grocery_net,36.68,M,Birmingham,AL,493806,Musician,0,9,39,0,,80.874467
1,travel,6.94,M,Boulder,MT,1939,Patent attorney,1,22,53,1,11798126.0,52.287253
2,gas_transport,41.96,M,Cleveland,AL,3996,Aid worker,0,0,43,1,-2066814.0,25.509482
3,shopping_net,992.69,M,Powell,TN,25459,"Teacher, special educational needs",1,22,51,1,8805055.0,67.055703
4,health_fitness,21.37,M,Saint James City,FL,3776,Sport and exercise psychologist,1,22,43,1,-25745360.0,78.931562
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12721,grocery_pos,324.00,M,Curlew,IA,223,Estate manager/land agent,1,2,81,1,5445176.0,97.290181
12722,shopping_net,99.01,F,Bailey,NC,6629,Call centre manager,0,2,57,1,-35768945.0,111.905669
12723,home,117.19,F,Joliet,IL,128354,"Teacher, secondary school",0,20,28,0,-368334.0,85.482966
12724,grocery_pos,139.62,M,Paint Rock,AL,653,Paediatric nurse,0,3,54,1,23702066.0,105.106569


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12726 entries, 0 to 12725
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   category              12726 non-null  object 
 1   amt                   12726 non-null  float64
 2   gender                12726 non-null  object 
 3   city                  12726 non-null  object 
 4   state                 12726 non-null  object 
 5   city_pop              12726 non-null  int64  
 6   job                   12726 non-null  object 
 7   is_fraud              12726 non-null  int64  
 8   hour                  12726 non-null  int64  
 9   age                   12726 non-null  int64  
 10  is_night_transaction  12726 non-null  int64  
 11  transaction_speed     12725 non-null  float64
 12  distance_to_merchant  12726 non-null  float64
dtypes: float64(3), int64(5), object(5)
memory usage: 1.3+ MB


In [5]:
# Standartlaştırma Normalizasyon MinMaxScaling LogTransform
# geniş aralıklarda çarpıklık olduğu için kullanılır.
df['amt'] = np.log1p(df['amt'])
df['city_pop'] = np.log1p(df['city_pop'])

# 0-1 aralığına çekmek için
scaler = StandardScaler()
df['hour'] = scaler.fit_transform(df[['hour']])
df['age'] = scaler.fit_transform(df[['age']])
df['transaction_speed'] = scaler.fit_transform(df[['transaction_speed']])
df['distance_to_merchant'] = scaler.fit_transform(df[['distance_to_merchant']])

In [6]:
df['transaction_speed'].fillna(df['transaction_speed'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transaction_speed'].fillna(df['transaction_speed'].mean(), inplace=True)


In [7]:
# Label Encoding
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column] = label_encoder.fit_transform(df[column])

In [8]:
df.head()


Unnamed: 0,category,amt,gender,city,state,city_pop,job,is_fraud,hour,age,is_night_transaction,transaction_speed,distance_to_merchant
0,3,3.629129,1,73,0,13.1099,305,0,-0.512239,-0.489345,0,-4.589213e-18,0.197482
1,13,2.071913,1,83,25,7.570443,327,1,1.029932,0.282287,1,0.6228508,-0.791431
2,2,3.760269,1,153,0,8.293299,17,0,-1.579896,-0.268879,1,-0.1090853,-1.71775
3,11,6.901425,1,655,41,10.144864,452,1,1.029932,0.172054,1,0.4648453,-0.280548
4,5,3.107721,1,715,8,8.236685,429,1,1.029932,-0.268879,1,-1.359086,0.130272


In [9]:
X = df.drop(['is_fraud'], axis=1)
y = df['is_fraud']

In [10]:
# Veri setini train, validation ve test olarak ayır
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [11]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print("XGBoost Model")
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred))
print("Recall:", recall_score(y_val, y_pred))
print("F1 Score:", f1_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
# Modeli değerlendirmek için test setini kullan
y_test_pred = model.predict(X_test)
print("Test Set Evaluation")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Model
Accuracy: 0.9790466212676794
Precision: 0.9757575757575757
Recall: 0.9837067209775967
F1 Score: 0.9797160243407708
Confusion Matrix:
 [[903  24]
 [ 16 966]]
Test Set Evaluation
Accuracy: 0.9743321110529073
Precision: 0.9745762711864406
Recall: 0.9735449735449735
F1 Score: 0.9740603493912123
Confusion Matrix:
 [[940  24]
 [ 25 920]]
