In [None]:
import numpy as np
import pandas as pd

# **Data Prep**

**Reading the data**

In [None]:
# Load the raw transactions CSV into a DataFrame, decoding the file as latin-1.
df = pd.read_csv('fraudTrain.csv', encoding='latin-1')
# Show (rows, columns) as the cell output.
df.shape

(1296675, 23)

In [None]:
# Select the target by name rather than by position. `is_fraud` is only the
# last column of the freshly loaded frame; the feature-engineering cell appends
# new columns to `df`, so a positional lookup would silently pick the wrong
# column if this cell were re-run afterwards.
y = df['is_fraud']
# Class balance: fraud is a tiny minority of the rows.
print(y.value_counts())

is_fraud
0    1289169
1       7506
Name: count, dtype: int64


In [None]:
# Sanity check: count the Python type of every label value. The column is
# addressed by name so the check keeps inspecting `is_fraud` even after later
# cells append columns to `df`.
print(df['is_fraud'].apply(type).value_counts())

is_fraud
<class 'int'>    1296675
Name: count, dtype: int64


In [None]:
# List the available columns before choosing the model features.
print(df.columns)

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


**Feature selection**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split


# NOTE: this cell overwrites `df` (the encoded columns are replaced by their
# dummies and four numeric columns are rescaled in place), so re-run the
# loading cell before running it a second time.

# Parse the timestamp column once and reuse it for both calendar features
# instead of converting the same strings twice.
trans_time = pd.to_datetime(df['trans_date_trans_time'])
df['transaction_hour'] = trans_time.dt.hour
df['day_of_week'] = trans_time.dt.dayofweek
# log1p(x) == log(1 + x), but stays accurate when x is very small.
df['transaction_amount_log'] = np.log1p(df['amt'])


# One-hot encode the categorical columns (first level dropped).
# NOTE(review): none of the resulting dummy columns are listed in
# `selected_features` below - confirm they are needed before paying for them.
df = pd.get_dummies(df, columns=['merchant', 'category', 'gender'], drop_first=True)


# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split, so test-row statistics leak into the scaled features.
scaled_columns = ['amt', 'unix_time', 'merch_lat', 'merch_long']
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])


# NOTE(review): `cc_num` is used as a numeric feature although it is an
# account identifier, and it is left unscaled next to standardised columns -
# confirm this is intentional.
selected_features = ['transaction_hour', 'day_of_week', 'transaction_amount_log', 'city_pop', 'cc_num', 'merch_lat', 'merch_long']
X = df[selected_features]

In [None]:
# Confirm that features and labels have the same number of rows, then preview
# the first five rows of each.
print(X.shape)
print(y.shape)
print(X.head())  # If X is a DataFrame, otherwise print(X[:5])
print(y[:5])

(1296675, 7)
(1296675,)
   transaction_hour  day_of_week  transaction_amount_log  city_pop  \
0                 0            1                1.786747      3495   
1                 0            1                4.684259       149   
2                 0            1                5.398660      4154   
3                 0            1                3.828641      1939   
4                 0            1                3.760269        99   

             cc_num  merch_lat  merch_long  
0  2703186189652095  -0.494354    0.593864  
1      630423337322   2.078699   -2.030341  
2    38859492057661   0.902849   -1.592323  
3  3534093764340240   1.662886   -1.621848  
4   375534208663984   0.026941    0.841909  
0    0
1    0
2    0
3    0
4    0
Name: is_fraud, dtype: int64


# **Random Forest Model**

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): clipping at 0 zeroes every negative value in X, which affects
# the standardised merch_lat / merch_long columns. Kept so X matches what the
# later cells were written against - confirm it is intentional.
X = np.clip(X, a_min=0, a_max=None)

# Hold out the test set BEFORE oversampling. Resampling the whole dataset
# first puts synthetic fraud rows (interpolated from training neighbours) into
# the test fold, which inflates every reported metric. `stratify=y` keeps the
# rare fraud class represented in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the minority class on the training fold only. X, y, X_test and
# y_test keep the real class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train_res, y_train_res)

# Predict and evaluate on the untouched hold-out rows.
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf, zero_division=1))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99    257186
           1       0.98      0.99      0.99    258482

    accuracy                           0.99    515668
   macro avg       0.99      0.99      0.99    515668
weighted avg       0.99      0.99      0.99    515668



# **Logistic Regression Model From Scratch**

In [None]:
def Sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Value(s) in [0, 1] with the same shape as ``z``.
    """
    # log(1 + exp(-z)) is evaluated with logaddexp, which never forms exp(-z)
    # on its own; the naive expression overflows for large negative z.
    log_denominator = np.logaddexp(0.0, np.multiply(z, -1.0))
    sig = np.exp(-log_denominator)
    return sig

In [None]:
def computeCostRegularise(X, y, theta, lamda_):
    """Regularised logistic-regression cost (mean log-loss + L2 penalty).

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
    y : array-like of 0/1 labels, n_samples values in any shape
    theta : weight array; ``theta[0]`` is excluded from the penalty
    lamda_ : L2 regularisation strength

    Returns
    -------
    Scalar cost.
    """
    n = X.shape[0]
    h = Sigmoid(np.matmul(X, theta))
    # Give the labels the same shape as h. A 1-D y against an (n, 1) h would
    # broadcast to an n x n matrix and average over every label/prediction pair.
    y = np.reshape(np.asarray(y), np.shape(h))
    # Keep probabilities strictly inside (0, 1) so neither log() is -inf.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)
    regularisation = (lamda_ / (2 * n)) * np.sum(np.square(theta[1:]))
    cost = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h)) + regularisation
    return cost

In [None]:
def is_close_to_zero(nombre, tolerance=0.00000000000001):
    """Tell whether ``nombre`` lies within ``tolerance`` of zero (inclusive)."""
    magnitude = abs(nombre)
    return magnitude <= tolerance

In [None]:
def lrCostFunction (X, y, initial_theta, alpha, MaxIter, lambda_):
    """Fit L2-regularised logistic regression with batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of 0/1 labels, n_samples values
    initial_theta : array-like with n_features values; starting weights.
        It is copied, never modified.
    alpha : learning rate
    MaxIter : maximum number of gradient steps
    lambda_ : L2 regularisation strength

    Returns
    -------
    theta : ndarray of shape (n_features, 1), the fitted weights
    cost_list : list with the cost after each step

    Stops early once the cost changes by no more than the tolerance of
    ``is_close_to_zero`` between two consecutive steps.
    """
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    # Column vector of labels, so it lines up with the (n, 1) predictions both
    # here and inside the cost function.
    y_col = np.asarray(y).reshape(-1, 1)
    # Private float copy: the caller's initial_theta must stay untouched so it
    # can be reused as the starting point of another fit.
    theta = np.array(initial_theta, dtype=float).reshape(-1, 1)

    cost_list = []
    previous_cost = computeCostRegularise(X, y_col, theta, lambda_)
    # A bounded loop also terminates when MaxIter <= 0.
    for _ in range(int(MaxIter)):
        h = Sigmoid(np.matmul(X, theta))
        gradient = np.dot(X.T, h - y_col) / n

        # Gradient of the L2 penalty. theta[0] is left unpenalised to match
        # the cost function, which sums theta[1:] only.
        penalty = (lambda_ / n) * theta
        penalty[0, 0] = 0.0

        # The penalty belongs inside the step: theta - alpha * (grad + penalty).
        theta = theta - alpha * (gradient + penalty)

        current_cost = computeCostRegularise(X, y_col, theta, lambda_)
        cost_list.append(current_cost)

        if is_close_to_zero(previous_cost - current_cost):
            break
        previous_cost = current_cost

    return theta, cost_list


In [None]:
def predictOnveVsAll (all_theta, X):
    """Predict the class index of every row with one-vs-all classifiers.

    Parameters
    ----------
    all_theta : array-like of shape (n_classes, n_features)
        One row of weights per classifier.
    X : array-like of shape (n_samples, n_features)
        NumPy array or DataFrame.

    Returns
    -------
    ndarray of shape (n_samples, 1), dtype float, holding for each row the
    index of the classifier with the highest score (first one on ties).
    """
    features = np.asarray(X, dtype=float)
    weights = np.asarray(all_theta, dtype=float)
    # The sigmoid is strictly increasing, so the classifier with the largest
    # linear score is also the one with the largest probability. Ranking the
    # raw scores still separates classes when every probability would
    # underflow to 0, where a "proba > 0" search finds no winner at all.
    scores = np.matmul(features, weights.T)
    y_pred = np.argmax(scores, axis=1).astype(float).reshape(-1, 1)
    return y_pred

In [None]:
# One weight row per distinct label, one column per feature, all starting at 0.
classes = np.unique(y)
number_classes = classes.size
all_theta = np.zeros((number_classes, X.shape[1]))
all_theta.shape

(2, 7)

In [None]:
# The from-scratch model indexes rows positionally, so switch to plain arrays.
X, y = np.array(X), np.array(y)

In [None]:
MaxIter = 1000
lambda_ = 0.1
alpha = 0.01
N_TRAIN_ROWS = 5000  # size of the subset used to fit each classifier

# Fit on rows of the training split only. X is the pool X_test was drawn
# from, so slicing X would train on rows that are scored later.
X_fit = np.asarray(X_train)[:N_TRAIN_ROWS]
y_fit = np.asarray(y_train)[:N_TRAIN_ROWS]

for i in range(number_classes):
    # Fresh zero start for every classifier: lrCostFunction may update the
    # array it is given, and a shared start vector would warm-start each
    # classifier from the previous one's result.
    initial_theta = np.zeros((X_fit.shape[1], 1))
    # One-vs-all target: 1 for the current class, 0 for everything else.
    theta, costs = lrCostFunction(X_fit, (y_fit == classes[i]).astype(int), initial_theta, alpha, MaxIter, lambda_)
    all_theta[i, :] = theta.flatten()

In [None]:
# X_test comes out of train_test_split as a DataFrame; hand the predictor a
# plain array so positional row access works.
y_pred = predictOnveVsAll(all_theta, np.asarray(X_test))

In [None]:
# Flatten both sides before comparing: y_pred has shape (n, 1) while y_test is
# 1-D, and comparing them as-is would broadcast to an n x n matrix instead of
# matching row i with row i.
accuracy = np.mean(np.asarray(y_test).ravel() == np.asarray(y_pred).ravel()) * 100
accuracy


99.56