<a href="https://colab.research.google.com/github/sanaa-sys/Fraud-Detection/blob/main/Bonus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,recall_score,precision_score, classification_report


In [None]:
# Read the training file first, then the test file (paths are relative to the working directory).
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in ("fraudTrain.csv", "fraudTest.csv")
)

In [None]:
# Stack both files into a single frame. ignore_index=True assigns fresh 0..n-1
# row labels; without it each file keeps its own 0-based index, so labels repeat
# and any later label-based alignment between frames becomes ambiguous.
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)

In [None]:
# Display the column labels of the combined frame.
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [None]:
# Drop the row-number, card-number, name, street and transaction-id columns.
# They are listed by name: the original picked five of them by position
# (columns 0, 2, 6, 7, 9), which silently removes *different* columns when the
# cell is run a second time or the column order changes.
# errors="ignore" turns a re-run into a no-op instead of a KeyError.
ID_COLUMNS = ["Unnamed: 0", "cc_num", "first", "last", "street", "trans_num"]
data = data.drop(columns=ID_COLUMNS, errors="ignore")


In [None]:
# Remove incomplete rows from `data` itself, so the feature matrix, the target
# and every later lookup into `data` keep exactly the same rows. The original
# called features.dropna() and discarded the returned frame, so nothing was
# actually dropped.
data = data.dropna()

# Separate features and target
features = data.drop(columns="is_fraud")
target = data["is_fraud"]
features

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,49.159047,-118.186462
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,42.1808,-112.2620,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,M,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,47.034331,-112.561071
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,2020-12-31 23:59:07,fraud_Reilly and Sons,health_fitness,43.77,M,Luray,MO,63453,40.4931,-91.8912,519,Town planner,1966-02-13,1388534347,39.946837,-91.333331
555715,2020-12-31 23:59:09,fraud_Hoppe-Parisian,kids_pets,111.84,M,Lake Jackson,TX,77566,29.0393,-95.4401,28739,Futures trader,1999-12-27,1388534349,29.661049,-96.186633
555716,2020-12-31 23:59:15,fraud_Rau-Robel,kids_pets,86.88,F,Burbank,WA,99323,46.1966,-118.9017,3684,Musician,1981-11-29,1388534355,46.658340,-119.715054
555717,2020-12-31 23:59:24,fraud_Breitenberg LLC,travel,7.99,M,Mesa,ID,83643,44.6255,-116.4493,129,Cartographer,1965-12-15,1388534364,44.470525,-117.080888


In [None]:
#Feature Engineering

In [None]:
# Calculate the distance between merchant and customer
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometers between two points.

    All four arguments are in degrees. Scalars and array-likes (NumPy arrays,
    pandas Series) are accepted; array inputs are combined element-wise, so
    whole coordinate columns can be processed in a single call.

    Returns a float for scalar input, or an array-like of distances matching
    the broadcast shape of the inputs.
    """
    R = 6371  # Radius of Earth in kilometers
    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) invalid.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [None]:
# Distance in km between each cardholder location and the merchant location,
# computed one row at a time.
coordinate_rows = zip(
    features["lat"], features["long"], features["merch_lat"], features["merch_long"]
)
features["distance"] = [haversine_distance(*coords) for coords in coordinate_rows]

In [None]:
# Discard the four raw coordinate columns.
coordinate_columns = ["lat", "long", "merch_lat", "merch_long"]
features = features.drop(columns=coordinate_columns)


In [None]:
# Parse the date-of-birth strings into datetimes. The column is read from
# `features` itself rather than from `data`, so the cell does not depend on the
# two frames still holding the same rows. The bare `features["dob"].dtype`
# expression was removed: mid-cell it neither displays nor changes anything.
features["dob"] = pd.to_datetime(features["dob"])


In [None]:
# Define a function to calculate age
from datetime import datetime
def calculate_age(dob, reference_date=None):
    """Return the age in whole years on `reference_date`.

    Parameters
    ----------
    dob : datetime-like
        Date of birth; must expose `.year`, `.month` and `.day`.
    reference_date : datetime-like, optional
        Date on which the age is evaluated. Defaults to `datetime.today()`,
        which makes the result depend on the day the code is run; pass a fixed
        date for reproducible ages.

    Returns
    -------
    The difference in calendar years, minus one when the birthday has not yet
    occurred in the reference year.
    """
    today = datetime.today() if reference_date is None else reference_date
    age = today.year - dob.year
    # Tuple comparison: has the (month, day) of the birthday been reached yet?
    if (today.month, today.day) < (dob.month, dob.day):
        age -= 1
    return age

In [None]:
# Append the derived age column and remove the date of birth it came from.
features = features.assign(age=features["dob"].map(calculate_age)).drop(columns=["dob"])

In [None]:
# Number of distinct job titles.
features["job"].nunique()

497

In [None]:
# Keyword lists used to bucket free-text job titles into sectors.
# Sectors are tried in the order written here and the first keyword hit wins,
# so a keyword that appears under several sectors (e.g. "analyst", "engineer",
# "manager") always resolves to the earliest sector that lists it.
sector_bag = {
    "IT": ["engineer", "developer", "programmer", "software", "IT", "technician", "architect", "system", "network",
           "administrator", "data scientist", "cybersecurity", "web developer", "analyst", "database", "devops"],

    "Education": ["teacher", "professor", "educator", "trainer", "lecturer", "scientist", "Orthoptist", "tutor",
                  "principal", "instructor", "counselor", "academic", "researcher", "dean", "school", "headmaster"],

    "Healthcare": ["doctor", "nurse", "medical", "therapist", "pharmacist", "health", "surgeon", "dentist", "clinician",
                   "physician", "optometrist", "radiologist", "paramedic", "midwife", "veterinarian", "psychiatrist", "counselling"],

    "Finance": ["analyst", "accountant", "auditor", "banker", "financial", "investment", "controller", "broker",
                "consultant", "treasurer", "loan officer", "trader", "actuary", "economist", "portfolio", "credit"],

    "Marketing": ["manager", "executive", "specialist", "consultant", "advertising", "public relations", "strategist",
                  "director", "coordinator", "brand", "SEO", "content", "digital", "market research", "social media",
                  "copywriter"],

    "Manufacturing": ["operator", "mechanic", "assembler", "fabricator", "engineer", "technician", "welder",
                      "planner", "quality", "machinist", "production", "inspector", "supervisor", "foreman",
                      "toolmaker", "CNC"],

    "Retail": ["cashier", "salesperson", "store", "associate", "manager", "clerk", "shopkeeper", "merchandiser",
               "assistant", "retail", "customer service", "sales", "inventory", "buyer", "stocker", "checkout"],

    "Legal": ["lawyer", "attorney", "paralegal", "judge", "legal", "solicitor", "notary", "clerk", "litigator",
              "advocate", "barrister", "magistrate", "prosecutor", "defense", "compliance"],

    "Hospitality": ["chef", "waiter", "bartender", "host", "manager", "receptionist", "housekeeper", "concierge",
                    "caterer", "cook", "hotel", "tour guide", "event planner", "sous chef", "sommelier", "valet"],

    "Construction": ["builder", "carpenter", "electrician", "plumber", "architect", "project manager", "site manager",
                     "surveyor", "foreman", "bricklayer", "roofer", "civil engineer", "construction", "contractor",
                     "inspector", "draftsman"]
}

def assign_sector(x):
    """Return the first sector in `sector_bag` with a keyword contained in job title `x`.

    Matching is by substring. Ordinary keywords are compared case-insensitively,
    so a title such as "Accountant, chartered" matches "accountant". All-caps
    acronym keywords ("IT", "SEO", "CNC") stay case-sensitive, because lowercase
    "it" would otherwise match inside unrelated words such as "solicitor".

    Returns "Other" when no keyword matches or when `x` is not a string
    (for example a missing value).
    """
    if not isinstance(x, str):
        return "Other"
    title_lower = x.lower()
    for key, roles in sector_bag.items():
        for role in roles:
            if role.isupper():
                if role in x:
                    return key
            elif role.lower() in title_lower:
                return key
    return "Other"

In [None]:
# Bucket every job title into a sector, then remove the title together with the
# city_pop, zip and city columns.
features["job_sector"] = features["job"].map(assign_sector)
features = features.drop(columns=["job", "city_pop", "zip", "city"])

In [None]:
# Preview the first five rows (head() defaults to n=5).
features.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,state,unix_time,is_fraud,distance,age,job_sector
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,NC,1325376018,0,78.597568,36,Healthcare
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,WA,1325376044,0,30.212176,46,Education
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,ID,1325376051,0,108.206083,62,Other
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,MT,1325376076,0,95.673231,57,Legal
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,VA,1325376186,0,77.556744,38,Healthcare


In [None]:
features["trans_date_trans_time"] = pd.to_datetime(features["trans_date_trans_time"])
# Flag transactions made on Friday, Saturday or Sunday with 1, all others with 0.
# The original compared against lowercase "saturday", which day_name() never
# returns, so every Saturday was flagged 0.
# NOTE(review): Friday is kept because the original listed it — confirm that it
# is meant to count as part of the weekend.
weekend_days = ["Friday", "Saturday", "Sunday"]
features["is_weekend"] = features["trans_date_trans_time"].dt.day_name().isin(weekend_days).astype(int)
def day_period(x):
    """Map an hour value to a named part of the day.

    [0, 6) and (20, 24] -> "Night", [6, 12] -> "Morning",
    (12, 15] -> "Afternoon", (15, 20] -> "Evening".
    Any value outside 0..24 yields None.
    """
    if 0 <= x < 6 or 20 < x <= 24:
        return "Night"
    if 6 <= x <= 12:
        return "Morning"
    if 12 < x <= 15:
        return "Afternoon"
    if 15 < x <= 20:
        return "Evening"
    return None


# Derive the part of day and the month name from the transaction timestamp,
# then remove the timestamp and unix_time columns.
trans_time = features["trans_date_trans_time"]
features["day_period"] = trans_time.dt.hour.map(day_period)
features["trans_month"] = trans_time.dt.month_name()
features = features.drop(columns=["trans_date_trans_time", "unix_time"])




In [None]:
# Number of distinct merchants.
features["merchant"].nunique()

693

In [None]:
# Step 1: Calculate fraud percentages for each merchant
# (share of the merchant's transactions with is_fraud == 1, times 100).
# A single grouped mean over the boolean flag replaces the Python loop over
# every merchant group.
# NOTE(review): the rate is computed from the labels of *all* rows, including
# rows that the later train/test split assigns to the test set, so this feature
# leaks target information into the evaluation. Consider computing it from the
# training rows only.
fraud_percentage_dict = (
    data["is_fraud"].eq(1).groupby(data["merchant"]).mean() * 100
).to_dict()

# Step 2: Map the calculated percentages back to the DataFrame
# (keyed on the merchant column that `features` itself still carries).
features["fraud_merchant_pct"] = features["merchant"].map(fraud_percentage_dict)

In [None]:
# Remove the raw merchant name column and display the resulting frame.
features = features.drop(columns="merchant")
features

Unnamed: 0,category,amt,gender,state,distance,age,job_sector,is_weekend,day_period,trans_month,fraud_merchant_pct
0,8,4.97,0,27,78.597568,36,3,0,3,4,1.357466
1,4,107.23,0,47,30.212176,46,1,0,3,4,0.992063
2,0,220.11,1,13,108.206083,62,8,0,3,4,0.189251
3,2,45.00,1,26,95.673231,57,5,0,3,4,0.241611
4,9,41.96,1,45,77.556744,38,3,0,3,4,0.305677
...,...,...,...,...,...,...,...,...,...,...,...
555714,5,43.77,1,24,77.026148,58,6,0,3,2,0.246002
555715,7,111.84,1,43,100.074420,24,2,0,3,2,0.182482
555716,7,86.88,0,47,80.759302,42,8,0,3,2,0.182260
555717,13,7.99,1,13,52.933240,58,8,0,3,2,0.261780


In [None]:
#Model Training

In [None]:
# Integer-encode each categorical column with its own LabelEncoder.
# Every column is read from `features` (the original encoded `category` from
# `data`, and fit_transform returns a plain array that is assigned by position,
# so the two frames had to hold identical rows in identical order).
# The fitted encoders are kept per column so codes can be mapped back to labels
# with encoders[column].inverse_transform(...).
categorical_columns = ["category", "gender", "state", "job_sector", "day_period", "trans_month"]
encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    features[column] = encoder.fit_transform(features[column])
    encoders[column] = encoder


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
x_train

Unnamed: 0,category,amt,gender,state,distance,age,job_sector,is_weekend,day_period,trans_month,fraud_merchant_pct
1273644,4,166.80,1,33,118.568453,78,8,0,2,6,1.139839
601398,11,28.86,0,4,132.208394,53,8,0,1,11,0.988287
999645,2,37.93,0,44,31.778845,34,4,1,3,3,0.500263
1180310,9,18.70,1,15,133.995217,65,8,1,3,8,0.086319
213847,8,33.54,0,43,57.125396,53,3,0,3,11,1.271186
...,...,...,...,...,...,...,...,...,...,...,...
259178,9,2.33,0,6,113.247585,59,1,0,2,8,0.365579
117739,13,9.12,0,25,65.330250,37,8,0,1,1,0.166528
131932,6,118.27,0,32,35.642839,46,8,0,3,7,0.116788
671155,12,5.60,1,23,116.557377,82,8,1,2,10,0.684117


In [None]:
# Fit the scaler on the training split only, then apply the same transform to
# both splits (the names are rebound to the scaled arrays).
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)
x_train

array([[-0.57037309,  0.60246272,  1.10100093, ...,  0.18988203,
         0.21417957,  1.24983817],
       [ 1.21910512, -0.256872  , -0.90826445, ..., -0.73648635,
         1.66662897,  0.94375783],
       [-1.08165258, -0.20036797, -0.90826445, ...,  1.1162504 ,
        -0.65729006, -0.04187526],
       ...,
       [-0.0590936 ,  0.3001319 , -0.90826445, ...,  1.1162504 ,
         0.50466945, -0.81635742],
       [ 1.47474486, -0.40177649,  1.10100093, ...,  0.18988203,
         1.37613909,  0.32944258],
       [ 0.96346537,  0.44005257, -0.90826445, ..., -0.73648635,
         0.50466945, -0.59905479]])

In [None]:
def evaluate_model(model, x_train, x_test, y_test, y_train):
    """Fit `model` on the training split and print its test-split metrics.

    Calls `model.fit(x_train, y_train)`, predicts on `x_test`, then prints the
    accuracy (four decimals) followed by the full classification report.
    Mind the parameter order: `y_test` comes before `y_train`.
    Nothing is returned.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")

    print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")
    print(".............")


In [None]:
# Train and evaluate an XGBoost classifier with default hyperparameters.
# Keyword arguments make the y_test / y_train order of evaluate_model explicit.
xgb_model = xgb.XGBClassifier(random_state=42)
evaluate_model(
    model=xgb_model,
    x_train=x_train,
    x_test=x_test,
    y_test=y_test,
    y_train=y_train,
)

XGBClassifier Accuracy: 0.9980

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.86      0.74      0.80      1953

    accuracy                           1.00    370479
   macro avg       0.93      0.87      0.90    370479
weighted avg       1.00      1.00      1.00    370479

.............
