# Inter-Uni Datathon: Fraudulent Transaction Detection

#### Team Habaybi

---

## Part 1: Data Cleaning

In [417]:
# import required packages

import pandas as pd
import warnings
import numpy as np

warnings.filterwarnings('ignore')


# Import Train and Test Datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [418]:
train.describe()

Unnamed: 0,TransactionNumber,UserID,Age,NumDependents,Latitude,Longitude,UserTenure,IsFraud
count,10966.0,10966.0,10966.0,10966.0,10923.0,10923.0,10966.0,10966.0
mean,9117.500091,2483.885282,2751.845887,1.995714,-30.363255,141.254786,60.786157,0.364308
std,5293.66964,1429.402615,9730.988917,1.408035,6.962819,11.268395,34.254477,0.481257
min,1.0,1.0,-68.0,0.0,-41.640079,-112.02605,1.0,0.0
25%,4516.25,1253.0,26.0,1.0,-37.0201,142.702789,31.0,0.0
50%,9120.5,2471.0,34.0,2.0,-31.840233,144.9646,61.0,0.0
75%,13710.75,3727.0,43.0,3.0,-25.042261,145.612793,90.0,1.0
max,18277.0,5000.0,67000.0,4.0,57.85158,149.012375,119.0,1.0


In [419]:
# Show columns in dataset
train.columns

Index(['TransactionNumber', 'UserID', 'Age', 'Gender', 'Occupation',
       'EducationLevel', 'MaritalStatus', 'NumDependents', 'Income',
       'Expenditure', 'GiftsTransaction', 'TransactionDate', 'TransactionTime',
       'TransactionAmount', 'MerchantID', 'TransactionType',
       'TransactionLocation', 'DeviceType', 'Latitude', 'Longitude',
       'EmailDomain', 'Terrorism', 'UserTenure', 'IsFraud'],
      dtype='object')

##### Clean columns with Financial Data: Complete Currency Conversion to ensure data consistency

In [420]:

finance_cols = ['Income', 'Expenditure', 'GiftsTransaction', 'TransactionAmount']
finance_train = train[['Income', 'Expenditure', 'GiftsTransaction', 'TransactionAmount']]
finance_test = test[['Income', 'Expenditure', 'GiftsTransaction', 'TransactionAmount']]

def clean_cash(data):
    """
    cleans all columns which contain financial data. Including conversion to AUD, then conversion to float datatype.
    """
    finance_cols = ['Income', 'Expenditure', 'GiftsTransaction', 'TransactionAmount']

    for col in finance_cols:

        data.columns = data.columns.str.replace(' ', '')
        data[col] = data[col].str.replace("AU$", "AUD", regex=False)
        data[col] = data[col].str.replace(" ", "", regex=False)
        data[f'{col}2'] = data[col]
        data[f'{col}Currency'] = data[col].apply(lambda x: 'AUD' if 'AUD' in str(x) else ('AED' if 'AED' in str(x) else ('GBP' if '£' in str(x) else ('GBP' if 'GBP' in str(x) else ''))))
        data[f'{col}2'] = data[f'{col}2'].str.replace("AUD", "", regex=False).str.strip()
        data[f'{col}2'] = data[f'{col}2'].str.replace("AED", "", regex=False).str.strip()
        data[f'{col}2'] = data[f'{col}2'].str.replace("£", "", regex=False).str.strip()
        data[f'{col}2'] = data[f'{col}2'].str.replace("GBP", "", regex=False).str.strip()
        data[f'{col}2'] = data[f'{col}2'].astype(float).round(2)

    data = data.drop(data.columns[:4], axis=1)
    data.columns = data.columns.str.replace('2', '')


    for col in finance_cols:
        data.loc[data[f'{col}Currency'] == 'AED', col] = (data[col] * 0.41).round(2)
        data.loc[data[f'{col}Currency'] == 'GBP', col] = (data[col] * 1.96).round(2)

    data = data[finance_cols]

    return data

cash_train_cleaned = clean_cash(finance_train)
cash_test_cleaned = clean_cash(finance_test)


In [421]:
train = train.drop(columns=finance_cols)
train = pd.concat([train, cash_train_cleaned], axis=1)

test = test.drop(columns=finance_cols)
test = pd.concat([test, cash_test_cleaned], axis=1)

##### Clean DeviceType Column

In [422]:
 # cleaning device column
items_to_be_grouped = ["mob", "galaxys7", "iphone 15", "android", "smartphone"]

for device in items_to_be_grouped:
    train['DeviceType'] = np.where(train['DeviceType'] == device, "Mobile", train["DeviceType"])
    test['DeviceType'] = np.where(test['DeviceType'] == device, "Mobile", test["DeviceType"])

##### Clean Age Column

In [423]:
# cleaning Age column
train['Age'] = train['Age'].apply(lambda x: abs(x) if x < 0 else x / 1000 if x > 100 else x)
test['Age'] = test['Age'].apply(lambda x: abs(x) if x < 0 else x / 1000 if x > 100 else x)

train['Age'] = train['Age'].astype(int)
test['Age'] = test['Age'].astype(int)

In [424]:

replace_female = ["Female", "fem", "she", "woman", "isnotmale"]
replace_male = ["Male", "he", "man", "isnotfemale"]

def fix_gender(data):
    for index, value in data["Gender"].items():
        if value in replace_female:
            data.at[index, "Gender"] = "Female"
        elif value in replace_male:
            data.at[index, "Gender"] = "Male"

    return data

train = fix_gender(train)
test = fix_gender(test)


##### One-Hot Encode Terrorism Column

In [425]:
# converting terrorism column from bool to int
train["Terrorism"] = train["Terrorism"].astype(int)
test["Terrorism"] = test["Terrorism"].astype(int)

##### Clean Location

In [426]:

replace_canberra = ["Canberra", "canberra", "CBR", "Cbr"]
replace_darwin = ["Darwin", "Drw", "DRW", "darwin"]
replace_adelaide = ["Adelaide", "Adl", "Adelaide City", "adl"]
replace_sydney = ["Sydney", "SYD", "Syd", "sydney"]
replace_hobart = ["Hobart", "hobart", "HBT", "Hbt"]
replace_brisbane = ["Brisbane", "BNE", "brisbane", "bne", "Bne"]
replace_perth = ["Perth", "perth", "PTH", "pth", "Pth"]
replace_melbourne = ["Melbourne", "Melb", "melbourne", "Mel", "MLB", "Melburn"]

def replace_location(column, replace_canberra, replace_darwin, replace_adelaide, replace_sydney, replace_hobart, replace_brisbane, replace_perth, replace_melbourne):
    """
    Replace different versions of city to 1 format. Ensure uniformity
    """
    replace_dict = {}
    replace_dict.update({value: "Canberra" for value in replace_canberra})
    replace_dict.update({value: "Darwin" for value in replace_darwin})
    replace_dict.update({value: "Adelaide" for value in replace_adelaide})
    replace_dict.update({value: "Sydney" for value in replace_sydney})
    replace_dict.update({value: "Hobart" for value in replace_hobart})
    replace_dict.update({value: "Brisbane" for value in replace_brisbane})
    replace_dict.update({value: "Perth" for value in replace_perth})
    replace_dict.update({value: "Melbourne" for value in replace_melbourne})
    return column.replace(replace_dict)

train['TransactionLocation'] = replace_location(train['TransactionLocation'],
													replace_canberra,
													replace_darwin,
													replace_adelaide,
													replace_sydney,
													replace_hobart,
													replace_brisbane,
													replace_perth,
													replace_melbourne)

test['TransactionLocation'] = replace_location(test['TransactionLocation'],
													replace_canberra,
													replace_darwin,
													replace_adelaide,
													replace_sydney,
													replace_hobart,
													replace_brisbane,
													replace_perth,
													replace_melbourne)

##### Clean Transaction Time

In [427]:
# Function to clean time data
def clean_time_column(column):
    """Clean columns with dateTime to ensure uniformity"""
    # First, replace any invalid separators like '/' with ':'
    column = column.str.replace('/', ':', regex=False)

    # Try converting to datetime in 12-hour format first
    dt = pd.to_datetime(column, format='%I:%M:%S %p', errors='coerce')

    # If there are any NaT values, try converting to datetime in 24-hour format
    dt = dt.fillna(pd.to_datetime(column, format='%H:%M:%S', errors='coerce'))

    # Return as strings in 24-hour format
    return dt.dt.strftime('%H:%M:%S')

# Clean the 'Time' column
train['Cleaned_TransactionTime'] = clean_time_column(train['TransactionTime'])
test['Cleaned_TransactionTime'] = clean_time_column(test['TransactionTime'])



##### Divide Transaction Time into 4 different groups: Night, Morning, Afternoon and Night


In [428]:
# Define the start and end times for different parts of the day
night_start_time = pd.to_datetime('00:00:00').time()
night_end_time = pd.to_datetime('05:59:59').time()

morning_start_time = pd.to_datetime('06:00:00').time()
morning_end_time = pd.to_datetime('11:59:59').time()

afternoon_start_time = pd.to_datetime('12:00:00').time()
afternoon_end_time = pd.to_datetime('17:59:59').time()

evening_start_time = pd.to_datetime('18:00:00').time()
evening_end_time = pd.to_datetime('23:59:59').time()

# Convert the 'Cleaned_TransactionTime' column to datetime
train['Cleaned_TransactionTime'] = pd.to_datetime(train['Cleaned_TransactionTime'])
test['Cleaned_TransactionTime'] = pd.to_datetime(test['Cleaned_TransactionTime'])

# Function to categorize times of the day
def categorise_time_of_day(transaction_time):
    """
    categorise transaction time into a time of day:
    (night 00:00 -> 06:00)
    (morning 06:00 -> 12:00)
    (afternoon 12:00 -> 18:00)
    (evening 18:00 -> 00:00)
    """

    time_of_day = transaction_time.time()  # Extract time from datetime
    if morning_start_time <= time_of_day < morning_end_time:
        return 'Morning'
    elif afternoon_start_time <= time_of_day < afternoon_end_time:
        return 'Afternoon'
    elif evening_start_time <= time_of_day < evening_end_time:
        return 'Evening'
    else:
        return 'Night'

# Apply the categorization function to the 'Cleaned_TransactionTime' column
train['Transaction_Time_of_Day'] = train['Cleaned_TransactionTime'].apply(categorise_time_of_day)
# train = train.drop(columns='Cleaned_TransactionTime')
test['Transaction_Time_of_Day'] = test['Cleaned_TransactionTime'].apply(categorise_time_of_day)
# test = test.drop(columns='Cleaned_TransactionTime')


In [429]:
from geopy.geocoders import Nominatim

city_lat_long_mapping = {
    'Melbourne': {'Latitude': -37.8136, 'Longitude': 144.9631},
    'Sydney': {'Latitude': -33.8688, 'Longitude': 151.2093},
    'Brisbane': {'Latitude': -27.4698, 'Longitude': 153.0251},
    'Hobart': {'Latitude': -42.8821, 'Longitude': 147.3272},
    'Perth': {'Latitude': -31.9505, 'Longitude': 115.8605},
    'Adelaide': {'Latitude': -34.9285, 'Longitude': 138.6007},
    'Darwin': {'Latitude': -12.4634, 'Longitude': 130.8456},
    'Canberra': {'Latitude': -35.2809, 'Longitude': 149.1300}
}

# Function to fill missing lat/long based on city transaction occured in
def fill_missing_lat_long(row):
    if pd.isnull(row['Latitude']) or pd.isnull(row['Longitude']):
        city = row['TransactionLocation']
        if city in city_lat_long_mapping:
            row['Latitude'] = city_lat_long_mapping[city]['Latitude']
            row['Longitude'] = city_lat_long_mapping[city]['Longitude']
    return row

# Apply the function to your dataframe
train = train.apply(fill_missing_lat_long, axis=1)
train.head()



Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,TransactionDate,TransactionTime,...,EmailDomain,Terrorism,UserTenure,IsFraud,Income,Expenditure,GiftsTransaction,TransactionAmount,Cleaned_TransactionTime,Transaction_Time_of_Day
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,2023-03-12,12:25:57,...,jon44@disposable.com,0,113,1,28884.43,14610.61,2058.02,258.14,2024-09-14 12:25:57,Afternoon
1,9645,3386,34,Male,Student,High School,Married,4,2023-03-05,18:27:24,...,emilyreese@gmail.com,0,104,1,54919.07,39169.49,9740.63,34.94,2024-09-14 18:27:24,Evening
2,1145,2971,25,Male,Unemployed,Master,Married,2,2023-11-10,17:16:56,...,fordevan@gmail.com,0,105,0,74728.57,55873.76,2253.71,323.82,2024-09-14 17:16:56,Afternoon
3,15308,2925,25,Male,Professional,High School,Married,3,2023-10-07,00/34/17,...,kathleenlewis@tempmail.com,0,70,1,55712.62,36756.11,4335.7,13.32,2024-09-14 00:34:17,Night
4,14967,2339,38,Male,Professional,High School,Single,4,2023-09-22,06:40:08 PM,...,kristinawhite@gmail.com,0,27,0,53004.7,17876.42,4763.48,467.71,2024-09-14 18:40:08,Evening


In [430]:

import requests
import pandas as pd

# Replace 'YOUR_MAPBOX_ACCESS_TOKEN' with your actual Mapbox access token
MAPBOX_ACCESS_TOKEN = 'pk.eyJ1IjoiZW1ya2VsIiwiYSI6ImNtMTIwM3FpbzB4MWQybXExMTltcTB4NTYifQ.wEA_wlHnlsWVvldP_tz85A'


In [431]:
def get_location_details(lat, long):
    # Build the URL for the Mapbox reverse geocoding API
    url = f'https://api.mapbox.com/geocoding/v5/mapbox.places/{long},{lat}.json?access_token={MAPBOX_ACCESS_TOKEN}'

    try:
        response = requests.get(url, timeout=10)  # Increased timeout
        response.raise_for_status()  # Raise an error for bad responses
        data = response.json()

        if data['features']:
            # Initialize variables to store place name and state
            place_name = ''
            state_name = ''

            # Iterate through the features to extract details
            for feature in data['features'][0]['context']:
                city_name = ''
                if feature['id'].startswith('place'):  # Check for 'place' type
                    place_name = feature['text']
                elif feature['id'].startswith('region'):  # Check for 'locality' type
                    state_name = feature['text']

            return place_name, state_name

    except requests.exceptions.RequestException as e:
        print(f"Error getting location details for lat: {lat}, long: {long}. Error: {e}")

    return '', ''

In [None]:
# Test the function with the DataFrame

place_names = []
state_names = []

head = train.head(5)
for index, row in train.iterrows():
    place, state = get_location_details(row['Latitude'], row['Longitude'])
    place_names.append(place)
    state_names.append(state)

# Assign results back to the DataFrame
train['home_location'] = place_names
train['home_state'] = state_names

# View the updated DataFrame
train.head(10)

In [None]:
# Dictionary to map cities to their states
city_to_state = {
    'Sydney': 'New South Wales',
    'Melbourne': 'Victoria',
    'Brisbane': 'Queensland',
    'Perth': 'Western Australia',
    'Adelaide': 'South Australia',
    'Hobart': 'Tasmania',
    'Canberra': 'Australian Capital Territory',
    'Darwin': 'Northern Territory'
}

# Create a new column for the state based on the city
train['transaction_state'] = train['TransactionLocation'].map(city_to_state)

In [None]:
train['transaction_in_home_state'] = train['transaction_state'] == train['home_state']
train

In [None]:
# Drop unrequired columns
train = train.drop(columns={'EmailDomain', 'TransactionTime'})
test = test.drop(columns={'EmailDomain', 'TransactionTime'})

In [None]:
# exported cleaned data to csv
train.to_csv('train_cleaned.csv')
test.to_csv('test_cleaned.csv')