In [2]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train a small recurrent classifier on the labelled consumers.
#
# NOTE: 'labelled data.csv' is produced by the labelling cell (In [1]) further
# down in this notebook, so that cell has to run before this one on a fresh kernel.

# Load the feature table and the fraud labels.
final_data_short = pd.read_csv('final_data_short_sample.csv')
data_labels = pd.read_csv('labelled data.csv')

# Fill missing values with 0, then key both tables by consumer so rows can be
# matched by id rather than by position. Reassignment (instead of inplace=True)
# leaves the same final bindings while keeping the cell easier to re-run.
final_data_short = final_data_short.fillna(0).set_index('consumer_number')
data_labels = data_labels.set_index('consumer_number')

# Select the feature rows in exactly the order of the label rows so that
# X_labeled and y_labeled stay aligned.
X_labeled = final_data_short.loc[data_labels.index]
y_labeled = data_labels['fraud_status']

# Split BEFORE scaling: the scaler must only see the training rows, otherwise
# the test-set mean/variance leak into the features the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.2, random_state=42
)

# Fit the normalisation on the training rows only and reuse it for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Reshape input for the RNN: [samples, time steps, features].
# Each row (consumer) becomes one sample with a SINGLE time step whose feature
# vector is the whole row, so the recurrent layer processes only one step per sample.
n_features = X_train_scaled.shape[1]
X_train = X_train_scaled.reshape((-1, 1, n_features))
X_test = X_test_scaled.reshape((-1, 1, n_features))

# Kept for any later cell that still refers to the full normalised tensor; it is
# now produced with the training-set scaler.
X_labeled_normalized = scaler.transform(X_labeled).reshape((-1, 1, n_features))

# Build a simple RNN model: one recurrent layer followed by a sigmoid output
# for the binary fraud / non-fraud decision.
model = Sequential()
model.add(SimpleRNN(20, activation='relu', input_shape=(1, n_features)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Evaluate the model on the held-out rows
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')


: 

In [1]:
import pandas as pd
import numpy as np

def count_low_days(readings, delta=2.0):
    """Count, per row, the readings lying more than ``delta`` standard deviations below that row's mean.

    Parameters
    ----------
    readings : pandas.DataFrame
        Numeric table with one row per consumer (presumably one column per
        day -- confirm against the CSV layout).
    delta : float, default 2.0
        Threshold expressed in row standard deviations.

    Returns
    -------
    numpy.ndarray
        Float array with one count per row. Rows with zero or undefined
        standard deviation produce NaN z-scores, which never compare below the
        threshold, so such rows score 0.
    """
    row_mean = readings.mean(axis=1)
    row_std = readings.std(axis=1)
    z_scores = readings.sub(row_mean, axis=0).div(row_std, axis=0)
    # The z-score is already in units of sigma, so it is compared with -delta
    # directly (multiplying the threshold by sigma again would scale it twice).
    return (z_scores < -delta).sum(axis=1).to_numpy(dtype=float)


def rank_by_score(score):
    """Return the 0-based rank of every entry, rank 0 being the highest score.

    ``np.argsort`` yields the indices that would sort the array, not the rank of
    each element, so the permutation is inverted here. A stable sort keeps tied
    entries in their original order.
    """
    order = np.argsort(-np.asarray(score, dtype=float), kind='stable')
    rank = np.empty(len(order), dtype=int)
    rank[order] = np.arange(len(order))
    return rank


# Inside a notebook kernel __name__ is '__main__', so the pipeline below runs as
# usual; the guard only keeps the helpers importable without touching the CSV files.
if __name__ == '__main__':
    data = pd.read_csv('final_data_short_sample.csv')
    delta = 2.0

    consumer_numbers = data['consumer_number'].values
    # drop() returns a new frame, so `data` keeps the id column for the export below.
    readings = data.drop(columns=['consumer_number'])

    # Per-consumer count of abnormally low readings, and the resulting ranking.
    score = count_low_days(readings, delta)
    pos = rank_by_score(score)

    result_df = pd.DataFrame(
        {'ConsumerNumber': consumer_numbers, 'Score': score, 'Rank': pos}
    )
    result_df = result_df.sort_values(by='Rank', ascending=True)

    # Flag the top 10% of consumers (highest score first) as suspected fraud.
    size = int(np.round(0.1 * result_df.shape[0]))
    defaulter_list = result_df.head(size)['ConsumerNumber'].tolist()

    # Write the heuristic labels next to the consumer ids.
    data['fraud_status'] = 0
    data.loc[data['consumer_number'].isin(defaulter_list), 'fraud_status'] = 1
    data[['consumer_number', 'fraud_status']].to_csv('labelled data.csv', index=False)
