In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE 

In [7]:


# Full path to the CSV file
file_path = "/Users/shivangirai/Downloads/creditcard_2023.csv"

# Load the file into a DataFrame
df = pd.read_csv(file_path)



In [8]:
df.head(10)

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0
5,5,0.025302,-0.140514,1.191138,-0.707979,0.43049,0.458973,0.61105,-0.092629,0.180811,...,-0.187739,-0.538518,-0.050465,-0.631553,-0.45648,0.25267,0.066681,0.095812,6901.49,0
6,6,1.016482,-0.397181,0.497868,-0.144463,0.331022,0.629243,0.431262,-0.134007,0.796159,...,-0.171137,-0.287017,-0.178197,-1.297597,1.182503,-0.604228,-0.198163,-0.087619,18954.45,0
7,7,-0.051306,-0.007194,1.139941,-0.87788,0.684668,0.714326,0.892615,-0.908409,0.901938,...,0.620676,-0.920426,0.03466,-1.091527,-0.742075,-0.104863,-1.382522,-2.748268,12298.23,0
8,8,-0.13068,-0.349547,0.425786,-0.760444,1.702777,2.324816,0.568968,0.0491,0.273118,...,-0.132787,-0.2847,-0.227779,2.248754,0.534846,-0.929738,-0.224385,0.24379,22052.9,0
9,9,0.058419,-0.093507,1.11727,-0.735172,0.466111,0.332371,0.683425,-0.136674,0.096409,...,-0.203634,-0.601581,-0.145082,-0.654783,-0.196621,0.226818,0.057119,0.100629,210.35,0


In [9]:
df.shape

(568630, 31)

In [10]:
# Check for missing values in each column
print(df.isnull().sum())



id        0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [11]:
# Check for duplicate rows
duplicates = df.duplicated()
print(duplicates.sum())  # Number of duplicate rows




0


In [12]:
# Check for duplicate rows
duplicates = df.duplicated()
print(duplicates.sum())  # Number of duplicate rows




0


In [13]:
# Select numerical columns to check for outliers
columns_to_check = df.select_dtypes(include=['number']).columns

# Detect and remove outliers for each column
for col in columns_to_check:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # Define the bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out rows where the column has outliers
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

print("Dataset after removing outliers:")
print(df)


Dataset after removing outliers:
            id        V1        V2        V3        V4        V5        V6  \
1            1  0.985100 -0.356045  0.558056 -0.429654  0.277140  0.428605   
4            4 -0.206820 -0.165280  1.527053 -0.448293  0.106125  0.530549   
5            5  0.025302 -0.140514  1.191138 -0.707979  0.430490  0.458973   
6            6  1.016482 -0.397181  0.497868 -0.144463  0.331022  0.629243   
9            9  0.058419 -0.093507  1.117270 -0.735172  0.466111  0.332371   
...        ...       ...       ...       ...       ...       ...       ...   
568611  568611 -0.615271  0.186434 -0.030647  0.214008 -0.253675  0.349331   
568612  568612  0.941162 -0.281207  0.502989 -0.052894  0.312189  0.337272   
568616  568616 -0.120366  0.166819 -0.270633 -0.353998  0.468713 -0.321819   
568627  568627 -0.311997 -0.004095  0.137526 -0.035893 -0.042291  0.121098   
568628  568628  0.636871 -0.516970 -0.300889 -0.144480  0.131042 -0.294148   

              V7        V8    

In [None]:
# the rows before and after removing the outliers remain the sme  which means there were no  outliers in the dataset 
# the data set is normally distribtued   ,with no cleaning , preproocessing needed .

In [15]:
# Select only numerical columns
numeric_columns = df.select_dtypes(include=['number']).columns

#corelation  heatmap 
plt.figure(figsize=(10, 8))
correlation_matrix = df[numeric_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


NameError: name 'plt' is not defined

In [None]:

# from the image  class and id have the strong corelation with value of 0.8 
# from the code , we  get the same result . 

In [16]:
import pandas as pd

# Assuming `df` is your dataset
correlation_matrix = df.corr()

# Define a threshold for strong correlation
threshold = 0.8

# Find pairs of strongly correlated columns
strong_pairs = []

# Iterate over the matrix
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            col1 = correlation_matrix.columns[i]
            col2 = correlation_matrix.columns[j]
            strong_pairs.append((col1, col2, correlation_matrix.iloc[i, j]))

# Display strongly correlated column pairs
print("Strongly correlated column pairs (|correlation| > 0.8):")
for pair in strong_pairs:
    print(f"{pair[0]} and {pair[1]} with correlation: {pair[2]:.2f}")


Strongly correlated column pairs (|correlation| > 0.8):
Class and id with correlation: 0.84


In [17]:
#removing unnecessary columns which are not useful for the model prediction 
# Drop the 'id' column (not useful for predictions)
df = df.drop(columns=['id'])

In [18]:
df.shape

(245934, 30)

In [19]:
# Check class distribution
print("Class distribution before resampling:")
print(df['Class'].value_counts(normalize=True))

Class distribution before resampling:
Class
0    0.736222
1    0.263778
Name: proportion, dtype: float64


In [20]:

# Separate features and target variable
X = df.drop(columns=['Class'])
y = df['Class']

In [21]:
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [22]:
# check if there is class imbalance 
print(y.value_counts())


Class
0    181062
1     64872
Name: count, dtype: int64


In [None]:
# use smote- SMOTE is designed for classification tasks where the target variable has categorical classes (e.g., 0 and 1). 


In [23]:
# Handle Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Check class distribution after resampling
print("Class distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts(normalize=True))


Class distribution after SMOTE:
Class
0    0.5
1    0.5
Name: proportion, dtype: float64


In [24]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled
)

In [25]:
#applying ml models
# Train Logistic Regression Model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

# Train Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate Models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("AUC-ROC Score:")
    print(roc_auc_score(y_test, y_proba))

# Evaluate Logistic Regression
print("Logistic Regression Evaluation:")
evaluate_model(lr_model, X_test, y_test)

# Evaluate Random Forest
print("\nRandom Forest Evaluation:")
evaluate_model(rf_model, X_test, y_test)


Logistic Regression Evaluation:
Confusion Matrix:
[[52675  1644]
 [ 2647 51672]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     54319
           1       0.97      0.95      0.96     54319

    accuracy                           0.96    108638
   macro avg       0.96      0.96      0.96    108638
weighted avg       0.96      0.96      0.96    108638

AUC-ROC Score:
0.9932178759917876

Random Forest Evaluation:
Confusion Matrix:
[[54313     6]
 [    4 54315]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     54319
           1       1.00      1.00      1.00     54319

    accuracy                           1.00    108638
   macro avg       1.00      1.00      1.00    108638
weighted avg       1.00      1.00      1.00    108638

AUC-ROC Score:
0.9999999525512798


In [27]:
# Logistic Regression Model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
logistic_y_pred = logistic_model.predict(X_test)

# Random Forest Model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
rf_y_pred = random_forest_model.predict(X_test)

# Calculate and display metrics for Logistic Regression
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)
logistic_conf_matrix = confusion_matrix(y_test, logistic_y_pred)
logistic_classification_report = classification_report(y_test, logistic_y_pred)
logistic_auc_roc = roc_auc_score(y_test, logistic_model.predict_proba(X_test)[:, 1])

print("Logistic Regression Evaluation:")
print(f"Accuracy: {logistic_accuracy}")
print("Confusion Matrix:")
print(logistic_conf_matrix)
print("Classification Report:")
print(logistic_classification_report)
print(f"AUC-ROC Score: {logistic_auc_roc}")

# Calculate and display metrics for Random Forest
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)
rf_classification_report = classification_report(y_test, rf_y_pred)
rf_auc_roc = roc_auc_score(y_test, random_forest_model.predict_proba(X_test)[:, 1])

print("\nRandom Forest Evaluation:")
print(f"Accuracy: {rf_accuracy}")
print("Confusion Matrix:")
print(rf_conf_matrix)
print("Classification Report:")
print(rf_classification_report)
print(f"AUC-ROC Score: {rf_auc_roc}")


NameError: name 'accuracy_score' is not defined

In [None]:
# nlp and sigmoid function is nto typically relevant  when the data is cateogorical or numerical 
# in most real life datasets/banks neural network with a siggmoid activation is used for binary classification 


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


In [28]:
# Build the Neural Network
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [32]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


In [2]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


NameError: name 'model' is not defined

In [33]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 419us/step - accuracy: 0.9789 - loss: 0.0537 - val_accuracy: 0.9986 - val_loss: 0.0047
Epoch 2/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 410us/step - accuracy: 0.9989 - loss: 0.0041 - val_accuracy: 0.9994 - val_loss: 0.0023
Epoch 3/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 404us/step - accuracy: 0.9993 - loss: 0.0026 - val_accuracy: 0.9985 - val_loss: 0.0037
Epoch 4/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 399us/step - accuracy: 0.9994 - loss: 0.0022 - val_accuracy: 0.9995 - val_loss: 0.0021
Epoch 5/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 399us/step - accuracy: 0.9996 - loss: 0.0016 - val_accuracy: 0.9997 - val_loss: 0.0013
Epoch 6/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 413us/step - accuracy: 0.9997 - loss: 0.0013 - val_accuracy: 0.9998 - val_loss: 0.0017
Epoc

In [34]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
y_pred_proba = model.predict(X_test).ravel()
y_pred = (y_pred_proba > 0.5).astype(int)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAUC-ROC Score:")
print(roc_auc_score(y_test, y_pred_proba))

Epoch 1/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 405us/step - accuracy: 0.9999 - loss: 5.1416e-04 - val_accuracy: 0.9999 - val_loss: 8.1176e-04
Epoch 2/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 400us/step - accuracy: 0.9999 - loss: 4.6522e-04 - val_accuracy: 0.9998 - val_loss: 0.0014
Epoch 3/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 399us/step - accuracy: 0.9999 - loss: 5.2005e-04 - val_accuracy: 0.9998 - val_loss: 0.0015
Epoch 4/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 791us/step - accuracy: 0.9999 - loss: 3.1581e-04 - val_accuracy: 0.9998 - val_loss: 0.0015
Epoch 5/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 924us/step - accuracy: 0.9999 - loss: 5.2526e-04 - val_accuracy: 0.9998 - val_loss: 0.0014
Epoch 6/20
[1m7922/7922[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 896us/step - accuracy: 0.9999 - loss: 3.9738e-04 - val_accuracy: 0.

In [35]:
# Number of people predicted to default the loan
num_defaults_predicted = np.sum(y_pred)
print(f"Number of people predicted to default the loan: {num_defaults_predicted}")


Number of people predicted to default the loan: 54336


In [36]:
# Get the indices of people who actually defaulted
defaulted_indices = np.where(y_test == 1)[0]

# Extract the corresponding rows from X_test (or the original dataset if available)
defaulted_people = X_test[defaulted_indices]

print(f"Data of people who actually defaulted:")
print(defaulted_people)



Data of people who actually defaulted:
[[-0.9017357   2.09633824 -1.62956879 ...  1.81602589  1.73059379
  -0.37512192]
 [-0.31418966  1.01995772 -0.7482041  ...  2.5603955   2.46570905
   0.72508907]
 [-0.49626944  0.76306418  0.58407262 ... -1.68186837 -2.48473857
   1.28269432]
 ...
 [-0.70939114  1.09435523 -1.47245634 ...  0.61733484 -0.64453649
  -0.72963738]
 [-0.27874117  0.73323769 -0.87454051 ...  2.57679198  2.29167165
   0.82007545]
 [-0.82870868  1.15105142 -0.78987442 ...  2.73887101  2.08568559
   0.68634565]]


In [39]:
import pandas as pd

# Assuming X_test is originally a DataFrame and has column names
defaulted_people_df = pd.DataFrame(defaulted_people, columns=X_test.columns)

# Display the DataFrame
print("Data of people who actually defaulted:")
print(defaulted_people_df)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [40]:
# If X_test doesn't have column names, add placeholders
feature_names = [f"Feature_{i+1}" for i in range(defaulted_people.shape[1])]

# Convert to DataFrame
defaulted_people_df = pd.DataFrame(defaulted_people, columns=feature_names)

# Display the DataFrame
print("Data of people who actually defaulted:")
print(defaulted_people_df)


Data of people who actually defaulted:
       Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0      -0.901736   2.096338  -1.629569   1.524466  -0.619094  -1.249584   
1      -0.314190   1.019958  -0.748204   1.656438  -1.079957  -0.370960   
2      -0.496269   0.763064   0.584073   1.100146   2.133497   1.218843   
3       0.558697   0.094483  -0.121275   0.227360   0.331024  -0.121455   
4      -1.007599   1.473444  -1.821929   1.247968  -2.285935  -0.280188   
...          ...        ...        ...        ...        ...        ...   
54314  -0.405782  -0.633361  -0.726678  -0.078958   2.118533  -0.666908   
54315  -0.658094   0.416170   0.455085   0.563390   1.078105  -0.330626   
54316  -0.709391   1.094355  -1.472456   1.239427  -0.963452  -0.266124   
54317  -0.278741   0.733238  -0.874541   1.056962  -1.052403  -0.019213   
54318  -0.828709   1.151051  -0.789874   1.636503  -1.405145   0.351638   

       Feature_7  Feature_8  Feature_9  Feature_10  ...  Fea

In [41]:
# Assuming the original dataset is called `X` and it is a DataFrame
# Replace X with your actual original DataFrame variable name

# Get the column names from the original dataset
column_names = X.columns

# Convert the extracted data of defaulted people to a DataFrame
defaulted_df = pd.DataFrame(defaulted_people, columns=column_names)

# Save to CSV or display the DataFrame
defaulted_df.to_csv("defaulted_people.csv", index=False)

# Print the DataFrame
print("Data of people who actually defaulted:")
print(defaulted_df)


Data of people who actually defaulted:
             V1        V2        V3        V4        V5        V6        V7  \
0     -0.901736  2.096338 -1.629569  1.524466 -0.619094 -1.249584 -2.289440   
1     -0.314190  1.019958 -0.748204  1.656438 -1.079957 -0.370960 -1.675038   
2     -0.496269  0.763064  0.584073  1.100146  2.133497  1.218843  1.584960   
3      0.558697  0.094483 -0.121275  0.227360  0.331024 -0.121455  0.382132   
4     -1.007599  1.473444 -1.821929  1.247968 -2.285935 -0.280188  0.397551   
...         ...       ...       ...       ...       ...       ...       ...   
54314 -0.405782 -0.633361 -0.726678 -0.078958  2.118533 -0.666908  0.130273   
54315 -0.658094  0.416170  0.455085  0.563390  1.078105 -0.330626  0.964137   
54316 -0.709391  1.094355 -1.472456  1.239427 -0.963452 -0.266124 -1.385008   
54317 -0.278741  0.733238 -0.874541  1.056962 -1.052403 -0.019213 -1.856736   
54318 -0.828709  1.151051 -0.789874  1.636503 -1.405145  0.351638 -2.215424   

            

In [37]:
# Get the indices of people predicted to default
predicted_default_indices = np.where(y_pred == 1)[0]

# Extract the corresponding rows from X_test (or the original dataset if available)
predicted_defaulted_people = X_test[predicted_default_indices]

print(f"Data of people predicted to default:")
print(predicted_defaulted_people)


Data of people predicted to default:
[[-0.9017357   2.09633824 -1.62956879 ...  1.81602589  1.73059379
  -0.37512192]
 [-0.31418966  1.01995772 -0.7482041  ...  2.5603955   2.46570905
   0.72508907]
 [-0.49626944  0.76306418  0.58407262 ... -1.68186837 -2.48473857
   1.28269432]
 ...
 [-0.70939114  1.09435523 -1.47245634 ...  0.61733484 -0.64453649
  -0.72963738]
 [-0.27874117  0.73323769 -0.87454051 ...  2.57679198  2.29167165
   0.82007545]
 [-0.82870868  1.15105142 -0.78987442 ...  2.73887101  2.08568559
   0.68634565]]


In [38]:
import pandas as pd

# Convert extracted data to DataFrame (assume X_test is a DataFrame)
defaulted_df = pd.DataFrame(defaulted_people, columns=X_test.columns)

# Save to CSV
defaulted_df.to_csv("defaulted_people.csv", index=False)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'