In [17]:
import pandas as pd
df = pd.read_csv("creditcard.csv")
print(df.head())


   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 

In [None]:

print(df.isnull().sum())
print(df.info())
print(df['is_fraud'].value_counts())


Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null

In [None]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("Numeric Columns:", list(numeric_cols))
print("Categorical Columns:", list(categorical_cols))


Numeric Columns: ['Unnamed: 0', 'cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']
Categorical Columns: ['trans_date_trans_time', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num']


In [None]:
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])
print(df.isnull().sum())


Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [None]:
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()
for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])
print("Categorical encoding completed!")


Categorical encoding completed!


In [22]:
print(df.columns)


Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [None]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['hour'] = df['trans_date_trans_time'].dt.hour
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
df['month'] = df['trans_date_trans_time'].dt.month
print("Datetime features extracted successfully!")


Datetime features extracted successfully!


In [None]:
df.drop(columns=['Unnamed: 0', 'first', 'last', 'street', 'city', 'trans_num', 'cc_num', 'dob'], inplace=True)
print("Dropped unnecessary columns successfully!")
print(df.columns)


Dropped unnecessary columns successfully!
Index(['trans_date_trans_time', 'merchant', 'category', 'amt', 'gender',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'hour', 'day_of_week', 'month'],
      dtype='object')


In [None]:
print(df['is_fraud'].value_counts())


is_fraud
0    553574
1      2145
Name: count, dtype: int64


In [None]:
print(df.dtypes)


trans_date_trans_time    datetime64[ns]
merchant                          int64
category                          int64
amt                             float64
gender                            int64
state                             int64
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                               int64
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
hour                              int32
day_of_week                       int32
month                             int32
dtype: object


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
df = df.drop(columns=['trans_date_trans_time'])
X = df.drop(columns=['is_fraud'])
y = df['is_fraud'] 
smote = SMOTE(sampling_strategy=0.5, random_state=42)  
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Dataset balanced successfully! üöÄ")
print("New class distribution:", y_resampled.value_counts())


Dataset balanced successfully! üöÄ
New class distribution: is_fraud
0    553574
1    276787
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
X = X_resampled
y = y_resampled
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")


Training Set Size: (664288, 16)
Testing Set Size: (166073, 16)


In [29]:
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier  
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  


In [None]:
lr_model = LogisticRegression(max_iter=500)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

nb_model = GaussianNB()


In [None]:
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
adaboost_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)
print("All models trained successfully! ‚úÖ")


All models trained successfully! ‚úÖ


In [None]:
voting_model = VotingClassifier(
    estimators=[
        ('lr', lr_model),
        ('rf', rf_model),
        ('adaboost', adaboost_model),
        ('gb', gb_model),
        ('nb', nb_model)
    ],
    voting='hard' 
)
voting_model.fit(X_train, y_train)
print("Voting classifier trained successfully! üöÄ")


Voting classifier trained successfully! üöÄ


In [None]:
y_pred = voting_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Classifier Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Voting Classifier Accuracy: 0.9140
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94    110946
           1       0.95      0.78      0.86     55127

    accuracy                           0.91    166073
   macro avg       0.93      0.88      0.90    166073
weighted avg       0.92      0.91      0.91    166073

Confusion Matrix:
 [[108657   2289]
 [ 12000  43127]]


In [None]:
import pickle
with open("voting_model.pkl", "wb") as model_file:
    pickle.dump(voting_model, model_file)

print("Trained model saved successfully! üéØ")


Trained model saved successfully! üéØ


In [None]:
import pandas as pd
new_transaction = pd.DataFrame([{
    'merchant': 10, 
    'category': 3,  
    'amt': 120.50,
    'gender': 1,    
    'state': 5,     
    'zip': 12345,
    'lat': 40.7128,
    'long': -74.0060,
    'city_pop': 8500000,
    'job': 7,       
    'unix_time': 1.7e9,  
    'merch_lat': 40.7306,
    'merch_long': -73.9352,
    'hour': 14,
    'day_of_week': 3,
    'month': 6
}])
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
new_transaction[['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'hour', 'day_of_week', 'month']] = scaler.fit_transform(
    new_transaction[['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'hour', 'day_of_week', 'month']])

print("‚úÖ New Transaction Ready for Prediction!")


‚úÖ New Transaction Ready for Prediction!


In [None]:
import numpy as np
import pandas as pd
import pickle
with open("voting_model.pkl", "rb") as file:
    voting_model = pickle.load(file)

print("‚úÖ Model Loaded Successfully!")


‚úÖ Model Loaded Successfully!


In [None]:
prediction = voting_model.predict(new_transaction)
if prediction[0] == 1:
    print("üö® ALERT: Fraudulent Transaction Detected! üö®")
else:
    print("‚úÖ Transaction is Legitimate.")


‚úÖ Transaction is Legitimate.


In [None]:
import pickle
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("‚úÖ Scaler Saved!")


‚úÖ Scaler Saved!


In [39]:
with open("voting_model.pkl", "wb") as model_file:
    pickle.dump(voting_model, model_file)

print("‚úÖ Model Saved!")


‚úÖ Model Saved!


In [None]:
with open("voting_model.pkl", "rb") as file:
    voting_model = 
    pickle.load(file)
with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

print("‚úÖ Model and Scaler Loaded Successfully!")


‚úÖ Model and Scaler Loaded Successfully!


In [41]:
import pandas as pd
import numpy as np

# Generate multiple test transactions (modify as needed)
new_transactions = pd.DataFrame([
    {
        'merchant': np.random.randint(1, 20),
        'category': np.random.randint(1, 10),
        'amt': round(np.random.uniform(1, 5000), 2),
        'gender': np.random.choice([0, 1]),
        'state': np.random.randint(1, 50),
        'zip': np.random.randint(10000, 99999),
        'lat': round(np.random.uniform(-90, 90), 6),
        'long': round(np.random.uniform(-180, 180), 6),
        'city_pop': np.random.randint(1000, 10000000),
        'job': np.random.randint(1, 20),
        'unix_time': np.random.randint(1.5e9, 1.8e9),
        'merch_lat': round(np.random.uniform(-90, 90), 6),
        'merch_long': round(np.random.uniform(-180, 180), 6),
        'hour': np.random.randint(0, 24),
        'day_of_week': np.random.randint(0, 7),
        'month': np.random.randint(1, 12)
    }
    for _ in range(10)  # Generate 10 random transactions
])

# Scale the numerical features using the preloaded scaler
numerical_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'hour', 'day_of_week', 'month']
new_transactions[numerical_features] = scaler.transform(new_transactions[numerical_features])

print("‚úÖ New Transactions Ready for Prediction!")


‚úÖ New Transactions Ready for Prediction!


In [42]:
# Predict fraud or not
predictions = voting_model.predict(new_transactions)

# Add predictions to the DataFrame
new_transactions['fraud_prediction'] = predictions

# Display results
print(new_transactions[['amt', 'category', 'fraud_prediction']])

# Count fraud vs non-fraud cases
fraud_count = (new_transactions['fraud_prediction'] == 1).sum()
legit_count = (new_transactions['fraud_prediction'] == 0).sum()

print(f"üö® Fraudulent Transactions Detected: {fraud_count}")
print(f"‚úÖ Legitimate Transactions: {legit_count}")


       amt  category  fraud_prediction
0  3014.40         1                 0
1  3644.16         6                 0
2  3044.82         2                 0
3  1755.35         5                 0
4  3505.08         5                 0
5  1536.49         8                 0
6  3561.55         1                 0
7   -84.15         2                 0
8  1420.16         3                 1
9    57.13         9                 0
üö® Fraudulent Transactions Detected: 1
‚úÖ Legitimate Transactions: 9


In [None]:
real_transactions = X_test.sample(10, random_state=42)  # Random selection
real_labels = y_test.loc[real_transactions.index]  # Get actual fraud labels


In [None]:
real_predictions = voting_model.predict(real_transactions)
real_transactions['predicted_fraud'] = real_predictions
real_transactions['actual_fraud'] = real_labels
print(real_transactions[['amt', 'category', 'predicted_fraud', 'actual_fraud']])


               amt  category  predicted_fraud  actual_fraud
221390   11.120000         1                0             0
563007  545.095319         6                1             1
807610  114.101080         4                0             1
203865    3.940000        12                0             0
374963   27.960000        10                0             0
219961    6.880000        11                0             0
496519    2.650000         8                0             0
93985    74.200000         2                0             0
757062  330.312604         4                1             1
457034   78.710000         5                0             0


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
accuracy = accuracy_score(real_labels, real_predictions)
precision = precision_score(real_labels, real_predictions)
recall = recall_score(real_labels, real_predictions)
f1 = f1_score(real_labels, real_predictions)

print(f"‚úÖ Model Evaluation on Real Transactions")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(real_labels, real_predictions))


‚úÖ Model Evaluation on Real Transactions
Accuracy: 0.9000
Precision: 1.0000
Recall: 0.6667
F1 Score: 0.8000

Confusion Matrix:
[[7 0]
 [1 2]]


In [50]:
import pandas as pd

# Select 10 random real transactions from the test set
real_transactions = X_test.sample(10, random_state=42)  

# Get actual fraud labels
real_labels = y_test.loc[real_transactions.index]  


In [51]:
# Make predictions
real_predictions = voting_model.predict(real_transactions)

# Add predictions to DataFrame
real_transactions['predicted_fraud'] = real_predictions
real_transactions['actual_fraud'] = real_labels

# Display results
print(real_transactions[['amt', 'category', 'predicted_fraud', 'actual_fraud']])


               amt  category  predicted_fraud  actual_fraud
221390   11.120000         1                0             0
563007  545.095319         6                1             1
807610  114.101080         4                0             1
203865    3.940000        12                0             0
374963   27.960000        10                0             0
219961    6.880000        11                0             0
496519    2.650000         8                0             0
93985    74.200000         2                0             0
757062  330.312604         4                1             1
457034   78.710000         5                0             0


In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate model performance
accuracy = accuracy_score(real_labels, real_predictions)
precision = precision_score(real_labels, real_predictions)
recall = recall_score(real_labels, real_predictions)
f1 = f1_score(real_labels, real_predictions)

print(f"‚úÖ Model Evaluation on Real Transactions")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(real_labels, real_predictions))


‚úÖ Model Evaluation on Real Transactions
Accuracy: 0.9000
Precision: 1.0000
Recall: 0.6667
F1 Score: 0.8000

Confusion Matrix:
[[7 0]
 [1 2]]


In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ‚úÖ Step 1: Select 10 new test transactions (excluding previous ones)
new_real_transactions = X_test.sample(10, random_state=99)  # Change random_state to get different transactions
new_real_labels = y_test.loc[new_real_transactions.index]  # Get actual fraud labels

# ‚úÖ Step 2: Make predictions on new transactions
new_predictions = voting_model.predict(new_real_transactions)

# ‚úÖ Step 3: Add predictions to DataFrame
new_real_transactions['predicted_fraud'] = new_predictions
new_real_transactions['actual_fraud'] = new_real_labels

# ‚úÖ Step 4: Display results
print("üîç **Testing on New Transactions:**")
print(new_real_transactions[['amt', 'category', 'predicted_fraud', 'actual_fraud']])

# ‚úÖ Step 5: Evaluate Model Performance on New Transactions
accuracy = accuracy_score(new_real_labels, new_predictions)
precision = precision_score(new_real_labels, new_predictions)
recall = recall_score(new_real_labels, new_predictions)
f1 = f1_score(new_real_labels, new_predictions)

print("\n‚úÖ **New Model Evaluation on Real Transactions:**")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# ‚úÖ Step 6: Display Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(new_real_labels, new_predictions))


üîç **Testing on New Transactions:**
               amt  category  predicted_fraud  actual_fraud
548118   58.630000        10                0             0
673717    8.471464         2                0             1
673457  818.010824        10                1             1
515193   50.730000         6                0             0
826881  514.818899        11                1             1
373923   61.500000         2                0             0
132131   66.180000         2                0             0
428518   37.610000         6                0             0
440374    4.850000        13                0             0
474573  180.830000         4                0             0

‚úÖ **New Model Evaluation on Real Transactions:**
Accuracy: 0.9000
Precision: 1.0000
Recall: 0.6667
F1 Score: 0.8000

Confusion Matrix:
[[7 0]
 [1 2]]
