In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import re
import string

In [30]:
# Read the raw dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='fraud.csv')

In [31]:
# Concatenate text from caption, bio, and post_content into one document per row.
# Missing values are replaced with '' first: with plain `+`, a NaN in any one
# column makes the whole combined string NaN, and TfidfVectorizer rejects NaN
# documents when the model is fitted.
df['combined_text'] = (
    df['caption'].fillna('') + ' '
    + df['bio'].fillna('') + ' '
    + df['post_content'].fillna('')
)

# Drop the original columns.
# NOTE: once they are dropped, this cell cannot be re-run without reloading df.
df = df.drop(['caption', 'bio', 'post_content'], axis=1)

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and therefore the reported accuracy) reproducible across runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    df['combined_text'], df['fraud_rating'], test_size=0.2, random_state=42
)

In [32]:
# Custom transformer to clean text
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw text documents.

    Each document is lower-cased, stripped of every character in
    ``string.punctuation``, and has runs of consecutive spaces collapsed to a
    single space. Only the space character is collapsed; tabs and newlines
    are left as they are.
    """

    # Built once at class creation instead of once per document:
    # translate() with this table deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    _SPACE_RUN = re.compile(' +')

    def fit(self, X, y=None):
        """Do nothing (there is no state to learn) and return ``self``."""
        return self

    def _clean(self, text):
        """Return one cleaned document; ``text`` must be a ``str``."""
        without_punctuation = text.lower().translate(self._PUNCTUATION_TABLE)
        return self._SPACE_RUN.sub(' ', without_punctuation)

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or any iterable of str
            The documents to clean.

        Returns
        -------
        pandas.Series or list of str
            A new Series (same index) when ``X`` is a Series, otherwise a
            list with one cleaned string per input document.

        Raises
        ------
        AttributeError
            If an element is not a string (for example NaN).
        """
        if isinstance(X, pd.Series):
            return X.apply(self._clean)
        # Lists, tuples, numpy arrays, generators: no .apply available.
        return [self._clean(document) for document in X]

In [33]:
# Build the model pipeline: TF-IDF features followed by logistic regression.
# No hyperparameter tuning is performed here; apart from the stop-word list,
# both steps use their default settings.
model = Pipeline([
    # 'english' selects scikit-learn's built-in ENGLISH_STOP_WORDS set, so the
    # frozenset does not have to be converted to a list by hand.
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('model', LogisticRegression())
])


In [34]:

# Train the pipeline on the training split
model.fit(X=train_data, y=train_labels)

# Persist the fitted pipeline to disk; joblib.dump returns the list of
# written file names, which the notebook shows as this cell's output
joblib.dump(value=model, filename='fraud_detection_model.joblib')


['fraud_detection_model.joblib']

In [35]:

# Predict labels for the held-out test documents
predictions = model.predict(X=test_data)

# Fraction of test rows whose predicted label matches the true label,
# reported as a percentage with two decimals
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 95.24%
