In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline  # Correct import here
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score #add accuracy_score to the import statement

# Read the labelled posts into a DataFrame.
# NOTE(review): absolute path, presumably the Colab working directory — confirm
# before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/content/synthetic_privacy_leak_data_adjusted.csv'
)


In [None]:

# Fetch the NLTK resources that preprocess_text relies on.
# Recent NLTK releases load the Punkt sentence model from 'punkt_tab', while
# older ones use 'punkt'. Requesting both keeps word_tokenize usable on either;
# a package id the installed NLTK does not know is reported by nltk.download
# rather than raised.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Lazily-filled cache so the stop-word corpus is read once, not once per row.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Lowercase, tokenize and filter a post into a space-joined token string.

    Args:
        text: The raw post. ``None`` and NaN (what pandas yields for a missing
            cell) are treated as an empty post; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The surviving tokens joined by single spaces. A token survives only if
        it is entirely alphanumeric and is not an English stop word.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    stop_words = _english_stop_words()
    # isalnum() rejects the whole token if it contains any non-alphanumeric
    # character, so tokens with embedded punctuation (e.g. hyphenated strings)
    # are dropped entirely rather than having the punctuation stripped.
    kept = [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(kept)

# Clean every post once and store the result next to the raw text.
df['Processed Content'] = df['Post Content'].apply(preprocess_text)

# Features are the cleaned posts; targets are the leak labels.
X, y = df['Processed Content'], df['Privacy Leak Label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# TF-IDF features feeding a logistic-regression classifier, both left at
# their default settings.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit on the training split, then label the held-out posts.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report held-out performance.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Confusion Matrix:
[[ 1451   127]
 [    0 13422]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      1578
           1       0.99      1.00      1.00     13422

    accuracy                           0.99     15000
   macro avg       1.00      0.96      0.98     15000
weighted avg       0.99      0.99      0.99     15000



In [None]:
from transformers import pipeline as hf_pipeline  # Import correctly
from sklearn.metrics import classification_report, confusion_matrix

# Build a Hugging Face text-classification pipeline on the 'bert-base-uncased'
# checkpoint, using the same identifier for the model and the tokenizer.
# NOTE(review): the recorded run warns that the classification head weights
# are newly initialized and that the model should be trained before use —
# confirm whether a fine-tuned checkpoint was intended here.
classifier = hf_pipeline(
    task='text-classification',
    model='bert-base-uncased',
    tokenizer='bert-base-uncased',
)

# Function to predict privacy leaks
def predict_leak(texts):
    """Classify each text as a privacy leak (1) or not (0).

    Args:
        texts: List of post strings to score with the module-level
            ``classifier`` pipeline.

    Returns:
        A list of ints, one per input and in input order: 1 when the
        pipeline's label for that text is ``'LABEL_1'``, otherwise 0.
        An empty list/tuple yields an empty list without calling the model.
    """
    # Nothing to score; do not hand the pipeline an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    # truncation=True clips each input to the model's maximum sequence length,
    # so a single over-long post cannot make the whole batch fail.
    predictions = classifier(texts, truncation=True)
    return [int(pred['label'] == 'LABEL_1') for pred in predictions]

# Score the held-out posts with the BERT classifier.
# NOTE(review): X_test comes from the 'Processed Content' column, so the model
# sees the lowercased, stop-word-filtered text rather than the raw posts —
# confirm this is intended.
y_pred_bert = predict_leak(list(X_test))

# Report held-out performance for the BERT predictions.
print(
    "Confusion Matrix (BERT):",
    confusion_matrix(y_test, y_pred_bert),
    sep="\n",
)
print(
    "\nClassification Report (BERT):",
    classification_report(y_test, y_pred_bert),
    sep="\n",
)

# Overall accuracy, shown to four decimal places.
accuracy = accuracy_score(y_test, y_pred_bert)
print(f"\nAccuracy (BERT): {accuracy:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Confusion Matrix (BERT):
[[    1  1577]
 [    2 13420]]

Classification Report (BERT):
              precision    recall  f1-score   support

           0       0.33      0.00      0.00      1578
           1       0.89      1.00      0.94     13422

    accuracy                           0.89     15000
   macro avg       0.61      0.50      0.47     15000
weighted avg       0.84      0.89      0.85     15000


Accuracy (BERT): 0.8947


In [None]:
# Alert mechanism based on BERT predictions
def alert_mechanism(post_content, threshold=0.5):
    """Print an alert when a post's leak probability reaches ``threshold``.

    Args:
        post_content: Text of the post to check.
        threshold: Minimum probability of the leak class (``'LABEL_1'``)
            required to raise an alert. At the default of 0.5 this matches
            "the model's predicted label is LABEL_1" (exact ties aside).

    Returns:
        None. The verdict is printed.
    """
    # Query the module-level pipeline directly: the score is needed here, not
    # just the 0/1 label. truncation=True guards against over-long posts.
    result = classifier([post_content], truncation=True)[0]

    # The pipeline reports only the winning label and its score. Treating any
    # label other than 'LABEL_1' as the non-leak class, the leak probability
    # is the complement of the score.
    # NOTE(review): assumes a two-label head — confirm if the model changes.
    if result['label'] == 'LABEL_1':
        leak_probability = result['score']
    else:
        leak_probability = 1.0 - result['score']

    if leak_probability >= threshold:
        print(f"ALERT: Privacy leak detected in the post - '{post_content}'")
    else:
        print(f"No privacy leak detected in the post - '{post_content}'")

# Simulating real-time post monitoring
def monitor_posts(posts, delay=1):
    """Run the alert check on each post in order, pausing between posts.

    Args:
        posts: Iterable of post strings; each is passed to ``alert_mechanism``.
        delay: Seconds to sleep after each post to simulate a real-time feed.
            Defaults to 1. Zero or a negative value disables the pause.

    Returns:
        None.
    """
    import time  # local import keeps this cell runnable on its own

    for post in posts:
        alert_mechanism(post)
        # Skip the call for non-positive delays; time.sleep rejects negatives.
        if delay > 0:
            time.sleep(delay)

# Demonstration input: a handful of sample social media posts.
posts_to_monitor = [
    "Just shared my bank account number on Twitter!",
    "Had a great time at the beach with friends!",
    "Here's my SSN: 123-45-6789.",
    "Happy to announce my new job at Tech Corp!",
]

# Feed the samples through the monitor; each post prints its own verdict.
# NOTE(review): the recorded run flags all four posts, including the benign
# ones — check the classifier before relying on these alerts.
monitor_posts(posts=posts_to_monitor)

ALERT: Privacy leak detected in the post - 'Just shared my bank account number on Twitter!'
ALERT: Privacy leak detected in the post - 'Had a great time at the beach with friends!'
ALERT: Privacy leak detected in the post - 'Here's my SSN: 123-45-6789.'
ALERT: Privacy leak detected in the post - 'Happy to announce my new job at Tech Corp!'
