In [None]:
!pip install transformers



In [None]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# For text preprocessing
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

# For deep learning and BERT
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

# For model evaluation and handling overfitting
from torch.nn.utils import clip_grad_norm_
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw CSV export into the working frame
csv_path = '/content/DPS DATA UPDATED.csv'
df = pd.read_csv(csv_path)

# Column names, dtypes and non-null counts at a glance
df.info()

# Preview the leading rows (rendered as the cell's output)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Post ID             50000 non-null  int64 
 1   User ID             50000 non-null  object
 2   Timestamp           50000 non-null  object
 3   Post Content        50000 non-null  object
 4   Hashtags            50000 non-null  object
 5   Mentions            37459 non-null  object
 6   Location            50000 non-null  object
 7   Likes               50000 non-null  int64 
 8   Shares              50000 non-null  int64 
 9   Comments            50000 non-null  int64 
 10  Sentiment Label     50000 non-null  object
 11  Emotion Tags        50000 non-null  object
 12  Privacy Leak Label  50000 non-null  int64 
 13  Leak Type           37543 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.3+ MB


Unnamed: 0,Post ID,User ID,Timestamp,Post Content,Hashtags,Mentions,Location,Likes,Shares,Comments,Sentiment Label,Emotion Tags,Privacy Leak Label,Leak Type
0,0,1fc095e6-9718-4776-9781-7edde03efb4b,2024-07-17T05:58:05.061467,Job challenge guess must party. Degree materia...,#hold #tonight #force #long,@carolineharvey @owensbailey @tapiajustin,Bauerberg,472,193,25,Negative,"Happy, Sad, Angry",1,Sensitive Topic
1,1,f89ac88c-e75a-4773-b6a2-5db88afd05f9,2024-03-31T17:33:22.045793,Article shoulder huge discussion training. Maj...,#loss #down #give,,North Troy,620,203,10,Neutral,Sad,0,
2,2,07b3c363-bc54-4fce-bf28-e81289b2d714,2024-08-08T00:36:51.818492,Science event dream bad institution. Education...,#must,,Brittanytown,990,298,12,Positive,Surprised,1,Financial Info
3,3,b42928fa-6704-49a1-8fb3-971f4b8ae1aa,2024-06-11T17:48:25.572808,Build may eye boy get. Individual yet feeling ...,#through #or #reach,@andersonfrank @fortiz @jacobchavez,East Josephview,957,208,31,Neutral,Happy,1,Personal Info
4,4,cd439158-ce50-4210-9844-ca530deffbe2,2024-08-29T23:19:35.066880,Big return somebody sport building dinner. Pap...,#artist,@limichael @thomas81,East Samanthaborough,368,235,92,Positive,"Happy, Sad, Happy",0,


In [None]:
# Report how many nulls each column carries before anything is imputed
null_counts = df.isnull().sum()
print("Missing values per column:\n", null_counts)

# The two text columns get explicit placeholders first; every null that is
# still left afterwards (whatever the column's dtype) is replaced by 0.
df = df.fillna({'Post Content': '', 'Location': 'Unknown'}).fillna(0)

# Clean text data in 'Post Content'
def clean_text(text):
    """Normalise a raw post into lowercase, letters-only, single-spaced text.

    Steps, in order: strip URLs, @mentions and #hashtags; drop every character
    that is not an ASCII letter or whitespace; collapse each whitespace run
    (spaces, tabs, newlines) into one space; lowercase and trim.

    Args:
        text: The raw post. Any non-string value (for example None or NaN
            coming from an empty cell) is treated as an empty post.

    Returns:
        The cleaned string, which is empty when the post held nothing but
        noise. The result never contains a newline, so it is safe to write
        as a single line of a text file.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN; re.sub would raise on them.
        return ''
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    # The removals above leave gaps, and \s keeps newlines/tabs: squeeze them all.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()  # Convert to lowercase

# Derive the normalised text column from the raw post text
df['Cleaned_Post_Content'] = df['Post Content'].map(clean_text)

# How many posts fall into each Privacy Leak Label class
label_counts = df['Privacy Leak Label'].value_counts()
print(label_counts)

Missing values per column:
 Post ID                   0
User ID                   0
Timestamp                 0
Post Content              0
Hashtags                  0
Mentions              12541
Location                  0
Likes                     0
Shares                    0
Comments                  0
Sentiment Label           0
Emotion Tags              0
Privacy Leak Label        0
Leak Type             12457
dtype: int64
Privacy Leak Label
1    37543
0    12457
Name: count, dtype: int64


In [None]:
# Encode target labels
label_encoder = LabelEncoder()
df['Privacy Leak Label'] = label_encoder.fit_transform(df['Privacy Leak Label'])

# One-hot encode categorical columns
categorical_columns = ['Sentiment Label', 'Emotion Tags']

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for old installs.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit and transform the categorical data
encoded_features = encoder.fit_transform(df[categorical_columns])

# Create a dataframe with the encoded features. Reusing df's index keeps the
# rows aligned in the concat below even if df no longer has a 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Concatenate the one-hot encoded features back into the main dataframe and
# drop the original categorical columns they replace.
df = pd.concat([df, encoded_df], axis=1).drop(columns=categorical_columns)

In [None]:
# Hold out 20% of the posts for testing; the fixed seed makes the split repeatable
post_texts = df['Cleaned_Post_Content']
leak_labels = df['Privacy Leak Label']
X_train, X_test, y_train, y_test = train_test_split(
    post_texts,
    leak_labels,
    test_size=0.2,
    random_state=42,
)

**FastText**

In [None]:
# Install FastText library
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296186 sha256=db65045227785c807f8747b7c510471cb70e18bdd5312c14fafeb4573fe32663
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [None]:
# Clean and prepare the data
df['Post Content'] = df['Post Content'].fillna('')
df['Cleaned_Post_Content'] = df['Post Content'].apply(clean_text)

# Prepare data in the format required by FastText: "__label__<class> <text>"
df['label'] = df['Privacy Leak Label'].apply(lambda x: f'__label__{x}')

# Only the first 80% of the rows go into the training file. The evaluation
# cell scores the rows from int(len(df) * 0.8) onward, so writing every row
# here would let the model train on the very posts it is later tested on.
train_cutoff = int(len(df) * 0.8)
train_data = df[['label', 'Cleaned_Post_Content']].values[:train_cutoff]

# Save training data to a file
with open('train.txt', 'w', encoding='utf-8') as f:
    for label, content in train_data:
        # One example per line: fold any embedded newline/tab into a single
        # space so a post can never spill onto a second, label-less line.
        single_line = ' '.join(content.split())
        f.write(f"{label} {single_line}\n")

In [None]:
import fasttext

# Fit a supervised FastText classifier on the file written above.
# Hyperparameters are gathered in one place so they are easy to spot and tune.
fasttext_params = {
    'label': '__label__',  # prefix that marks the class token on each line
    'epoch': 25,
    'lr': 0.1,
    'dim': 50,
}
model = fasttext.train_supervised('train.txt', **fasttext_params)

# Evaluate the model
def evaluate_model(model, test_data):
    """Print the share of test samples whose top prediction matches the label.

    Args:
        model: Object whose ``predict(text)`` result is indexed as
            ``[0][0]`` to obtain the top predicted label string.
        test_data: Iterable of ``(label, content)`` pairs, where ``label`` is
            the expected label string and ``content`` is the text to classify.

    Returns:
        None. The accuracy is printed as a percentage with two decimals; when
        ``test_data`` is empty a notice is printed instead of dividing by zero.
    """
    correct = 0
    total = 0
    for label, content in test_data:
        # Remove newline characters from content
        content = content.replace('\n', ' ')
        predicted_label = model.predict(content)[0][0]
        if predicted_label == label:
            correct += 1
        total += 1

    if total == 0:
        # Nothing to score: report that explicitly rather than raising
        # ZeroDivisionError from the division below.
        print("Accuracy: n/a (no test samples)")
        return

    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")

# The holdout is the trailing 20% of the frame, as (label, text) pairs
holdout_start = int(len(df) * 0.8)
labelled_posts = df[['label', 'Cleaned_Post_Content']].values
test_data = labelled_posts[holdout_start:]

# Score the trained model against the holdout rows
evaluate_model(model, test_data)

Accuracy: 80.21%


In [None]:
# Function to classify new posts and trigger an alert
def check_privacy_leak_fasttext(model, new_posts):
    """Classify each post and print either a leak alert or an all-clear line.

    Args:
        model: Object whose ``predict(text)`` result is indexed as
            ``[0][0]`` to obtain the top predicted label string.
        new_posts: Iterable of post strings to check.

    Returns:
        None. One line is printed per post; the original, unmodified post
        text is echoed in the message.
    """
    for post in new_posts:
        # Fold line breaks into spaces before predicting, exactly as
        # evaluate_model does, so a multi-line post is classified instead of
        # being rejected by the model.
        # NOTE(review): the training text was additionally normalised with
        # clean_text (lowercased, punctuation stripped); consider applying the
        # same cleaning here so inference sees the same kind of input.
        single_line = post.replace('\n', ' ')
        predicted_label = model.predict(single_line)[0][0]  # Get the predicted label

        # Determine if there's a leak and its type based on the label
        if predicted_label == "__label__1":  # Assuming 1 indicates a leak
            leak_type = get_leak_type(post)
            print(f"Alert: Privacy leak detected in the post: \"{post}\". Leak type: {leak_type}")
        else:
            print(f"No privacy leak detected in the post: \"{post}\"")

# Example function to determine leak type (simplified)
def get_leak_type(post):
    """Map a post to a coarse leak category by keyword lookup.

    The post is lowercased and its whitespace collapsed before matching, so
    "My SALARY", "Credit Card" or a phrase broken across lines are all
    recognised. Keywords are tried in the order listed below and the first
    hit wins. Matching is by plain substring.

    Args:
        post: The post text to inspect.

    Returns:
        The category name for the first matching keyword, or
        "Unknown Leak Type" when no keyword occurs in the post.
    """
    # (keyword, category) pairs in priority order.
    keyword_to_leak_type = (
        ("address", "Address Leak"),
        ("credit card", "Credit Card Information Leak"),
        ("social security number", "Social Security Number Leak"),
        ("salary", "Salary Information Leak"),
        ("medical history", "Medical Information Leak"),
    )

    # Case-fold and squeeze whitespace so capitalisation or line breaks inside
    # a multi-word keyword do not hide a match.
    normalised = ' '.join(post.lower().split())

    for keyword, leak_type in keyword_to_leak_type:
        if keyword in normalised:
            return leak_type
    return "Unknown Leak Type"

# Hand-written sample posts used to exercise the alert mechanism
new_posts = [
    "I just received a call from my bank about my account balance.",
    "My social security number is 123-45-6789. Can someone help me?",
    "I told my coworker about my salary during lunch.",
    "I shared a screenshot of my online banking app on social media.",
]

# Classify every sample post and print the resulting alert / all-clear lines
check_privacy_leak_fasttext(model=model, new_posts=new_posts)

Alert: Privacy leak detected in the post: "I just received a call from my bank about my account balance.". Leak type: Unknown Leak Type
Alert: Privacy leak detected in the post: "My social security number is 123-45-6789. Can someone help me?". Leak type: Social Security Number Leak
Alert: Privacy leak detected in the post: "I told my coworker about my salary during lunch.". Leak type: Salary Information Leak
Alert: Privacy leak detected in the post: "I shared a screenshot of my online banking app on social media.". Leak type: Unknown Leak Type
