In [1]:
import pandas as pd
import numpy as np
import spacy
import re

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
import torch
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [5]:
model = DistilBertForSequenceClassification.from_pretrained('modeldoxing_distilBert/model', local_files_only=True)

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [8]:
def pre_process(tweet):
    tweet = tweet.replace('\n',' ')
    tweet = tweet.lower()
    return tweet

In [9]:
def to_check_result(test_encoding) :
    input_ids = torch.tensor(test_encoding["input_ids"]).to(device)
    attention_mask = torch.tensor(test_encoding["attention_mask"]).to(device)
    with torch.no_grad() :
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        y = np.argmax(outputs[0].to('cpu').numpy())
        
        return y

In [10]:
ip_pattern = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
url_pattern = re.compile("((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)", re.DOTALL)

In [11]:
def anon_ip(twt):
    if ip_pattern.search(twt) != None:
        ips = re.findall(ip_pattern, twt)
        for ip in ips:
            twt = twt.replace(ip, 5*'X')
        return twt
    else:
        return twt

def anon_url(twt):
    if url_pattern.search(twt) != None:
        links = re.findall(url_pt, inp)
        for lnk in links:
            twt = twt.replace(lnk[0], 5*'X')
            
        return twt
    else:
        return twt
    
def generic_ner(twt):
    doc = nlp(twt)
    for ent in doc.ents:
        twt = twt[0:ent.start_char] + 5 * 'X'+  twt[ent.end_char:]
    return twt

In [12]:
def anonymize(org_twt):
    if ip_pattern.search(org_twt) == None and ip_pattern.search(org_twt) == None:
        return org_twt
    else:
        stp1 = anon_ip(org_twt)
        stp2 = anon_url(stp1)
        sanitized_twt = generic_ner(stp2)
        
        return sanitized_twt

In [13]:
def doxing_test(twt):
    twt = pre_process(twt)
    test_encoding = tokenizer(twt, truncation=True, padding=True)
    input_ids = torch.tensor(test_encoding["input_ids"]).to(device)
    attention_mask = torch.tensor(test_encoding["attention_mask"]).to(device)
    op = to_check_result(test_encoding)        
    return op
        

In [17]:
input_ = str(input("What's happening?....... \nEveryone can reply.\nEnter your tweet:\n"))
op = doxing_test(input_)
if op == 0:
    print('\n')
    print("It's not a doxed tweet")
else:
    print('\n')
    print("It's recommended to not post this tweet. It might be against Twitter's Policy.")
    print("Twitter’s private information and media policy - “You may not publish or post other people's private information (such as home phone number and address) without their express authorization and \permission. We also prohibit threatening to expose private information or incentivizing others to do so.”")
    print("For more information, please visit https://help.twitter.com/en/rules-and-policies/personal-information")
if op == 1:
    nudge = str(input("Do you want to continue posting? Y/N"))
    if (nudge == 'N') or (nudge == 'n'):
        print("\nThe draft has been discarded.")
    elif (nudge == 'Y') or (nudge == 'y'):
        print('\n')
        print(anonymize(input_))

What's happening?....... 
Everyone can reply.
Enter your tweet:
@wife34_ Helllo. \nFull name: Martha Benjamin\nAddress: 14 Pug Street, Klon, US\nIP address: 12.18.0.16\nCordinates: 11.5033\u00b0 N, 0.1096\u00b0 W


It's recommended to not post this tweet. It might be against Twitter's Policy.
Twitter’s private information and media policy - “You may not publish or post other people's private information (such as home phone number and address) without their express authorization and \permission. We also prohibit threatening to expose private information or incentivizing others to do so.”
For more information, please visit https://help.twitter.com/en/rules-and-policies/personal-information
Do you want to continue posting? Y/Ny


@wife34_ Helllo. \nFull name: XXXXX Benjamin\nAddress: 1XXXXXXXXet, KloXXXXXSXXXXXddress: XXXXX\nCordinates: 11.XXXXX.1096\u00b0 XXXXX
