**Keyword Analysis**

In [None]:
import re
import requests

# List of scam-related keywords and phrases
scam_keywords = [
    "get rich quick",
    "guaranteed income",
    "miracle cure",
    "you have won",
    "",
    # Add more keywords/phrases as needed
]

def check_for_scam_keywords(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html_content = response.text

            # Search for scam keywords in the HTML content
            for keyword in scam_keywords:
                matches = re.findall(keyword, html_content, re.IGNORECASE)
                if matches:
                    print(f"Found '{keyword}' on the webpage. This could indicate scam content.")

        else:
            print(f"Failed to fetch content. Status code: {response.status_code}")

    except requests.RequestException as e:
        print(f"An error occurred: {e}")
    print("No suspicious found")

# Replace with the URL of the website you want to check for scam-related keywords
check_for_scam_keywords('https://www.google.com/')


Found '' on the webpage. This could indicate scam content.
No suspicious found


**Domain age and history
 analysis**

In [None]:
import datetime
import requests

def get_domain_creation_date(domain):
  """
  Attempts to estimate the domain creation date using various methods.

  Args:
    domain: The domain name to check.

  Returns:
    A datetime object representing the estimated domain creation date, or None if unsuccessful.
  """

  # 1. Check IANA domain creation date (if available)
  try:
    url = f"https://icann.org/whois/iaa/lookup?q={domain}"
    response = requests.get(url)
    data = response.json()
    if "created" in data["domain"]:
      created_date = datetime.datetime.strptime(data["domain"]["created"][0], "%Y-%m-%dT%H:%M:%SZ")
      return created_date
  except Exception as e:
    print(f"Error fetching IANA data for {domain}: {e}")

  # 2. Check DNS record for domain age
  try:
    domain_age = None
    for record_type in ["SOA", "NS"]:
      response = requests.get(f"https://dns.google.com/resolve?name={domain}&type={record_type}")
      data = response.json()
      for record in data["Answer"]:
        if "ttl" in record:
          domain_age = max(domain_age or 0, int(record["ttl"]) / 86400)  # Convert TTL to days
    if domain_age:
      return datetime.datetime.utcnow() - datetime.timedelta(days=domain_age)
  except Exception as e:
    print(f"Error fetching DNS record for {domain}: {e}")

  # 3. Fallback to current date (indicates no reliable data found)
  return datetime.datetime.utcnow()

# Example usage
domain = "https://developers.google.com/safe-browsing/v4"
estimated_creation_date = get_domain_creation_date(domain)

if estimated_creation_date:
  print(f"Estimated creation date for {domain}: {estimated_creation_date}")
else:
  print(f"Unable to determine creation date for {domain}")

Error fetching IANA data for https://developers.google.com/safe-browsing/v4: Expecting value: line 1 column 1 (char 0)
Error fetching DNS record for https://developers.google.com/safe-browsing/v4: 'Answer'
Estimated creation date for https://developers.google.com/safe-browsing/v4: 2023-12-19 12:53:04.451813


**Code for grammatical errors scam words types**

In [None]:
pip install language_tool_python

Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.7.1


In [None]:
import requests
import language_tool_python
from bs4 import BeautifulSoup

# Load a language model for grammar checking
tool = language_tool_python.LanguageTool('en-US')

# Function to fetch and parse a website's text content
def get_website_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    return text

# Function to check for grammatical errors
def check_grammar(text):
    matches = tool.check(text)
    errors = [match.ruleId for match in matches]
    return errors

# Function to check for potential scam words
def check_scam_words(text):
    scam_words = ["urgent", "guarantee", "risk-free", "secret", "limited offer","al qaeda", "old glory","one of several","act now"]  # Customize this list
    found_words = [word for word in scam_words if word in text.lower()]
    return found_words

# Example usage
url = "https://www.politifact.com/factchecks/2020/jun/17/facebook-posts/cnn-did-not-lighten-photo-man-accused-driving-seat/"  # Replace with the website you want to analyze
text = get_website_text(url)

grammar_errors = check_grammar(text)
scam_words = check_scam_words(text)

print("Grammar Errors:", grammar_errors)
print("Potential Scam Words:", scam_words)

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:05<00:00, 40.3MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp8ein51_2.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


Grammar Errors: ['MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MISSING_COMMA_AFTER_YEAR', 'MORFOLOGIK_RULE_EN_US', 'DOUBLE_PUNCTUATION', 'MISSING_COMMA_AFTER_YEAR', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'DOUBLE_HYPHEN', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'EN_SPECIFIC_CASE', 'MISSING_COMMA_AFTER_YEAR', 'MORFOLOGIK_RULE_EN_US', 'MISSING_COMMA_AFTER_YEAR', 'MISSING_COMMA_AFTER_YEAR', 'MISSING_COMMA_AFTER_YEAR', 'MORFOLOGIK_RULE_EN_US', 'MISSING_COMMA_AFTER_YEAR', 'MISSING_COMMA_AFTER_YEAR', 'MISSING_COMMA_AFTER_YEAR', 'MISSING_COMMA_AFTER_YEAR', 'MISSING_COMMA_AFTER_YEAR', 'MORFOLOGIK_RULE_EN_US', 'MISSING_COMMA_AFTER_YEAR', 'MORFOLOGIK_RULE_EN_US', 'MISSING_COMMA_AFTER_YEAR', 'EN_UNPAIRED_BRACKETS', 'EN_UNPAIRED_BRACKETS', 'MORFOLOGIK_RULE_EN_US', 'MORFOLOGIK_RULE_EN_US', 'MISSING_COM

**URL length checking**

In [None]:
import re

def check_url_complexity(url):
   """Checks URL length and complexity, flagging potential issues."""

   max_length = 100  # Adjust as needed
   unusual_chars = r"[^a-zA-Z0-9.\-_~:/?#[\]@!$&'()*+,;=%]"
   max_hyphens = 3  # Adjust as needed

   if len(url) > max_length:
       print("Warning: URL length exceeds recommended limit.")

   if re.search(unusual_chars, url):
       print("Warning: URL contains unusual characters.")

   if url.count("-") > max_hyphens:
       print("Warning: URL contains excessive hyphens.")

   if not re.match(r"https?://", url):
       print("Error: URL does not start with http:// or https://")

   # Additional checks for complexity (consider refining):
   if len(url.split("/")) > 5:  # Excessive path segments
       print("Warning: URL has a complex path structure.")
   if len(url.split("?")[-1].split("&")) > 5:  # Excessive query parameters
       print("Warning: URL has a large number of query parameters.")


# Example usage:
url = "https://www.vadapaav.com/input/if-you?see-a-muslim-at-the-airport?kill-him-at-the?first-sight/us_134i?568790kl5?lhdni?nk2l4h/i-love-guvi@!$&/'()*+,;=%"
check_url_complexity(url)




In [None]:
import re

def check_url(url):
    # Check URL length
    if len(url) > 100:  # Adjust the length threshold as needed
        print("Warning: Excessively long URL")

    # Check for unusual characters or multiple hyphens
    if re.search(r'[^a-zA-Z0-9-:/._]', url):
        print("Warning: Unusual characters found in the URL")

    if re.search(r'[-]{2,}', url):
        print("Warning: Multiple consecutive hyphens found in the URL")

# Example fake URLs (replace with actual URLs to test)
fake_urls = [
    "https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302",
    "https://www.huffingtonpost.com/entry/if-you-see-a-muslim-at-the-airport_us_588ddf13e4b0cd25e49049d8",
    "https://www.udemy.com/"
]

for url in fake_urls:
    print("Checking URL:", url)
    check_url(url)

Checking URL: https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302
Checking URL: https://www.huffingtonpost.com/entry/if-you-see-a-muslim-at-the-airport_us_588ddf13e4b0cd25e49049d8
Checking URL: https://www.udemy.com/


**Broken links and functionality: Malfunctioning features, broken links, or missing pages can point to poorly maintained websites or scams.**

In [None]:
import urllib.parse

def check_suspicious_url(url):
    parsed_url = urllib.parse.urlparse(url)

    # Check for suspicious patterns in domain name
    if "-bank" in parsed_url.netloc or ".bank." in parsed_url.netloc:
        return True
    if parsed_url.netloc.endswith("-login") or parsed_url.netloc.endswith("-account"):
        return True
    if parsed_url.netloc.count("-") > 2:
        return True

    # Check for suspicious subdomains
    if parsed_url.netloc.startswith("www."):
        subdomain = parsed_url.netloc.split("www.")[1]
        if subdomain.startswith("secure-") or subdomain.startswith("login-"):
            return True

    # Check for suspicious paths
    if "-secure" in parsed_url.path or "-login" in parsed_url.path:
        return True

    # Check for suspicious query parameters
    if parsed_url.query and ("account" in parsed_url.query or "password" in parsed_url.query):
        return True

    # Check for top-level domains (TLDs) commonly used in scams
    if parsed_url.netloc.endswith(".xyz") or parsed_url.netloc.endswith(".club"):
        return True

    return False

# Example usage
url =  "https://www.vadapaav.com/input/if-you-see-a-muslim-at-the-airport-kill-him-at-the-first-sight_us_134ib568790kl56lhdni5nk2l4h_i-love-guvi"
if check_suspicious_url(url):
    print("The URL is potentially fake or a scam.")
else:
    print("The URL doesn't appear to be suspicious based on its structure.")


The URL doesn't appear to be suspicious based on its structure.


In [None]:
import requests

def check_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            print(f"URL {url} is working fine!")
        else:
            print(f"Broken link detected! URL: {url}, Status code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Failed to connect to URL {url}. Error: {e}")

# Replace these URLs with the ones you want to check
urls_to_check = [
    'https://www.example.com',
    'https://www.example.com/brokenlink',  # Example broken link
    'https://www.example.com/missingpage', # Example missing page
]

for url in urls_to_check:
    check_url(url)


URL https://www.example.com is working fine!
Broken link detected! URL: https://www.example.com/brokenlink, Status code: 404
Broken link detected! URL: https://www.example.com/missingpage, Status code: 404


**Excessive pop-ups and ads: Intrusive pop-ups, aggressive advertising, or redirects to unrelated websites might indicate malicious intent.**

In [None]:
pip install selenium


Collecting selenium
  Downloading selenium-4.16.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.23.2-py3-none-any.whl (461 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m461.6/461.6 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Set up the Chrome browser
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-popup-blocking")  # Disable popup blocking
driver = webdriver.Chrome(chrome_options=chrome_options)

# Replace 'https://example.com' with the website URL you want to check
url = 'https://example.com'

# Navigate to the website
driver.get(url)

# Wait for page load
wait = WebDriverWait(driver, 10)

# Check for presence of pop-ups
pop_ups = wait.until(EC.number_of_windows_to_be(2))  # Check if more than one window is open

if pop_ups:
    print("This website has excessive pop-ups or redirects.")

# Check for intrusive elements (you might need to customize this based on the site's behavior)
intrusive_elements = driver.find_elements(By.XPATH, "//*[contains(@class, 'intrusive-class-name')]")

if intrusive_elements:
    print("This website contains intrusive elements.")

# You might need additional checks based on specific indicators of aggressive advertising or redirects

# Close the browser
driver.quit()


TypeError: ignored

**Missing security certificates: Ensure the website uses HTTPS with a valid SSL certificate for secure communication.**

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

# Function to check SSL certificate presence

chrome_options = Options
driver_path = '/content/chromedriver.exe'
service = Service(driver_path)

# driver = webdriver.Chrome() #service=service, options=chrome_options
driver = webdriver.Chrome(executable_path = driver_path)
def check_ssl_certificate(url):
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run browser in headless mode

        # Set the path to your browser driver


        # Initialize the Chrome browser


        # Navigate to the URL
        driver.get(url)

        # Get the current URL after redirection (to handle possible redirects)
        final_url = driver.current_url

        # Check if the URL uses HTTPS
        if final_url.startswith('https'):
            print(f"The website {url} uses HTTPS.")
        else:
            print(f"The website {url} does not use HTTPS.")

    except WebDriverException as e:
        print(f"Error: {e}")
    finally:
        driver.quit()

# Example usage
website_url = "https://dictionary.cambridge.org/dictionary/english/book"  # Replace with the URL you want to check
check_ssl_certificate(website_url)


TypeError: ignored

**NLP model development: Train NLP models for text classification, entity recognition,
sentiment analysis, and fact-checking.**

**Text Classification**

In [None]:
# train_test_split

from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2, random_state = 24)
X_train.shape, X_test.shape


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Sample data (replace with your labeled data)
texts = ["this is cute"]  # List of texts
labels = ["this is book"]  # List of corresponding labels



# Convert texts to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Support Vector Machine (SVM) classifier
clf = SVC(kernel='linear')
clf.fit(X_train_vectorized, y_train)

# Evaluate the classifier
accuracy = clf.score(X_test_vectorized, y_test)
print(f"Accuracy: {accuracy}")


NameError: ignored

**Entity Recognition**

In [None]:
import spacy
from spacy.training import Example

# Load a blank English model
nlp = spacy.blank("en")

# Add entity recognizer to the pipeline
ner = nlp.add_pipe("ner")

# Sample training data (replace with your labeled entity data)
TRAIN_DATA = [
    ("Buy Nike shoes", {"entities": [(4, 8, "BRAND")]}),
    # Add more examples
]

# Train the NER model
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Train the NER model with training data
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        for texts, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(texts), annotations)
            nlp.update([example], sgd=optimizer)

# Test the NER model
doc = nlp("I want to buy Nike shoes")
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


Entities: [('Nike', 'BRAND')]


**Sentiment Analysis**

In [None]:
from transformers import pipeline

# Load pre-trained sentiment analysis model
sentiment_analysis = pipeline("sentiment-analysis")

# Sample text
text = "This product is great!"

# Analyze sentiment
result = sentiment_analysis(text)
print(result)


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998793601989746}]


**Feature extraction: Extract relevant features from both text and image data using the trained models.**

**Text Feature Extraction using BERT:**

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example text
text = "Your text goes here."

# Tokenize and encode the text
tokens = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')

# Get BERT embeddings
with torch.no_grad():
    outputs = model(tokens)
    embeddings = outputs.last_hidden_state  # Last layer hidden-state of the tokens

# Use the embeddings as features for downstream tasks
text_features = embeddings.mean(dim=1)  # For example, taking the mean of token embeddings


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

**Image Feature Extraction using VGG16:**

In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
import numpy as np

# Load pre-trained VGG16 model without the top (fully connected) layers
model = VGG16(weights='imagenet', include_top=False)

# Example image path
img_path = 'path_to_your_image.jpg'

# Load and preprocess the image
img = image.load_img(img_path, target_size=(224, 224))
img_array = image.img_to_array(img)
img_array = np.expand_dims(img_array, axis=0)
img_array = preprocess_input(img_array)

# Get VGG16 features
img_features = model.predict(img_array)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


FileNotFoundError: ignored

In [None]:
pip install boto3

Collecting boto3
  Downloading boto3-1.34.6-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.6 (from boto3)
  Downloading botocore-1.34.6-py3-none-any.whl (11.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.0-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.6 botocore-1.34.6 jmespath-1.0.1 s3transfer-0.10.0


In [None]:
import boto3
import random  # To generate random passwords

sns_client = boto3.client('sns', region_name='ap-south-1')

to_phone_number = '+917010054285'  # Replace with recipient's phone number
message = f"Test message sent from python code"

try:
    response = sns_client.publish(
        PhoneNumber=to_phone_number,
        Message=message
    )
    print("SMS message sent successfully:", response['MessageId'])
except Exception as e:
    print("Error sending SMS:", e)


Error sending SMS: Unable to locate credentials


**Textual Authenticity: - Plagiarism detection: Identify potentially plagiarized content from existing ads or other sources using NLP techniques like similarity matching, semantic analysis, and topic modeling.**

In [None]:
pip uninstall nltk


Found existing installation: nltk 3.8.1
Uninstalling nltk-3.8.1:
  Would remove:
    /usr/local/bin/nltk
    /usr/local/lib/python3.10/dist-packages/nltk-3.8.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/nltk/*
Proceed (Y/n)? y
  Successfully uninstalled nltk-3.8.1


In [None]:
pip install -U nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

NameError: ignored

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation

# Sample texts to compare (replace these with your data)
original_text = "Your original ad text here."
suspicious_text = "Potentially plagiarized content goes here."

# Tokenization and preprocessing
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenization and lowercasing
    # Additional steps like removing stopwords, stemming, lemmatization can be added here
    processed_text = ' '.join(tokens)
    return processed_text

original_text_processed = preprocess_text(original_text)
suspicious_text_processed = preprocess_text(suspicious_text)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform([original_text_processed, suspicious_text_processed])

# Cosine similarity calculation
cosine_sim = cosine_similarity(vectorized_text)[0][1]
print(f"Cosine Similarity: {cosine_sim}")

# Topic modeling using Latent Dirichlet Allocation (LDA)
def perform_lda(texts):
    vectorizer_lda = TfidfVectorizer()
    vectors = vectorizer_lda.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=2, random_state=42)  # Change number of topics as needed
    lda.fit(vectors)
    return lda

texts_for_lda = [original_text_processed, suspicious_text_processed]
lda_model = perform_lda(texts_for_lda)

# Extracting topics from the LDA model
def get_topics(lda, vectorizer, n_words=10):
    feature_names = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        top_feature_idxs = topic.argsort()[:-(n_words+1):-1]
        top_words = [feature_names[i] for i in top_feature_idxs]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics

topics = get_topics(lda_model, vectorizer)
print("Topics:")
for topic, words in topics.items():
    print(f"{topic}: {', '.join(words)}")


LookupError: ignored

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.metrics import distance
# Function to extract linguistic features from text
def extract_features(text):
    # Tokenize text into words
    words = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Calculate frequency distribution of words
    freq_dist = FreqDist(filtered_words)

    # Part-of-speech tagging
    pos_tags = pos_tag(words)

    return freq_dist, pos_tags

# Function to compare linguistic features
def compare_features(text1, text2):
    # Extract linguistic features for both texts
    features_text1 = extract_features(text1)
    features_text2 = extract_features(text2)

    # Compare frequency distributions
    freq_dist_similarity = nltk.JaccardDistance(features_text1[0], features_text2[0])

    # Compare part-of-speech tagging
    pos_tags_similarity = nltk.jaccard_distance(set(features_text1[1]), set(features_text2[1]))

    return freq_dist_similarity, pos_tags_similarity

# Example texts (one human-written, one AI-generated)
human_text = "The sun was setting behind the mountains, painting the sky in beautiful hues of orange and pink."
ai_text = "The data indicates that solar bodies were descending beyond the geographical elevations, coloring the atmosphere in vivid tones of red and yellow."

# Compare linguistic features of the example texts
similarity_freq_dist, similarity_pos_tags = compare_features(human_text, ai_text)

# Set a threshold for similarity
threshold = 0.2  # Adjust as needed based on testing and accuracy

# Check if the texts are similar or dissimilar based on the features
if similarity_freq_dist > threshold or similarity_pos_tags > threshold:
    print("The texts are similar. Possibly both are human-written or AI-generated.")
else:
    print("The texts are dissimilar. Likely one is human-written and the other is AI-generated.")


AttributeError: ignored

**Brand consistency: Analyze adherence to brand voice and messaging style for brand impersonation detection.**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Sample brand messages or phrases that represent the brand voice
brand_messages = [
    "Our products prioritize sustainability and innovation.",
    "We strive to provide top-quality service to our customers.",
    "Innovation is at the heart of everything we do."
]

# Tokenization, stop words removal, and lemmatization function
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]  # Remove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatization
    return ' '.join(tokens)

# Preprocess brand messages
processed_brand_messages = [preprocess_text(msg) for msg in brand_messages]

# Vectorize brand messages using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_brand_messages)

def check_similarity(input_text):
    # Preprocess the input text
    processed_input = preprocess_text(input_text)

    # Vectorize the input text
    input_tfidf = vectorizer.transform([processed_input])

    # Calculate cosine similarity between input text and brand messages
    similarity_scores = cosine_similarity(input_tfidf, tfidf_matrix)

    # Calculate average similarity score
    avg_similarity = similarity_scores.mean()

    return avg_similarity

# Example usage: Check similarity of an incoming message with brand messages
incoming_message = "Discover our innovative and sustainable products!"
similarity_score = check_similarity(incoming_message)

# Define a threshold for similarity score to detect brand consistency
threshold = 0.5  # Adjust according to your requirements

if similarity_score >= threshold:
    print("The incoming message adheres to the brand voice.")
else:
    print("The incoming message might not align with the brand voice.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


The incoming message might not align with the brand voice.


**Image manipulation detection: Identify signs of image tampering, editing, or deepfakes using image forensics analysis.**

In [None]:
import cv2
import numpy as np

def error_level_analysis(image_path, save_path):
    # Load the image
    img = cv2.imread(image_path)

    # Save the image with JPEG compression to compute error level
    cv2.imwrite("temp.jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 90])

    # Read the saved image
    temp_img = cv2.imread("temp.jpg")

    # Compute the absolute difference between the original and re-saved image
    ela = cv2.absdiff(img, temp_img)

    # Convert the error level analysis image to grayscale
    ela_gray = cv2.cvtColor(ela, cv2.COLOR_BGR2GRAY)

    # Save the ELA image
    cv2.imwrite(save_path, ela_gray)

    # Display the ELA image
    cv2.imshow("Error Level Analysis", ela_gray)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# Replace 'input_image.jpg' with the path to your image
input_image_path = 'input_image.jpg'

# Replace 'output_ela.jpg' with the desired path to save the ELA image
output_ela_path = 'output_ela.jpg'

error_level_analysis(input_image_path, output_ela_path)


error: ignored

**Reverse image search: Check for potential copyright infringement or unauthorized use of existing images.**

In [None]:
import requests

def reverse_image_search(api_key, image_url):
    endpoint = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": api_key,
        "cx": "YOUR_CUSTOM_SEARCH_ENGINE_ID",
        "searchType": "image",
        "imgUrl": image_url
    }

    try:
        response = requests.get(endpoint, params=params)
        if response.status_code == 200:
            search_results = response.json()
            # Process search results here
            return search_results
        else:
            print("Request failed with status code:", response.status_code)
    except requests.RequestException as e:
        print("Request Exception:", e)

# Usage
api_key = "YOUR_GOOGLE_API_KEY"
image_url = "URL_OF_YOUR_IMAGE"
search_results = reverse_image_search(api_key, image_url)
print(search_results)


Request failed with status code: 400
None


**Logo and brand element verification: Ensure consistent and correct usage of official logos and brand assets.**

In [None]:
from PIL import Image
import imagehash

def image_similarity(image_path1, image_path2):
    # Open and convert images to grayscale
    img1 = Image.open(image_path1).convert('L')
    img2 = Image.open(image_path2).convert('L')

    # Calculate the hash for images
    hash1 = imagehash.average_hash(img1)
    hash2 = imagehash.average_hash(img2)

    # Compute the similarity between the images
    return hash1 - hash2

def verify_logo(official_logo_path, image_to_verify_path, threshold=5):
    similarity = image_similarity(official_logo_path, image_to_verify_path)

    if similarity <= threshold:
        print("The image matches the official logo!")
    else:
        print("The image doesn't match the official logo.")

# Paths to the official logo and image to verify
official_logo_path = 'path_to_official_logo.png'
image_to_verify_path = 'path_to_image_to_verify.png'

# Verify the image against the official logo
verify_logo(official_logo_path, image_to_verify_path)


ModuleNotFoundError: ignored

**Claims Validation: Product/service claims: Analyze specific claims and promises made in the ad against publicly available information and data to verify their accuracy.**

In [None]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Function to scrape a webpage and extract text content
def scrape_webpage(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = ' '.join([p.get_text() for p in soup.find_all('p')])  # Extract text from <p> tags
        return text
    except Exception as e:
        print(f"Error: {e}")
        return None

# Function to analyze claims against public data
def validate_claims(claims, public_data):
    # Tokenize and clean claims text
    claims_tokens = word_tokenize(claims.lower())
    stop_words = set(stopwords.words('english'))
    claims_tokens = [word for word in claims_tokens if word.isalnum() and word not in stop_words]

    # Tokenize and clean public data text
    public_tokens = word_tokenize(public_data.lower())
    public_tokens = [word for word in public_tokens if word.isalnum() and word not in stop_words]

    # Count word frequencies
    claims_word_freq = Counter(claims_tokens)
    public_word_freq = Counter(public_tokens)

    # Compare claim words against public data words
    matching_words = set(claims_word_freq.keys()) & set(public_word_freq.keys())

    # Calculate similarity score based on common words
    similarity_score = sum(claims_word_freq[word] for word in matching_words) / len(claims_tokens)

    return similarity_score

# Example claim and public data (you can replace these with real data)
claim = "Our product reduces energy consumption by 50%."
public_data_url = "https://example.com/public-data-page"  # URL with relevant data

# Scrape public data webpage
public_text = scrape_webpage(public_data_url)

if public_text:
    # Validate the claim against public data
    similarity_score = validate_claims(claim, public_text)
    print(f"Claim: {claim}")
    print(f"Public data URL: {public_data_url}")
    print(f"Similarity score: {similarity_score}")
else:
    print("Failed to retrieve public data.")


Claim: Our product reduces energy consumption by 50%.
Public data URL: https://example.com/public-data-page
Similarity score: 0.0


**Performance and results: Verify claims about product performance, results, and statistics using trusted sources and independent reviews.**

In [None]:
import requests

def verify_product_claim(product_name):
    # Define the API endpoint or URL to retrieve data
    api_url = f"https://example_api.com/products/{product_name}"

    try:
        # Make a GET request to the API
        response = requests.get(api_url)

        if response.status_code == 200:
            # Assuming the API returns JSON data
            product_data = response.json()

            # Extract relevant information for comparison
            claimed_performance = product_data['claimed_performance']
            actual_performance = product_data['actual_performance']  # Actual data from trusted sources

            # Compare claimed performance with actual data
            if claimed_performance == actual_performance:
                print(f"The claim about {product_name} performance is accurate.")
            else:
                print(f"The claim about {product_name} performance is not accurate.")
                print(f"Claimed Performance: {claimed_performance}")
                print(f"Actual Performance: {actual_performance}")
        else:
            print(f"Failed to fetch data for {product_name}. Status Code: {response.status_code}")

    except requests.RequestException as e:
        print(f"An error occurred: {e}")

# Example usage:
product_to_verify = "example_product"
verify_product_claim(product_to_verify)


An error occurred: HTTPSConnectionPool(host='example_api.com', port=443): Max retries exceeded with url: /products/example_product (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f62cf00f310>: Failed to resolve 'example_api.com' ([Errno -2] Name or service not known)"))


**Comparative claims: Assess the validity of comparisons made with competitor products or services through data analysis and research.**

In [None]:
import pandas as pd

# Load data for your product and competitor
your_product_data = pd.read_csv('')
competitor_data = pd.read_csv('competitor_data.csv')

# Define comparison metrics (e.g., price, ratings, features)
comparison_metrics = ['price', 'ratings', 'features']

# Calculate mean/median/other statistics for comparison
for metric in comparison_metrics:
    your_metric_avg = your_product_data[metric].mean()
    competitor_metric_avg = competitor_data[metric].mean()

    print(f"Your product's average {metric}: {your_metric_avg}")
    print(f"Competitor's average {metric}: {competitor_metric_avg}")
    print("------------")


FileNotFoundError: ignored

**Audience analysis: Determine if the ad content is truly relevant and appropriate for the targeted audience based on demographics, interests, and online behavior**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Assume 'X' contains features and 'y' contains labels indicating ad relevance

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Decision Tree Classifier (you can try other classifiers as well)
clf = DecisionTreeClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


NameError: ignored

**Contextual relevance: Analyze the fit of the ad with the context in which it appears, such as the specific website, publication, or social media platform.**

In [None]:
# Sample list of keywords related to context
context_keywords = ['technology', 'innovation', 'digital', 'startup', 'entrepreneurship']

# Sample ad content
ad_content = "Introducing the latest tech innovation for startups. Join us now!"

# Function to analyze ad fit with context based on keywords
def analyze_ad_fit(ad_content, context_keywords):
    ad_keywords = ad_content.lower().split()  # Extracting keywords from ad content
    matching_keywords = [keyword for keyword in ad_keywords if keyword in context_keywords]

    # Calculating relevance score based on keyword matches
    relevance_score = len(matching_keywords) / len(ad_keywords)

    return relevance_score

# Analyzing the fit of the ad with the context
fit_score = analyze_ad_fit(ad_content, context_keywords)
print(f"The ad's fit with the context: {fit_score:.2f}")


The ad's fit with the context: 0.10


In [None]:
!apt-get install -y xvfb
import os
os.system('Xvfb :1 -screen 0 1600x1200x16  &')
os.environ['DISPLAY']=':1.0'

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common
The following NEW packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common xvfb
0 upgraded, 9 newly installed, 0 to remove and 24 not upgraded.
Need to get 7,813 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libfontenc1 amd64 1:1.1.4-1build3 [14.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxfont2 amd64 1:2.0.5-1build1 [94.5 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxkbfile1 amd64 1:1.1.0-1build3 [71.8 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 x11-xkb-utils amd64 7.7+5build4 [172 kB]
Get:5 http://archiv

In [None]:
import tkinter as tk
from tkinter import messagebox


# Function to perform the verification process
def verify():
    website_url = website_entry.get()
    ad_text = ad_entry.get()
    phone_number = phone_entry.get()

    # Perform verification (dummy verification for demonstration)
    if website_url:
        result_label.config(text="Website Verified: Legitimate")
    elif ad_text:
        result_label.config(text="Ad Verified: Not Fraudulent")
    elif phone_number:
        result_label.config(text="Phone Number Verified: Authentic")
    else:
        messagebox.showwarning("Warning", "Please enter valid information.")

# Create the main window
print(tk.Tk())
root = tk.Tk()
root.title("Scam Verification Tool")

# Create and place input fields
website_label = tk.Label(root, text="Enter Website URL:")
website_label.pack()
website_entry = tk.Entry(root)
website_entry.pack()

ad_label = tk.Label(root, text="Enter Ad Text/Link:")
ad_label.pack()
ad_entry = tk.Entry(root)
ad_entry.pack()

phone_label = tk.Label(root, text="Enter Phone Number:")
phone_label.pack()
phone_entry = tk.Entry(root)
phone_entry.pack()

# Button to initiate verification
verify_button = tk.Button(root, text="Verify", command=verify)
verify_button.pack()

# Section to display results
result_label = tk.Label(root, text="", fg="green", font=("Arial", 12, "bold"))
result_label.pack()

# Run the main loop
root.mainloop()

.


In [None]:
import tkinter as tk

# Create the main window
root = tk.Tk()
root.title("Scam/Fraud Checker")

# Input fields
website_label = tk.Label(root, text="Website URL:")
website_entry = tk.Entry(root)
ad_label = tk.Label(root, text="Ad text/link:")
ad_entry = tk.Entry(root)
phone_label = tk.Label(root, text="Phone number:")
phone_entry = tk.Entry(root)

# Verification button
verify_button = tk.Button(root, text="Verify", command=verify_data)  # Function to call for verification

# Results section
results_label = tk.Label(root, text="Results:")
results_text = tk.Text(root, height=5, width=40)  # Allow for multi-line results

# Layout the elements
website_label.grid(row=0, column=0)
website_entry.grid(row=0, column=1)
ad_label.grid(row=1, column=0)
ad_entry.grid(row=1, column=1)
phone_label.grid(row=2, column=0)
phone_entry.grid(row=2, column=1)
verify_button.grid(row=3, column=0, columnspan=2)
results_label.grid(row=4, column=0)
results_text.grid(row=5, column=0, columnspan=2)

# Function to handle verification (replace with your actual verification logic)
def verify_data():
    website_url = website_entry.get()
    ad_text = ad_entry.get()
    phone_number = phone_entry.get()

    # Perform verification checks using backend logic or external services
    # ...

    # Display results in the results_text widget, using clear language and visual cues
    results_text.delete("1.0", tk.END)  # Clear previous results
    results_text.insert(tk.END, "Verification results:\n")
    # ... (append detailed results for each item)

root.mainloop()


TclError: ignored