**Keyword Analysis**

In [1]:
import re
import requests

# List of scam-related keywords and phrases
scam_keywords = [
    "get rich quick",
    "guaranteed income",
    "miracle cure",
    "you have won",
    "",
    # Add more keywords/phrases as needed
]

def check_for_scam_keywords(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html_content = response.text

            # Search for scam keywords in the HTML content
            for keyword in scam_keywords:
                matches = re.findall(keyword, html_content, re.IGNORECASE)
                if matches:
                    print(f"Found '{keyword}' on the webpage. This could indicate scam content.")

        else:
            print(f"Failed to fetch content. Status code: {response.status_code}")

    except requests.RequestException as e:
        print(f"An error occurred: {e}")
    print("No suspicious found")

# Replace with the URL of the website you want to check for scam-related keywords
check_for_scam_keywords('https://www.google.com/')


Found '' on the webpage. This could indicate scam content.
No suspicious found


**Domain day time analysis**

In [2]:
import datetime
import requests

def get_domain_creation_date(domain):
  """
  Attempts to estimate the domain creation date using various methods.

  Args:
    domain: The domain name to check.

  Returns:
    A datetime object representing the estimated domain creation date, or None if unsuccessful.
  """

  # 1. Check IANA domain creation date (if available)
  try:
    url = f"https://icann.org/whois/iaa/lookup?q={domain}"
    response = requests.get(url)
    data = response.json()
    if "created" in data["domain"]:
      created_date = datetime.datetime.strptime(data["domain"]["created"][0], "%Y-%m-%dT%H:%M:%SZ")
      return created_date
  except Exception as e:
    print(f"Error fetching IANA data for {domain}: {e}")

  # 2. Check DNS record for domain age
  try:
    domain_age = None
    for record_type in ["SOA", "NS"]:
      response = requests.get(f"https://dns.google.com/resolve?name={domain}&type={record_type}")
      data = response.json()
      for record in data["Answer"]:
        if "ttl" in record:
          domain_age = max(domain_age or 0, int(record["ttl"]) / 86400)  # Convert TTL to days
    if domain_age:
      return datetime.datetime.utcnow() - datetime.timedelta(days=domain_age)
  except Exception as e:
    print(f"Error fetching DNS record for {domain}: {e}")

  # 3. Fallback to current date (indicates no reliable data found)
  return datetime.datetime.utcnow()

# Example usage
domain = "https://developers.google.com/safe-browsing/v4"
estimated_creation_date = get_domain_creation_date(domain)

if estimated_creation_date:
  print(f"Estimated creation date for {domain}: {estimated_creation_date}")
else:
  print(f"Unable to determine creation date for {domain}")

Error fetching IANA data for https://developers.google.com/safe-browsing/v4: Expecting value: line 1 column 1 (char 0)
Error fetching DNS record for https://developers.google.com/safe-browsing/v4: 'Answer'
Estimated creation date for https://developers.google.com/safe-browsing/v4: 2023-12-19 12:53:04.451813


**Code for grammatical errors scam words types**

In [3]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import nltk

# List of scam, threatening, and grammatical error words
scam_words = [
    'urgent',
    'verify',
    'limited time',
    'account suspended',
    'confirm your identity',
    'win a prize',
    'act now',
    'final notice',
    'your account has been compromised',
    'unauthorized access',
    'lottery winner',
    'phishing',
    'fraudulent activity detected',
    'click here',
    'special promotion',
    'IRS',
    'update your information',
    'virus detected',
    'suspicious activity',
    'refund',
    'exclusive offer',
    'prize claim',
    'risk-free',
    'bank account verification',
    # Add more scam and threatening words as needed

    # Grammatical error words
    'yourselfs',
    'accomodate',
    'beleive',
    'recieve',
    'unecessary',
    'thier',
    'beleive',
    'definately',
    'grammer',
    'sucessful',
    # Add more words associated with grammatical errors
]

# Download NLTK resources (run once)
nltk.download('punkt')

# Function to check for potential threatening or scam words
def check_for_threatening_words(text):
    words_list = word_tokenize(text.lower())  # Tokenize text into words
    threatening_words_found = [word for word in words_list if word in scam_words]

    return threatening_words_found

# URL of the website you want to check
url = 'https://www.udemy.com/'  # Replace with the website URL

# Fetch webpage content
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract text content from the webpage
    text = soup.get_text()

    # Check for potential threatening words
    threatening_words_found = check_for_threatening_words(text)

    if threatening_words_found:
        print("Potential threatening words found:")
        print(threatening_words_found)
    else:
        print("No potential threatening words detected in the text.")
else:
    print("Failed to retrieve the webpage.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


No potential threatening words detected in the text.


**URL length checking**

In [10]:
import re

def check_url(url):
    # Check URL length
    if len(url) > 100:  # Adjust the length threshold as needed
        print("Warning: Excessively long URL")

    # Check for unusual characters or multiple hyphens
    if re.search(r'[^a-zA-Z0-9-:/._]', url):
        print("Warning: Unusual characters found in the URL")

    if re.search(r'[-]{2,}', url):
        print("Warning: Multiple consecutive hyphens found in the URL")

# Example fake URLs (replace with actual URLs to test)
fake_urls = [
    "https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302",
    "https://www.huffingtonpost.com/entry/if-you-see-a-muslim-at-the-airport_us_588ddf13e4b0cd25e49049d8",
    "https://www.udemy.com/"
]

for url in fake_urls:
    print("Checking URL:", url)
    check_url(url)

Checking URL: https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302
Checking URL: https://www.huffingtonpost.com/entry/if-you-see-a-muslim-at-the-airport_us_588ddf13e4b0cd25e49049d8
Checking URL: https://www.udemy.com/


**Broken links and functionality: Malfunctioning features, broken links, or missing pages can point to poorly maintained websites or scams.**

In [11]:
import requests

def check_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            print(f"URL {url} is working fine!")
        else:
            print(f"Broken link detected! URL: {url}, Status code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Failed to connect to URL {url}. Error: {e}")

# Replace these URLs with the ones you want to check
urls_to_check = [
    'https://www.example.com',
    'https://www.example.com/brokenlink',  # Example broken link
    'https://www.example.com/missingpage', # Example missing page
]

for url in urls_to_check:
    check_url(url)


URL https://www.example.com is working fine!
Broken link detected! URL: https://www.example.com/brokenlink, Status code: 404
Broken link detected! URL: https://www.example.com/missingpage, Status code: 404


**Excessive pop-ups and ads: Intrusive pop-ups, aggressive advertising, or redirects to unrelated websites might indicate malicious intent.**

In [12]:
pip install selenium


Collecting selenium
  Downloading selenium-4.16.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.23.2-py3-none-any.whl (461 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m461.6/461.6 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?

In [13]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Set up the Chrome browser
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-popup-blocking")  # Disable popup blocking
driver = webdriver.Chrome(chrome_options=chrome_options)

# Replace 'https://example.com' with the website URL you want to check
url = 'https://example.com'

# Navigate to the website
driver.get(url)

# Wait for page load
wait = WebDriverWait(driver, 10)

# Check for presence of pop-ups
pop_ups = wait.until(EC.number_of_windows_to_be(2))  # Check if more than one window is open

if pop_ups:
    print("This website has excessive pop-ups or redirects.")

# Check for intrusive elements (you might need to customize this based on the site's behavior)
intrusive_elements = driver.find_elements(By.XPATH, "//*[contains(@class, 'intrusive-class-name')]")

if intrusive_elements:
    print("This website contains intrusive elements.")

# You might need additional checks based on specific indicators of aggressive advertising or redirects

# Close the browser
driver.quit()


TypeError: ignored

**Missing security certificates: Ensure the website uses HTTPS with a valid SSL certificate for secure communication.**

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

# Function to check SSL certificate presence

chrome_options = Options
driver_path = '/content/chromedriver.exe'
service = Service(driver_path)

# driver = webdriver.Chrome() #service=service, options=chrome_options
driver = webdriver.Chrome(executable_path = driver_path)
def check_ssl_certificate(url):
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run browser in headless mode

        # Set the path to your browser driver


        # Initialize the Chrome browser


        # Navigate to the URL
        driver.get(url)

        # Get the current URL after redirection (to handle possible redirects)
        final_url = driver.current_url

        # Check if the URL uses HTTPS
        if final_url.startswith('https'):
            print(f"The website {url} uses HTTPS.")
        else:
            print(f"The website {url} does not use HTTPS.")

    except WebDriverException as e:
        print(f"Error: {e}")
    finally:
        driver.quit()

# Example usage
website_url = "https://dictionary.cambridge.org/dictionary/english/book"  # Replace with the URL you want to check
check_ssl_certificate(website_url)


TypeError: ignored

**NLP model development: Train NLP models for text classification, entity recognition,
sentiment analysis, and fact-checking.**

**Text Classification**

In [None]:
# train_test_split

from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2, random_state = 24)
X_train.shape, X_test.shape


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Sample data (replace with your labeled data)
texts = ["this is cute"]  # List of texts
labels = ["this is book"]  # List of corresponding labels



# Convert texts to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Support Vector Machine (SVM) classifier
clf = SVC(kernel='linear')
clf.fit(X_train_vectorized, y_train)

# Evaluate the classifier
accuracy = clf.score(X_test_vectorized, y_test)
print(f"Accuracy: {accuracy}")


NameError: ignored

**Entity Recognition**

In [16]:
import spacy
from spacy.training import Example

# Load a blank English model
nlp = spacy.blank("en")

# Add entity recognizer to the pipeline
ner = nlp.add_pipe("ner")

# Sample training data (replace with your labeled entity data)
TRAIN_DATA = [
    ("Buy Nike shoes", {"entities": [(4, 8, "BRAND")]}),
    # Add more examples
]

# Train the NER model
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Train the NER model with training data
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        for texts, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(texts), annotations)
            nlp.update([example], sgd=optimizer)

# Test the NER model
doc = nlp("I want to buy Nike shoes")
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


Entities: [('Nike', 'BRAND')]


**Sentiment Analysis**

In [17]:
from transformers import pipeline

# Load pre-trained sentiment analysis model
sentiment_analysis = pipeline("sentiment-analysis")

# Sample text
text = "This product is great!"

# Analyze sentiment
result = sentiment_analysis(text)
print(result)


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998793601989746}]


**Feature extraction: Extract relevant features from both text and image data using the trained models.**

**Text Feature Extraction using BERT:**

In [18]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example text
text = "Your text goes here."

# Tokenize and encode the text
tokens = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')

# Get BERT embeddings
with torch.no_grad():
    outputs = model(tokens)
    embeddings = outputs.last_hidden_state  # Last layer hidden-state of the tokens

# Use the embeddings as features for downstream tasks
text_features = embeddings.mean(dim=1)  # For example, taking the mean of token embeddings


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

**Image Feature Extraction using VGG16:**

In [19]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
import numpy as np

# Load pre-trained VGG16 model without the top (fully connected) layers
model = VGG16(weights='imagenet', include_top=False)

# Example image path
img_path = 'path_to_your_image.jpg'

# Load and preprocess the image
img = image.load_img(img_path, target_size=(224, 224))
img_array = image.img_to_array(img)
img_array = np.expand_dims(img_array, axis=0)
img_array = preprocess_input(img_array)

# Get VGG16 features
img_features = model.predict(img_array)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


FileNotFoundError: ignored