<a href="https://colab.research.google.com/github/pravallika-5/cybersecurity-threat-data-scraper/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
import os, shutil, textwrap

# Mount your Google Drive
drive.mount('/content/drive')

# Define your project directory inside Drive
PROJECT_DIR = "/content/drive/MyDrive/threat-data-scraper"
DATA_DIR = f"{PROJECT_DIR}/sample_data"
OUT_DIR = f"{PROJECT_DIR}/outputs"

# Clean slate (optional, wipes old run if exists)
if os.path.exists(PROJECT_DIR):
    shutil.rmtree(PROJECT_DIR)

# Make fresh directories
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

print("✅ Project directories created:")
print(PROJECT_DIR, DATA_DIR, OUT_DIR, sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Project directories created:
/content/drive/MyDrive/threat-data-scraper
/content/drive/MyDrive/threat-data-scraper/sample_data
/content/drive/MyDrive/threat-data-scraper/outputs


In [4]:
sample_html = """
<html>
  <body>
    <h2>Forum Post</h2>
    <p>User: alice | Password: wonderland123 | Email: alice@example.com</p>
    <p>Username: bob | Pass: qwerty!@# | Email: bob_the_builder@fake.net</p>
    <p>Contact admin: admin@darkforum.xyz</p>
  </body>
</html>
"""

sample_file_path = f"{DATA_DIR}/sample_forum.html"

with open(sample_file_path, "w", encoding="utf-8") as f:
    f.write(textwrap.dedent(sample_html))

print(f"Sample HTML file created at: {sample_file_path}")


Sample HTML file created at: /content/drive/MyDrive/threat-data-scraper/sample_data/sample_forum.html


In [5]:
!pip install beautifulsoup4 pandas




In [6]:
from bs4 import BeautifulSoup

file_path = f"{DATA_DIR}/sample_forum.html"

with open(file_path, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# Extract all <p> tags as sample posts
posts = [p.get_text() for p in soup.find_all("p")]
print("✅ Extracted posts:", posts)


✅ Extracted posts: ['User: alice | Password: wonderland123 | Email: alice@example.com', 'Username: bob | Pass: qwerty!@# | Email: bob_the_builder@fake.net', 'Contact admin: admin@darkforum.xyz']


In [7]:
import re

emails, usernames, passwords = [], [], []

for post in posts:
    # Extract emails
    emails += re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}", post)

    # Extract usernames
    usernames += re.findall(r"User(?:name)?:\s*([a-zA-Z0-9_]+)", post, re.IGNORECASE)

    # Extract passwords
    passwords += re.findall(r"Pass(?:word)?:\s*([^\s|]+)", post, re.IGNORECASE)

print("✅ Emails:", emails)
print("✅ Usernames:", usernames)
print("✅ Passwords:", passwords)


✅ Emails: ['alice@example.com', 'bob_the_builder@fake.net', 'admin@darkforum.xyz']
✅ Usernames: ['alice', 'bob']
✅ Passwords: ['wonderland123', 'qwerty!@#']


In [9]:
from itertools import zip_longest

df = pd.DataFrame(
    list(zip_longest(usernames, emails, passwords, fillvalue=None)),
    columns=["Username", "Email", "Password"]
)

print("✅ Extracted DataFrame (with None padding):")
print(df)


✅ Extracted DataFrame (with None padding):
  Username                     Email       Password
0    alice         alice@example.com  wonderland123
1      bob  bob_the_builder@fake.net      qwerty!@#
2     None       admin@darkforum.xyz           None


In [10]:
output_path = f"{OUT_DIR}/extracted_data.csv"
df.to_csv(output_path, index=False)

print(f"✅ Data saved to: {output_path}")


✅ Data saved to: /content/drive/MyDrive/threat-data-scraper/outputs/extracted_data.csv


In [12]:
import re

def password_strength(pw):
    if not pw or not isinstance(pw, str):   # handle None or non-strings
        return "Unknown"
    if len(pw) < 8:
        return "Weak"
    elif re.search(r"[A-Z]", pw) and re.search(r"[0-9]", pw) and re.search(r"[^a-zA-Z0-9]", pw):
        return "Strong"
    else:
        return "Medium"

df["Password_Strength"] = df["Password"].apply(password_strength)

print("✅ With Password Strength:")
print(df)


✅ With Password Strength:
  Username                     Email       Password Password_Strength
0    alice         alice@example.com  wonderland123            Medium
1      bob  bob_the_builder@fake.net      qwerty!@#            Medium
2     None       admin@darkforum.xyz           None           Unknown


In [13]:
summary = {
    "Total Entries": len(df),
    "Unique Emails": df["Email"].nunique(),
    "Weak Passwords": (df["Password_Strength"]=="Weak").sum(),
    "Medium Passwords": (df["Password_Strength"]=="Medium").sum(),
    "Strong Passwords": (df["Password_Strength"]=="Strong").sum()
}

print("✅ Summary Report:")
for k,v in summary.items():
    print(f"{k}: {v}")


✅ Summary Report:
Total Entries: 3
Unique Emails: 3
Weak Passwords: 0
Medium Passwords: 2
Strong Passwords: 0


In [14]:
import pandas as pd
import os

# Save summary into a DataFrame
summary_df = pd.DataFrame(list(summary.items()), columns=["Metric", "Value"])

# Path to save inside Google Drive
summary_path = f"{OUT_DIR}/summary_report.csv"

# Export to CSV
summary_df.to_csv(summary_path, index=False)

print(f"✅ Summary exported to: {summary_path}")


✅ Summary exported to: /content/drive/MyDrive/threat-data-scraper/outputs/summary_report.csv
