In [1]:
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup

# URL of the page containing phishing URLs
url = "https://openphish.com/"

# Send a GET request to the URL
response = requests.get(url)

# Create a BeautifulSoup object from the response content
soup = BeautifulSoup(response.content, "html.parser")

print(soup.prettify)

<bound method Tag.prettify of <html lang="en">
<head>
<meta content="OpenPhish provides actionable intelligence data on active phishing threats." name="description"/>
<meta content="300" http-equiv="refresh"/>
<title>OpenPhish - Phishing Intelligence</title>
<link href="https://fonts.googleapis.com/css?family=Open+Sans:400,600&amp;subset=latin" rel="stylesheet" type="text/css"/>
<link href="static/table-min.css" rel="stylesheet" type="text/css"/>
<link href="static/bootstrap.css" rel="stylesheet" type="text/css"/>
<link href="static/site.css" rel="stylesheet" type="text/css"/>
<style>
.table td, .table th {
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
    font-size: 14px;
}
.url_entry {
	color: #337ab7;
}
</style>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
<script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bo

In [2]:
# Find all URL entries with the class 'url_entry'
url_entries = soup.find_all(class_='url_entry')

# Extract the URLs
phishing_urls = [entry.text for entry in url_entries]

# Specify the CSV file path
csv_file = 'phishing_urls.csv'

# Save the URLs to the CSV file
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Phishing URL'])
    writer.writerows([[url] for url in phishing_urls])

print(f"Phishing URLs saved to {csv_file}.")

Phishing URLs saved to phishing_urls.csv.


In [3]:
df = pd.read_csv('phishing_urls.csv')

In [4]:
df.head

<bound method NDFrame.head of                                          Phishing URL
0   https://www.animagricola.farm/ssl/animagricola...
1                        https://fcv2.xyz/?s=RDETMGxC
2   https://www.securembly.org/nam/1ab41d5a-8833-4...
3                           https://elleganto.com.br/
4   http://pub-955b4ab970884f6ebc951948e5966969.r2...
5                       http://24.75.29.68/enrollment
6   https://ipfs.io/ipfs/QmQqgUohatR5fDg25dYJc2eaX...
7   https://jade-hulking-trumpet.glitch.me/public/...
8   https://ipfs.io/ipfs/QmcQch1P6Po7peexKx7UDp7Ta...
9   https://metamaskwalletrestore.office-on-the.ne...
10  https://bugoutdepot.us/kinhjgu/NewATT/New%20AT...
11  http://biswadip200.github.io/Netflix-Clone.git...
12            https://sociallycollect.com/odocapenter
13  https://ipfs.eth.aragon.network/ipfs/bafkreihc...
14       http://www.total-escort.com/exc/new%20excel/
15                        http://sahabat-malam.cloud/
16   http://layananbantuan.dana-id-web.my.id/main.ph

**TXT FILES**

In [5]:
import os
import sys

# Increase the field size limit
csv.field_size_limit(sys.maxsize)

urls = df['Phishing URL']

# Function to extract HTML content from a URL
def extract_html(url):
    try:
        response = requests.get(url, timeout=10)  # Add timeout parameter (adjust as needed)
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while accessing {url}: {e}")
        return None

# Create a directory to store individual TXT files
txt_directory = 'txt_files'
os.makedirs(txt_directory, exist_ok=True)

# Iterate over the URLs and extract HTML content
for i, url in enumerate(urls):
    html = extract_html(url)

    if html is not None:
        # Specify the TXT file path
        txt_file = os.path.join(txt_directory, f'html_{i}.txt')

        # Save the HTML content as TXT
        with open(txt_file, 'w', encoding='utf-8') as file:
            file.write(html)

        print(f"HTML content of {url} saved as TXT: {txt_file}.")
    else:
        print(f"Skipping URL {url} due to an error.")

print("HTML content saved as TXT files in txt_files folder.")


HTML content of https://www.animagricola.farm/ssl/animagricola/Banks-ec/amJhbmtzQGJhbmtzLWVjLmNvbQ== saved as TXT: txt_files/html_0.txt.
HTML content of https://fcv2.xyz/?s=RDETMGxC saved as TXT: txt_files/html_1.txt.
HTML content of https://www.securembly.org/nam/1ab41d5a-8833-4af0-82be-e3c54d03997b/55914fdf-43ac-4604-b5a8-cb5be99f0a32/5e4a3c92-2b06-45fa-9b7a-4d20d50df3b6/login?id=MENYZUprQk9tMVZNaUpETTlrTDhjcy81a0ZRenBUQ3RISWRLdXFzQmo2VUxkby9TalR4UnJMOEdWSmdWMzBWVUpOSnpEaC9RTjdJWEN3engrZmJkeXJjV2R3dS9sa2EzVFpiQSsvYkVWNG1KbzdSLzE1OXZmZGMzb25lL3A5dDExbjlJRythdVQ0ZXRuY1BRQzRkbFZ0b0wvdVdZYzhkb2FmbkJPbnRGeGFzaEhnTEFtZzhyQ1NnSUplVmhHb2FrOVpnRkwrRW81L0RSTUJOWWlpUHFwVjQwVjBtMGJ3V2w3ZCtJRU1Yd0xWRlBSRndYRS9OUE51MStBcW9TTXRnWSt0aWYwNlY2UmJZNnZ5M2JGSFc1ZWJ1cjBITHJZUTZOMHhjNXRjY0srTnlTWWIvaDBJUS9BbVRHWnRKTkxSTDhoaEpQZ3JCTzZET0JYaGdJMDRRS3BSaE1yN1NvajhKVmFtTWc0ZW15Rk5JVmNJUWVKczI2R1lMcjIyZWFEVnNHajY5N3BlY3ZiK1JDUnd6dDIvL1Mvdz09 saved as TXT: txt_files/html_2.txt.
HTML content of https://elleganto.



HTML content of http://pub-955b4ab970884f6ebc951948e5966969.r2.dev/secondpage.html saved as TXT: txt_files/html_4.txt.
Error occurred while accessing http://24.75.29.68/enrollment: HTTPSConnectionPool(host='24.75.29.68', port=443): Max retries exceeded with url: /enrollment (Caused by SSLError(CertificateError("hostname '24.75.29.68' doesn't match either of 'nao.mtb.com', 'auth.mtb.com', 'm.mtb.com', 'pcl-auth.mtb.com', 'mls-auth.mtb.com'")))
Skipping URL http://24.75.29.68/enrollment due to an error.
HTML content of https://ipfs.io/ipfs/QmQqgUohatR5fDg25dYJc2eaXpoe9pq8YhQ8M9o7gtypRj?filename=dhl.html&data=YmFsQG1vcmFuc2hpcHBpbmcuY29t&subf=Transport-Label.pdf&file=Waybill.pdf saved as TXT: txt_files/html_6.txt.
HTML content of https://jade-hulking-trumpet.glitch.me/public/ntw.html?/NATWESTB.ANKCRE.CARD/info.htm saved as TXT: txt_files/html_7.txt.
HTML content of https://ipfs.io/ipfs/QmcQch1P6Po7peexKx7UDp7TazThKCu2pAsdnU9WSaUVta saved as TXT: txt_files/html_8.txt.
HTML content of https

In [6]:
import shutil
import os

# Specify the path of the "txt_files" folder in Colab
colab_folder_path = 'txt_files'

# Specify the destination path and filename for the ZIP file on your local machine
local_zip_path = '/content/txt_files.zip'

# Create a ZIP file containing the "txt_files" folder
shutil.make_archive('/content/txt_files', 'zip', colab_folder_path)

# Move the ZIP file to the desired location
shutil.move('/content/txt_files.zip', local_zip_path)

print("ZIP file created successfully.")

# Provide the download link for the ZIP file
print("Download the ZIP file from the following link:")
print(local_zip_path)


ZIP file created successfully.
Download the ZIP file from the following link:
/content/txt_files.zip
