In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("URL_dataset.csv")
df

Unnamed: 0,url,type
0,https://www.google.com,legitimate
1,https://www.youtube.com,legitimate
2,https://www.facebook.com,legitimate
3,https://www.baidu.com,legitimate
4,https://www.wikipedia.org,legitimate
...,...,...
465868,https://bill.com/login,legitimate
465869,https://waveapps.com/login,legitimate
465870,https://intuit.com/login,legitimate
465871,https://xero.com/login,legitimate


In [3]:
df.describe

<bound method NDFrame.describe of                                  url        type
0             https://www.google.com  legitimate
1            https://www.youtube.com  legitimate
2           https://www.facebook.com  legitimate
3              https://www.baidu.com  legitimate
4          https://www.wikipedia.org  legitimate
...                              ...         ...
465868        https://bill.com/login  legitimate
465869    https://waveapps.com/login  legitimate
465870      https://intuit.com/login  legitimate
465871        https://xero.com/login  legitimate
465872  https://freshbooks.com/login  legitimate

[465873 rows x 2 columns]>

In [4]:
df["num_label"] = df["type"].map({"legitimate": 0, "phishing": 1})

In [5]:
url = df["url"]
type = df["num_label"]

In [6]:
url_train, url_test, type_train, type_test = train_test_split(
    url, type, test_size=0.2, random_state=42
)

In [7]:
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=40000)
url_train = vectorizer.fit_transform(url_train)
url_test = vectorizer.transform(url_test)

In [8]:
logistic_regression = LogisticRegression(max_iter=500, solver="liblinear")
logistic_regression.fit(url_train, type_train)


In [9]:
prediction = logistic_regression.predict(url_test)

In [10]:
print(f"Accuracy (Logistic Regression): {accuracy_score(type_test, prediction) * 100}")

Accuracy (Logistic Regression): 99.67909847061979


In [11]:
examples = [
    "https://www.apple.com",
    "http://paypal-login-security-verify-user.com",
    "https://github.com",
    "http://claim-free-iphone-now.com"
]

In [12]:
example_vectors = vectorizer.transform(examples)
predictions = logistic_regression.predict(example_vectors)

In [13]:
for url, i in zip(examples,  predictions):
    print(f"{url} is {'Phishing' if i == 1 else 'Legitimate'}")

https://www.apple.com is Legitimate
http://paypal-login-security-verify-user.com is Phishing
https://github.com is Legitimate
http://claim-free-iphone-now.com is Phishing


In [14]:
# Random Forest

In [15]:
random_forest = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
random_forest.fit(url_train, type_train)

In [16]:
rf_prediction = random_forest.predict(url_test)
print(f"Accuracy (Random Forest): {accuracy_score(type_test, rf_prediction) * 100}")

Accuracy (Random Forest): 99.79608264019319
