In [27]:
import pandas as pd

# Load your dataset
df = pd.read_csv("data/phiusiil.csv")

# View the column names and first few rows
print(df.columns)
print(df.head())


Index(['URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP', 'TLD',
       'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb',
       'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation',
       'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL',
       'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL',
       'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL',
       'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS',
       'LineOfCode', 'LargestLineLength', 'HasTitle', 'Title',
       'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon', 'Robots',
       'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription',
       'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet',
       'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay',
       'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS',
       'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef', 'labe

In [28]:
df['label'] = df['category'].map({
    'legitimate': 1,
    'phishing': 0
})
df = df.dropna(subset=['Label'])  # Drop rows with NaN labels


KeyError: 'category'

In [None]:
import pandas as pd

df = pd.read_csv("data/phiusiil.csv")
print(df.columns)


In [None]:
import pandas as pd

df = pd.read_csv("data/phiusiil.csv")  # or correct path

print(df['label'].value_counts())


In [None]:


df.rename(columns={'label': 'Label'}, inplace=True)




In [None]:
df = df.dropna(subset=['Label'])


In [36]:
from sklearn.utils import resample

df_legit = df[df['Label'] == 1]
df_phish = df[df['Label'] == 0]

min_len = min(len(df_legit), len(df_phish))

df_legit_bal = resample(df_legit, replace=False, n_samples=min_len, random_state=42)
df_phish_bal = resample(df_phish, replace=False, n_samples=min_len, random_state=42)

df_balanced = pd.concat([df_legit_bal, df_phish_bal])
df_balanced = df_balanced.sample(frac=1, random_state=42)


KeyError: 'Label'

In [35]:
from src.features import extract_url_features

features = []
labels = []

for url, label in zip(df_balanced['URL'], df_balanced['Label']):
    try:
        feats = extract_url_features(url)
        features.append(feats)
        labels.append(label)
    except:
        continue  # Skip invalid URLs

X = pd.DataFrame(features)
y = pd.Series(labels)


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use class weights to improve performance
from sklearn.utils import class_weight
weights = class_weight.compute_sample_weight('balanced', y_train)

model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train, sample_weight=weights)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.99      1.00     20250
           1       0.99      1.00      1.00     20128

    accuracy                           1.00     40378
   macro avg       1.00      1.00      1.00     40378
weighted avg       1.00      1.00      1.00     40378



In [40]:
test_url = "https://www.google.com"
feat = extract_url_features(test_url)
result = model.predict(pd.DataFrame([feat]))[0]
print("🟢 Legitimate" if result == 1 else "🔴 Phishing")


🟢 Legitimate


In [42]:
model.predict(pd.DataFrame([extract_url_features("https://www.google.com")]))
model.predict(pd.DataFrame([extract_url_features("http://freebitco.in.win-prize.xyz")]))


array([0])

In [47]:
import pickle
pickle.dump(model, open("saved_models/phishing_model.pkl", "wb"))
