#####Step 1 → Setup Environment
#####Step 2 → Load Phishing Dataset
#####Step 3 → Feature Engineering (URL)
#####Step 4 → Train ML Model
#####Step 5 → Build Risk Scoring Function
#####Step 6 → QR Code Detection
#####Step 7 → Integrate URL + QR
#####Step 8 → Test Real-Time Inputs
#####Step 9 → Save Model
#####Step 10 → (Optional Advanced) Deploy API

In [5]:
!pip install tldextract
!pip install pyzbar
!pip install opencv-python
!pip install python-whois
!pip install xgboost



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd

df = pd.read_csv("/content/url_features_extracted1.csv")
df.head()

Unnamed: 0,URL,url_length,has_ip_address,dot_count,https_flag,url_entropy,token_count,subdomain_count,query_param_count,tld_length,path_length,has_hyphen_in_domain,number_of_digits,tld_popularity,suspicious_file_extension,domain_name_length,percentage_numeric_chars,ClassLabel
0,https://keraekken-loagginnusa.godaddysites.com/,47,0,2,1,4.250669,6,1,1,3,1,1,0,1,0,12,0.0,0.0
1,https://metamsk01lgiix.godaddysites.com/,40,0,2,1,4.196439,6,1,1,3,1,0,2,1,0,12,5.0,0.0
2,http://myglobaltech.in/,23,0,1,0,3.93618,5,0,1,2,1,0,0,0,0,12,0.0,0.0
3,http://djtool-for-spotify.com/,30,0,1,0,3.89474,5,0,1,3,1,1,0,1,0,18,0.0,0.0
4,https://scearmcoommunnlty.com/invent/freind/get,47,0,1,1,4.143127,7,0,1,3,18,0,0,1,0,17,0.0,0.0


1️⃣ Load Dataset  
2️⃣ Data Cleaning  
3️⃣ EDA (Basic Analysis)  
4️⃣ Train ML Model  
5️⃣ Evaluate Model  
6️⃣ Risk Score Function  
7️⃣ Real-time URL Testing  
8️⃣ Save Model  

In [8]:
df.shape
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101219 entries, 0 to 101218
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   URL                        101219 non-null  object 
 1   url_length                 101219 non-null  int64  
 2   has_ip_address             101219 non-null  int64  
 3   dot_count                  101219 non-null  int64  
 4   https_flag                 101219 non-null  int64  
 5   url_entropy                101219 non-null  float64
 6   token_count                101219 non-null  int64  
 7   subdomain_count            101219 non-null  int64  
 8   query_param_count          101219 non-null  int64  
 9   tld_length                 101219 non-null  int64  
 10  path_length                101219 non-null  int64  
 11  has_hyphen_in_domain       101219 non-null  int64  
 12  number_of_digits           101219 non-null  int64  
 13  tld_popularity             10

Unnamed: 0,url_length,has_ip_address,dot_count,https_flag,url_entropy,token_count,subdomain_count,query_param_count,tld_length,path_length,has_hyphen_in_domain,number_of_digits,tld_popularity,suspicious_file_extension,domain_name_length,percentage_numeric_chars,ClassLabel
count,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101219.0,101218.0
mean,35.060996,0.484879,3.007775,0.405645,3.976112,6.885733,1.511515,1.010591,5.00327,8.538249,0.042897,8.508669,0.328288,0.146613,6.292218,24.136487,0.370883
std,16.805678,0.499774,0.946612,0.491019,0.307068,1.670271,0.624755,0.202794,2.917429,14.70175,0.202626,8.540811,0.469593,0.353721,5.342714,22.49026,0.483043
min,7.0,0.0,0.0,0.0,2.521641,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,0.0,2.0,0.0,3.770942,5.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
50%,31.0,0.0,3.0,0.0,3.937193,7.0,2.0,1.0,3.0,2.0,0.0,10.0,0.0,0.0,3.0,19.642857,0.0
75%,35.0,1.0,4.0,1.0,4.100817,8.0,2.0,1.0,8.0,7.0,0.0,15.0,1.0,0.0,10.0,47.058824,1.0
max,474.0,1.0,19.0,1.0,5.871503,48.0,5.0,14.0,10.0,317.0,1.0,164.0,1.0,1.0,36.0,65.957447,1.0


In [11]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# -----------------------------
# 1. Load URL Dataset
# -----------------------------
df_url = pd.read_csv("/content/url_features_extracted1.csv")

# Drop rows with NaN values and remove the 'URL' column
df_url.dropna(inplace=True)
df_url = df_url.drop("URL", axis=1)

X = df_url.drop("ClassLabel", axis=1)
y = df_url["ClassLabel"]

# -----------------------------
# 2. Train Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 3. Train Model
# -----------------------------
model_url = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss"
)

model_url.fit(X_train, y_train)

# -----------------------------
# 4. Evaluation
# -----------------------------
pred = model_url.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

# -----------------------------
# 5. Save Model
# -----------------------------
pickle.dump(model_url, open("url_fraud_model.pkl", "wb"))

print("URL Model Saved ✅")

Accuracy: 0.9994566291246789
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12749
         1.0       1.00      1.00      1.00      7495

    accuracy                           1.00     20244
   macro avg       1.00      1.00      1.00     20244
weighted avg       1.00      1.00      1.00     20244

URL Model Saved ✅


In [12]:
print(X.shape)
print(X.columns)

(101218, 16)
Index(['url_length', 'has_ip_address', 'dot_count', 'https_flag',
       'url_entropy', 'token_count', 'subdomain_count', 'query_param_count',
       'tld_length', 'path_length', 'has_hyphen_in_domain', 'number_of_digits',
       'tld_popularity', 'suspicious_file_extension', 'domain_name_length',
       'percentage_numeric_chars'],
      dtype='object')


In [13]:
import os
print(os.listdir())

['.config', 'merged_url_dataset.csv', 'url_features_extracted1.csv', 'url_fraud_model.pkl', 'drive', 'sample_data']


1️⃣ Data Preprocessing  
2️⃣ Class Imbalance Handling  
3️⃣ Advanced Model (XGBoost Tuned)  
4️⃣ Hyperparameter Tuning  
5️⃣ SHAP Explainability  
6️⃣ Risk Engine  
7️⃣ Save + Deployment Ready Model  

In [18]:
df["ClassLabel"].value_counts()

Unnamed: 0_level_0,count
ClassLabel,Unnamed: 1_level_1
0.0,63678
1.0,37540


In [25]:
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# -----------------------------
# 1. Load Dataset
# -----------------------------
df = pd.read_csv("/content/merged_email_sms_spam_dataset.csv")


# Convert label
df["label"] = df["label"].map({"ham": 0, "spam": 1})

# -----------------------------
# 2. Text Cleaning
# -----------------------------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    return text

df["clean_text"] = df["text"].apply(clean_text)

# -----------------------------
# 3. TF-IDF
# -----------------------------
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

# -----------------------------
# 4. Train Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 5. Balanced XGBoost
# -----------------------------
scale_pos_weight = 3455 / 6820  # ≈ 0.51

model_text = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight
)

model_text.fit(X_train, y_train)

# -----------------------------
# 6. Evaluation
# -----------------------------
pred = model_text.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

# -----------------------------
# 7. Save Model
# -----------------------------
pickle.dump(model_text, open("text_fraud_model.pkl", "wb"))
pickle.dump(vectorizer, open("text_vectorizer.pkl", "wb"))

print("Model Saved Successfully ✅")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9781021897810219
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       681
           1       0.98      0.99      0.98      1374

    accuracy                           0.98      2055
   macro avg       0.98      0.97      0.98      2055
weighted avg       0.98      0.98      0.98      2055

Model Saved Successfully ✅
