# Sprint 2: Exploratory Data Analysis (EDA)

**Author:** Sergiu Ionut Pascaru (ID: 2310-111729)  
**Module:** CMP600 – Dissertation  
**Date:** December 2025  
**CRISP-DM Phase:** Data Understanding

---

## Objectives

1. Load and explore the Malicious URLs dataset from Kaggle
2. Analyze the distribution of URL categories (labels)
3. Analyze URL length characteristics
4. Identify data quality issues
5. Create visualizations for the dissertation

---

## 1. Import Libraries

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display: passing None lifts the column-count cap and the fixed display width
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Plot appearance: apply the grid style first, then the "husl" palette
# (the order matters: the later call sets the colour cycle that is kept)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Confirm the environment and record library versions for reproducibility
print("Libraries imported successfully!")
for library_label, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_label} version: {library.__version__}")

## 2. Load the Dataset

**Dataset:** Malicious URLs Dataset (Siddhartha, 2024)  
**Source:** https://www.kaggle.com/datasets/sid321axn/malicious-urls-dataset

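Make sure you have downloaded the dataset and placed `malicious_urls.csv` in the `data/raw/` folder. The notebook loads it from `../data/raw/malicious_urls.csv`, i.e. relative to the directory this notebook is run from.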

In [None]:
# Read the raw Malicious URLs CSV into the notebook-wide DataFrame `df`
raw_csv_path = '../data/raw/malicious_urls.csv'

print("Loading dataset...")
df = pd.read_csv(raw_csv_path)
print("Dataset loaded successfully!")

# Report the size of what was loaded
row_count, column_count = df.shape
print(f"\nDataset shape: {row_count:,} rows × {column_count} columns")

## 3. Initial Data Exploration

In [None]:
# Preview the first 10 rows beneath a banner heading
section_rule = "=" * 70
print(section_rule, "FIRST 10 ROWS OF THE DATASET", section_rule, sep="\n")
df.head(10)  # kept as the cell's final expression so the notebook renders the table

In [None]:
# Summarise the DataFrame's schema and in-memory footprint
section_rule = "=" * 70
print(section_rule, "DATASET INFORMATION", section_rule, sep="\n")

column_names = df.columns.tolist()
print(f"\nColumn names: {column_names}")

print("\nData types:")
print(df.dtypes)

# deep=True also counts the contents of object columns, not just the pointers
bytes_used = df.memory_usage(deep=True).sum()
print(f"\nMemory usage: {bytes_used / 1024**2:.2f} MB")

In [None]:
# Count nulls per column and report whether the dataset is complete
section_rule = "=" * 70
print(section_rule, "MISSING VALUES CHECK", section_rule, sep="\n")

missing = df.isnull().sum()
total_missing = missing.sum()  # computed once and reused in every message below

print("\nMissing values per column:")
print(missing)
print(f"\nTotal missing values: {total_missing}")

if total_missing != 0:
    print(f"\n⚠ Warning: {total_missing} missing values detected!")
else:
    print("\n✓ No missing values found - data is complete!")

## 4. Label Distribution Analysis

In [None]:
# Tabulate how many URLs carry each label, with percentage shares
section_rule = "=" * 70
print(section_rule, "LABEL DISTRIBUTION ANALYSIS", section_rule, sep="\n")

# Absolute counts and percentage shares; both are read again by the plotting cell
url_types = df['type']
label_counts = url_types.value_counts()
label_percentages = url_types.value_counts(normalize=True) * 100

# Format one aligned row per label, then print the rows
count_lines = [
    f"  {label:15s}: {count:>10,} ({label_percentages[label]:>6.2f}%)"
    for label, count in label_counts.items()
]

print("\nLabel counts:")
for line in count_lines:
    print(line)

print(f"\nTotal URLs: {len(df):,}")

In [None]:
# Create label distribution visualization: one bar per URL category, annotated
# with its count and percentage share, saved as a 300-dpi PNG.
# Reads `label_counts` and `label_percentages` computed in the previous cell.
from pathlib import Path

fig, ax = plt.subplots(figsize=(10, 6))

# Create bar chart (bars follow the order of `label_counts`)
colors = ['#2ecc71', '#e74c3c', '#f39c12', '#9b59b6']
bars = ax.bar(label_counts.index, label_counts.values, color=colors, edgecolor='black')

# Add value labels on bars. The gap above each bar is 1% of the tallest bar
# rather than a fixed count, so the labels stay inside the axes whatever the
# dataset size (e.g. when the notebook is run on a sample).
tallest_bar = label_counts.max()
label_gap = tallest_bar * 0.01
for bar, count, pct in zip(bars, label_counts.values, label_percentages.values):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_gap,
            f'{count:,}\n({pct:.1f}%)', ha='center', va='bottom', fontsize=10)

# Formatting
ax.set_xlabel('URL Category', fontsize=12)
ax.set_ylabel('Number of URLs', fontsize=12)
ax.set_title('Distribution of URL Categories in Malicious URLs Dataset\n(Siddhartha, 2024)',
             fontsize=14, fontweight='bold')
# 15% headroom above the tallest bar leaves room for the two-line value labels
ax.set_ylim(0, tallest_bar * 1.15)

# Add grid (drawn behind the bars)
ax.yaxis.grid(True, linestyle='--', alpha=0.7)
ax.set_axisbelow(True)

plt.tight_layout()

# Save the figure; create the output folder first so savefig cannot fail
# with FileNotFoundError on a fresh checkout
output_path = Path('../data/processed/01_label_distribution.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print("Figure saved to: data/processed/01_label_distribution.png")

plt.show()

## 5. URL Length Analysis

In [None]:
# Add URL length column for analysis and print its summary statistics
print("=" * 70)
print("URL LENGTH ANALYSIS")
print("=" * 70)

# Vectorised string length. Unlike `.apply(len)`, a missing URL (NaN) yields a
# NaN length instead of raising TypeError, and the pandas statistics below
# skip NaN by default.
df['url_length'] = df['url'].str.len()

url_lengths = df['url_length']

print("\nURL Length Statistics:")
print(f"  Minimum length:  {url_lengths.min():>10} characters")
print(f"  Maximum length:  {url_lengths.max():>10} characters")
print(f"  Mean length:     {url_lengths.mean():>10.2f} characters")
print(f"  Median length:   {url_lengths.median():>10.2f} characters")
print(f"  Std deviation:   {url_lengths.std():>10.2f} characters")

In [None]:
# Create URL length distribution visualization: a 100-bin histogram with the
# mean and median marked, saved as a 300-dpi PNG.
from pathlib import Path

fig, ax = plt.subplots(figsize=(12, 6))

# dropna() is a no-op for complete data; a NaN length would otherwise leave
# the histogram's bin range undefined
url_lengths = df['url_length'].dropna()

# Create histogram
ax.hist(url_lengths, bins=100, color='#3498db', edgecolor='black', alpha=0.7)

# Add vertical lines for mean and median
mean_len = url_lengths.mean()
median_len = url_lengths.median()

ax.axvline(mean_len, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_len:.1f}')
ax.axvline(median_len, color='green', linestyle='--', linewidth=2, label=f'Median: {median_len:.1f}')

# Formatting
ax.set_xlabel('URL Length (characters)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Distribution of URL Lengths in Malicious URLs Dataset', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)

# Add grid (drawn behind the bars)
ax.yaxis.grid(True, linestyle='--', alpha=0.7)
ax.set_axisbelow(True)

plt.tight_layout()

# Save the figure; create the output folder first so savefig cannot fail
# with FileNotFoundError on a fresh checkout
output_path = Path('../data/processed/02_url_length_distribution.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print("Figure saved to: data/processed/02_url_length_distribution.png")

plt.show()

In [None]:
# Per-category summary statistics of URL length, rounded to two decimals
section_rule = "=" * 70
print(section_rule, "URL LENGTH BY CATEGORY", section_rule, sep="\n")

summary_stats = ['mean', 'median', 'min', 'max', 'std']
length_by_type = (
    df.groupby('type')['url_length']
    .agg(summary_stats)
    .round(2)
)

print("\n")  # two line breaks of spacing before the table
print(length_by_type.to_string())

In [None]:
# Box plot of URL length by category, saved as a 300-dpi PNG.
from pathlib import Path

fig, ax = plt.subplots(figsize=(10, 6))

# Order categories by frequency (most common first). This is the ordering
# `value_counts()` gives the label-distribution bar chart, so each colour
# below denotes the same category in both figures. Without an explicit
# `order`, seaborn infers the category order from the data, which need not
# match the bar chart.
category_order = df['type'].value_counts().index.tolist()

# Create box plot using seaborn for better styling
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; confirm the installed version before switching to
# `hue='type', legend=False`.
colors = ['#2ecc71', '#e74c3c', '#f39c12', '#9b59b6']
sns.boxplot(data=df, x='type', y='url_length', order=category_order,
            palette=colors, ax=ax)

# Formatting
ax.set_xlabel('URL Category', fontsize=12)
ax.set_ylabel('URL Length (characters)', fontsize=12)
ax.set_title('URL Length Distribution by Category', fontsize=14, fontweight='bold')

plt.tight_layout()

# Save the figure; create the output folder first so savefig cannot fail
# with FileNotFoundError on a fresh checkout
output_path = Path('../data/processed/03_url_length_boxplot.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print("Figure saved to: data/processed/03_url_length_boxplot.png")

plt.show()

## 6. Summary and Key Findings

In [None]:
# Final EDA Summary.
# The column list, category list and completeness finding are derived from
# `df` instead of being hard-coded, so the printed summary cannot contradict
# the data actually loaded (e.g. claiming "no missing values" when some exist).
# Requires the `url_length` column added in the URL length analysis cell.
section_rule = "=" * 70
print("\n" + section_rule)
print("SPRINT 2 – EXPLORATORY DATA ANALYSIS SUMMARY")
print(section_rule)

total_missing = int(df.isnull().sum().sum())
column_list = ', '.join(map(str, df.columns))
type_counts = df['type'].value_counts()
category_list = ', '.join(map(str, type_counts.index))
url_lengths = df['url_length']

# Key finding 1 depends on whether any nulls were actually found
if total_missing == 0:
    completeness_finding = "Dataset is complete with no missing values"
else:
    completeness_finding = (
        f"Dataset has {total_missing:,} missing values to resolve during cleaning"
    )

print(f"""
DATASET OVERVIEW:
  - Source: Malicious URLs Dataset (Siddhartha, 2024) from Kaggle
  - Total records: {len(df):,} URLs
  - Columns: {df.shape[1]} ({column_list})
  - Missing values: {total_missing}

LABEL DISTRIBUTION:
""")
for label, count in type_counts.items():
    pct = count / len(df) * 100
    print(f"  - {label}: {count:,} ({pct:.2f}%)")

print(f"""
URL LENGTH CHARACTERISTICS:
  - Minimum: {url_lengths.min()} characters
  - Maximum: {url_lengths.max()} characters
  - Mean: {url_lengths.mean():.2f} characters
  - Median: {url_lengths.median():.2f} characters

VISUALIZATIONS CREATED:
  1. data/processed/01_label_distribution.png
  2. data/processed/02_url_length_distribution.png
  3. data/processed/03_url_length_boxplot.png

KEY FINDINGS:
  1. {completeness_finding}
  2. {len(type_counts)} URL categories: {category_list}
  3. URL lengths vary significantly, suggesting length could be a useful feature
  4. Different categories show different URL length patterns

NEXT STEPS (Sprint 3):
  - Convert multi-class labels to binary (benign vs malicious)
  - Clean and standardize URLs
  - Prepare data for feature engineering
""")
print(section_rule)
print("Sprint 2 EDA Complete!")
print(section_rule)

---

## End of Sprint 2 Notebook

**Next:** Sprint 3 – Data Cleaning and Label Binarization