In [1]:
! pip3 install requests python-dotenv zipfile36 python-magic



In [2]:
import os
import hashlib
import zipfile
import re
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
import requests
from pathlib import Path
import tempfile
import shutil
from IPython.display import display, HTML, Markdown
import warnings
warnings.filterwarnings('ignore')

print("📦 All packages imported successfully!")

📦 All packages imported successfully!


In [3]:
load_dotenv()

API_KEY = os.getenv("API_KEY")
if API_KEY is None:
    print("⚠️ Warning: API_KEY not found in .env file. MalwareBazaar queries will be limited.")
    print("💡 Create a .env file with: API_KEY=your_malwarebazaar_api_key")
else:
    print("✅ API key loaded successfully")

analysis_results = {
    'file_info': None,
    'hashes': {},
    'urls': [],
    'domains': [],
    'malicious_urls': [],
    'malicious_domains': [],
    'malware_found': False
}

print("🔧 Configuration complete!")

✅ API key loaded successfully
🔧 Configuration complete!


In [4]:
def get_apk_path_from_user():
    apk_path = input("📱 Enter the path to your APK file: ").strip()
    
    apk_path = apk_path.strip('"').strip("'")
    
    if not apk_path:
        raise ValueError("❌ Please enter a valid path")
        
    apk_path = Path(apk_path)
    
    if not apk_path.exists():
        raise FileNotFoundError(f"❌ File does not exist: {apk_path}")
        
    if not apk_path.suffix.lower() == '.apk':
        print("⚠️ Warning: File doesn't have .apk extension")
        
    return str(apk_path)

In [5]:
def calculate_file_hash(file_path, algorithm='sha256'):
    hash_obj = hashlib.new(algorithm)
    
    try:
        with open(file_path, 'rb') as file:
            for chunk in iter(lambda: file.read(4096), b""):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
    except Exception as e:
        print(f"❌ Error calculating {algorithm} hash: {e}")
        return None

In [6]:
def get_file_info(file_path):
    try:
        file_path = Path(file_path)
        file_size = file_path.stat().st_size
        file_name = file_path.name
        
        info = {
            'name': file_name,
            'size': file_size,
            'size_mb': file_size / (1024*1024),
            'path': str(file_path.absolute())
        }
        
        return info
    except Exception as e:
        print(f"❌ Error getting file info: {e}")
        return None

In [7]:
def extract_urls_from_apk(apk_path):
    print(f"🔍 Extracting URLs and endpoints from APK...")
    
    urls = set()
    domains = set()
    
    url_patterns = [
        r'https?://[^\s<>"\']+',
        r'http://[^\s<>"\']+',
        r'ws://[^\s<>"\']+',
        r'wss://[^\s<>"\']+',
        r'ftp://[^\s<>"\']+',
    ]
    
    domain_patterns = [
        r'[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?',
    ]
    
    api_patterns = [
        r'/api/[^\s<>"\']*',
        r'/v\d+/[^\s<>"\']*',
        r'/rest/[^\s<>"\']*',
        r'/graphql[^\s<>"\']*',
    ]
    
    try:
        with zipfile.ZipFile(apk_path, 'r') as apk_zip:
            file_list = apk_zip.namelist()
            files_to_check = []
            for file_name in file_list:
                if (file_name.endswith('.xml') or 
                    file_name.endswith('.json') or
                    file_name.endswith('.txt') or
                    file_name.endswith('.properties') or
                    'resources.arsc' in file_name or
                    file_name.endswith('.dex')):
                    files_to_check.append(file_name)
            
            print(f"📁 Examining {len(files_to_check)} files for URLs...")
            
            processed_files = 0
            for file_name in files_to_check:
                try:
                    with apk_zip.open(file_name) as file:
                        content = file.read()
                        text_content = ""
                        try:
                            text_content = content.decode('utf-8', errors='ignore')
                        except:
                            try:
                                text_content = content.decode('latin-1', errors='ignore')
                            except:
                                text_content = str(content)
                        for pattern in url_patterns:
                            matches = re.findall(pattern, text_content, re.IGNORECASE)
                            urls.update(matches)
                        for pattern in domain_patterns:
                            matches = re.findall(pattern, text_content, re.IGNORECASE)
                            for match in matches:
                                if ('.' in match and 
                                    not match.startswith('.') and
                                    not match.endswith('.') and
                                    len(match.split('.')) >= 2):
                                    domain = match.split('/')[0]
                                    if not any(char in domain for char in ['<', '>', '"', "'"]):
                                        domains.add(domain)
                        for pattern in api_patterns:
                            matches = re.findall(pattern, text_content, re.IGNORECASE)
                            for match in matches:
                                urls.add(match)
                    processed_files += 1
                    if processed_files % 10 == 0:
                        print(f"   Processed {processed_files}/{len(files_to_check)} files...")
                                
                except Exception as e:
                    continue
            cleaned_urls = set()
            for url in urls:
                url = url.strip('",\'();[]{}')
                url = re.sub(r'["\'>].*$', '', url)
                
                if (len(url) > 10 and 
                    not url.endswith(('.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.css', '.js')) and
                    '.' in url):
                    cleaned_urls.add(url)
            
            cleaned_domains = set()
            for domain in domains:
                domain = domain.strip('",\'();[]{}')
                if (len(domain) > 3 and 
                    '.' in domain and
                    not any(ext in domain.lower() for ext in ['.png', '.jpg', '.jpeg', '.gif', '.css', '.js', '.xml'])):
                    cleaned_domains.add(domain)
            
            print(f"✅ Extraction complete! Found {len(cleaned_urls)} URLs and {len(cleaned_domains)} domains")
            return list(cleaned_urls), list(cleaned_domains)
            
    except Exception as e:
        print(f"❌ Error extracting URLs from APK: {e}")
        return [], []

print("🔗 URL extraction functions loaded!")

🔗 URL extraction functions loaded!


In [8]:
def check_hash_in_malwarebazaar(hash_to_check):
    if API_KEY is None:
        print("⚠️ No API key found. Skipping MalwareBazaar check.")
        return False
        
    url = "https://mb-api.abuse.ch/api/v1/"
    headers = {
        "Auth-Key": API_KEY.strip(),  # Clean any whitespace
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "query": "get_info",
        "hash": hash_to_check
    }
    
    print(f"🔍 Checking hash in MalwareBazaar...")
    
    try:
        response = requests.post(url, headers=headers, data=data, timeout=30)
        
        if response.status_code != 200:
            print(f"❌ API request failed with status code: {response.status_code}")
            return False
        
        json_data = response.json()
        
        # Check if query_status is "ok" - this means the hash was found (MALICIOUS)
        if json_data.get("query_status") == "ok":
            print("⚠️ ALERT: Hash FOUND in MalwareBazaar!")
            print("="*50)
            
            # Get the data array
            data_array = json_data.get("data")
            if data_array and len(data_array) > 0:
                # Process the first entry (there should typically be only one)
                entry = data_array[0]
                print(f"📦 File size: {entry.get('file_size', 'N/A')} bytes")
                print(f"📦 File name: {entry.get('file_name', 'N/A')}")
                print(f"📦 Origin country: {entry.get('origin_country', 'N/A')}")
                print(f"📦 First seen: {entry.get('first_seen', 'N/A')}")
                print(f"📦 Last seen: {entry.get('last_seen', 'N/A')}")
                print(f"📦 Malware family: {entry.get('malware_family', 'N/A')}")
                print(f"📦 Signature: {entry.get('signature', 'N/A')}")
                print(f"📦 Tags: {entry.get('tags', 'N/A')}")
                print(f"📦 File type: {entry.get('file_type', 'N/A')}")
                print(f"📦 Reporter: {entry.get('reporter', 'N/A')}")
                
                # Show vendor intelligence if available
                vendor_intel = entry.get('vendor_intel', {})
                if vendor_intel:
                    print(f"📦 Vendor Intelligence:")
                    for vendor, info in vendor_intel.items():
                        if isinstance(info, dict):
                            verdict = info.get('verdict', info.get('status', 'N/A'))
                            print(f"   • {vendor}: {verdict}")
                        elif isinstance(info, list) and info:
                            print(f"   • {vendor}: {info[0].get('detection', 'N/A')}")
                
                print("="*50)
                
                # Set the global flag
                analysis_results['malware_found'] = True
                return True
            else:
                print("⚠️ Hash found but no detailed data available")
                analysis_results['malware_found'] = True
                return True
        
        # If query_status is not "ok", check for specific error messages
        elif json_data.get("query_status") == "no_results":
            print("✅ Hash NOT found in MalwareBazaar (Good - not known malware)")
            return False
        elif json_data.get("query_status") == "wrong_auth_key":
            print("❌ Authentication failed - wrong API key")
            print("💡 Please check your .env file and make sure API_KEY is correct")
            return False
        elif json_data.get("query_status") == "illegal_hash":
            print("❌ Invalid hash format provided")
            return False
        else:
            # Handle other possible statuses
            query_status = json_data.get("query_status", "unknown")
            print(f"⚠️ Unexpected query status: {query_status}")
            return False
        
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
        return False
    except Exception as e:
        print(f"❌ Error checking MalwareBazaar: {e}")
        return False

In [9]:
def check_url_in_urlhaus(url_to_check):
    api_url = "https://urlhaus-api.abuse.ch/v1/url/"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "url": url_to_check
    }
    
    try:
        response = requests.post(api_url, headers=headers, data=data, timeout=10)
        if response.status_code != 200:
            return False, f"API request failed with status code: {response.status_code}"
        
        json_data = response.json()
        if json_data.get("query_status", "").lower() == "ok":
            return True, {
                "id": json_data.get("id", "N/A"),
                "url_status": json_data.get("url_status", "N/A"),
                "host": json_data.get("host", "N/A"),
                "date_added": json_data.get("date_added", "N/A"),
                "last_online": json_data.get("last_online", "N/A"),
                "threat": json_data.get("threat", "N/A"),
                "tags": json_data.get("tags", "N/A")
            }
        else:
            return False, "URL not found in database"
            
    except Exception as e:
        return False, f"Error: {e}"

In [10]:
def analyze_apk_file():
    global analysis_results
    
    print("🚀 APK Security Analyzer")
    print("="*60)
    
    try:
        apk_path = get_apk_path_from_user()
        
        print(f"\n📱 Getting file information...")
        file_info = get_file_info(apk_path)
        if not file_info:
            return
        
        analysis_results['file_info'] = file_info
        
        print(f"📁 File: {file_info['name']}")
        print(f"📏 Size: {file_info['size']:,} bytes ({file_info['size_mb']:.2f} MB)")
        print(f"📍 Path: {file_info['path']}")
        
        print(f"\n🔐 Calculating file hashes...")
        
        hash_algorithms = ['sha256', 'md5', 'sha1']
        for algorithm in hash_algorithms:
            hash_value = calculate_file_hash(apk_path, algorithm)
            if hash_value:
                analysis_results['hashes'][algorithm] = hash_value
                print(f"📋 {algorithm.upper()}: {hash_value}")
        
        if not analysis_results['hashes']:
            print("❌ Failed to calculate any hashes")
            return
        
        if 'sha256' in analysis_results['hashes']:
            check_hash_in_malwarebazaar(analysis_results['hashes']['sha256'])
        
        print(f"\n🔗 Extracting URLs and endpoints...")
        urls, domains = extract_urls_from_apk(apk_path)
        
        analysis_results['urls'] = urls
        analysis_results['domains'] = domains
        
        print(f"✅ Analysis preparation complete!")
        
    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        return None
    
    return apk_path

In [11]:
apk_file_path = analyze_apk_file()

if apk_file_path:
    print("\n" + "="*60)
    print("📊 ANALYSIS SUMMARY")
    print("="*60)
    
    if analysis_results['file_info']:
        info = analysis_results['file_info']
        print(f"📱 File: {info['name']}")
        print(f"📏 Size: {info['size']:,} bytes ({info['size_mb']:.2f} MB)")
    
    print(f"\n🔐 File Hashes:")
    for algorithm, hash_value in analysis_results['hashes'].items():
        print(f"   {algorithm.upper()}: {hash_value}")
    
    print(f"\n🔗 Extracted Data:")
    print(f"   URLs found: {len(analysis_results['urls'])}")
    print(f"   Domains found: {len(analysis_results['domains'])}")
    
    if analysis_results['malware_found']:
        print(f"\n⚠️ SECURITY ALERT: This APK is flagged as malware!")
    else:
        print(f"\n✅ APK hash not found in malware database")

🚀 APK Security Analyzer

📱 Getting file information...
📁 File: GlamLive.apk
📏 Size: 11,908,947 bytes (11.36 MB)
📍 Path: /Users/lu77_u/Documents/Git/Xandronyx.ML/Hash and Url Checker/Sample APKs/GlamLive.apk

🔐 Calculating file hashes...
📋 SHA256: a3abf59afe735fec1b2b1599ea411906674a58df563d8ebc779e8127a1b05795
📋 MD5: 8dc9371887cc615ce57c46e5994a3328
📋 SHA1: 5f621f706272583e615eb0472b59a519b81c1396
🔍 Checking hash in MalwareBazaar...
⚠️ ALERT: Hash FOUND in MalwareBazaar!
📦 File size: 11908947 bytes
📦 File name: GlamLive.apk
📦 Origin country: IN
📦 First seen: 2025-06-07 16:04:18
📦 Last seen: None
📦 Malware family: N/A
📦 Signature: None
📦 Tags: ['android', 'apk', 'gamelive', 'malware', 'signed']
📦 File type: apk
📦 Reporter: mohit
📦 Vendor Intelligence:
   • InQuest: MALICIOUS
   • ReversingLabs: SUSPICIOUS
   • Spamhaus_HBL: suspicious

🔗 Extracting URLs and endpoints...
🔍 Extracting URLs and endpoints from APK...
📁 Examining 633 files for URLs...
   Processed 10/633 files...
   Processe

In [12]:
def check_extracted_urls():
    if not analysis_results['urls'] and not analysis_results['domains']:
        print("❌ No URLs or domains to check")
        return
    
    print(f"🔍 Checking URLs and domains against URLhaus database...")
    print("="*60)
    
    malicious_urls = []
    malicious_domains = []
    
    if analysis_results['urls']:
        print(f"🔗 Checking {len(analysis_results['urls'])} URLs...")
        
        urls_to_check = analysis_results['urls'][:20]
        
        for i, url in enumerate(urls_to_check, 1):
            print(f"[{i}/{len(urls_to_check)}] Checking: {url[:60]}...")
            
            is_malicious, result = check_url_in_urlhaus(url)
            if is_malicious:
                malicious_urls.append((url, result))
                print(f"   ⚠️ MALICIOUS: {url}")
            else:
                print(f"   ✅ Clean: {url[:40]}...")
    
    if analysis_results['domains']:
        print(f"\n🌐 Checking {len(analysis_results['domains'])} domains...")
        
        domains_to_check = analysis_results['domains'][:15]
        
        for i, domain in enumerate(domains_to_check, 1):
            test_url = f"http://{domain}"
            print(f"[{i}/{len(domains_to_check)}] Checking: {domain}...")
            
            is_malicious, result = check_url_in_urlhaus(test_url)
            if is_malicious:
                malicious_domains.append((domain, result))
                print(f"   ⚠️ MALICIOUS: {domain}")
            else:
                print(f"   ✅ Clean: {domain}")
    
    analysis_results['malicious_urls'] = malicious_urls
    analysis_results['malicious_domains'] = malicious_domains
    
    return malicious_urls, malicious_domains

if analysis_results['urls'] or analysis_results['domains']:
    malicious_urls, malicious_domains = check_extracted_urls()
else:
    print("⚠️ No URLs or domains found in APK to check")
    malicious_urls, malicious_domains = [], []

🔍 Checking URLs and domains against URLhaus database...
🔗 Checking 285 URLs...
[1/20] Checking: http://schemas.android.com/apk/res-auto **http://schemas.and...
   ✅ Clean: http://schemas.android.com/apk/res-auto ...
[2/20] Checking: http://schemas.android.com/apk/res/android path vector  ...
   ✅ Clean: http://schemas.android.com/apk/res/andro...
[3/20] Checking: http://schemas.android.com/apk/res/android   D   4   ...
   ✅ Clean: http://schemas.android.com/apk/res/andro...
[4/20] Checking: http://schemas.android.com/apk/res/android  8   4   ...
   ✅ Clean: http://schemas.android.com/apk/res/andro...
[5/20] Checking: http://schemas.android.com/apk/res/android  ...
   ✅ Clean: http://schemas.android.com/apk/res/andro...
[6/20] Checking: http://schemas.android.com/apk/res/android        ...
   ✅ Clean: http://schemas.android.com/apk/res/andro...
[7/20] Checking: http://schemas.android.com/aapt **http://schemas.android.com...
   ✅ Clean: http://schemas.android.com/aapt

In [13]:
def display_detailed_results():
    """Display comprehensive analysis results"""
    print("📊 DETAILED SECURITY ANALYSIS RESULTS")
    print("="*80)
    
    if analysis_results['malicious_urls']:
        print(f"\n⚠️ MALICIOUS URLs FOUND ({len(analysis_results['malicious_urls'])}):")
        print("="*50)
        for url, details in analysis_results['malicious_urls']:
            print(f"\n🚨 URL: {url}")
            print(f"   🆔 ID: {details['id']}")
            print(f"   📌 Status: {details['url_status']}")
            print(f"   🌐 Host: {details['host']}")
            print(f"   📅 Date Added: {details['date_added']}")
            print(f"   ⚠️ Threat: {details['threat']}")
            print(f"   🏷️ Tags: {details['tags']}")
    
    if analysis_results['malicious_domains']:
        print(f"\n⚠️ MALICIOUS DOMAINS FOUND ({len(analysis_results['malicious_domains'])}):")
        print("="*50)
        for domain, details in analysis_results['malicious_domains']:
            print(f"\n🚨 DOMAIN: {domain}")
            print(f"   🆔 ID: {details['id']}")
            print(f"   📌 Status: {details['url_status']}")
            print(f"   ⚠️ Threat: {details['threat']}")
            print(f"   🏷️ Tags: {details['tags']}")
    
    if not analysis_results['malicious_urls'] and not analysis_results['malicious_domains']:
        print("\n✅ NO MALICIOUS URLs OR DOMAINS FOUND!")
        print("   All extracted URLs and domains appear to be clean.")
    
    print(f"\n📋 EXTRACTED URLs (showing first 10 of {len(analysis_results['urls'])}):")
    for i, url in enumerate(analysis_results['urls'][:10], 1):
        print(f"   {i}. {url}")
    if len(analysis_results['urls']) > 10:
        print(f"   ... and {len(analysis_results['urls']) - 10} more URLs")
    
    print(f"\n📋 EXTRACTED DOMAINS (showing first 10 of {len(analysis_results['domains'])}):")
    for i, domain in enumerate(analysis_results['domains'][:10], 1):
        print(f"   {i}. {domain}")
    if len(analysis_results['domains']) > 10:
        print(f"   ... and {len(analysis_results['domains']) - 10} more domains")

display_detailed_results()

📊 DETAILED SECURITY ANALYSIS RESULTS

✅ NO MALICIOUS URLs OR DOMAINS FOUND!
   All extracted URLs and domains appear to be clean.

📋 EXTRACTED URLs (showing first 10 of 285):
   1. http://schemas.android.com/apk/res-auto **http://schemas.android.com/apk/res/android merge skip     0        Y_rvt        
   2. http://schemas.android.com/apk/res/android path vector    $   UY          
   3. http://schemas.android.com/apk/res/android   D   4          ]               L                                    
   4. http://schemas.android.com/apk/res/android  8   4         ?]                     
   5. http://schemas.android.com/apk/res/android  
   6. http://schemas.android.com/apk/res/android        
   7. http://schemas.android.com/aapt **http://schemas.android.com/apk/res/android target                       

In [14]:
def generate_security_summary():
    print("🛡️ SECURITY SUMMARY & RECOMMENDATIONS")
    print("="*80)
    
    risk_level = "LOW"
    risk_factors = []
    
    if analysis_results['malware_found']:
        risk_level = "CRITICAL"
        risk_factors.append("APK hash found in malware database")
    
    if analysis_results['malicious_urls']:
        if risk_level == "LOW":
            risk_level = "HIGH"
        risk_factors.append(f"{len(analysis_results['malicious_urls'])} malicious URLs found")
    
    if analysis_results['malicious_domains']:
        if risk_level == "LOW":
            risk_level = "MEDIUM"
        risk_factors.append(f"{len(analysis_results['malicious_domains'])} malicious domains found")
    
    if risk_level == "CRITICAL":
        print("🚨 RISK LEVEL: CRITICAL")
        print("   ⛔ DO NOT INSTALL THIS APK!")
    elif risk_level == "HIGH":
        print("⚠️ RISK LEVEL: HIGH")
        print("   🔴 Strongly recommend NOT installing this APK")
    elif risk_level == "MEDIUM":
        print("⚠️ RISK LEVEL: MEDIUM")
        print("   🟡 Exercise caution before installing")
    else:
        print("✅ RISK LEVEL: LOW")
        print("   🟢 APK appears to be clean")
    
    if risk_factors:
        print(f"\n📋 Risk Factors:")
        for factor in risk_factors:
            print(f"   • {factor}")
    
    print(f"\n📊 ANALYSIS STATISTICS:")
    print(f"   • Total URLs extracted: {len(analysis_results['urls'])}")
    print(f"   • Total domains extracted: {len(analysis_results['domains'])}")
    print(f"   • Malicious URLs found: {len(analysis_results['malicious_urls'])}")
    print(f"   • Malicious domains found: {len(analysis_results['malicious_domains'])}")
    print(f"   • Malware signature match: {'YES' if analysis_results['malware_found'] else 'NO'}")
    
    print(f"\n✅ Analysis completed successfully!")
    print("="*80)

generate_security_summary()

🛡️ SECURITY SUMMARY & RECOMMENDATIONS
🚨 RISK LEVEL: CRITICAL
   ⛔ DO NOT INSTALL THIS APK!

📋 Risk Factors:
   • APK hash found in malware database

📊 ANALYSIS STATISTICS:
   • Total URLs extracted: 285
   • Total domains extracted: 191
   • Malicious URLs found: 0
   • Malicious domains found: 0
   • Malware signature match: YES

✅ Analysis completed successfully!
