In [1]:
from collections import Counter
from collections import defaultdict
import json
import os
from pathlib import Path
import pickle
import subprocess
import tarfile
import time
import timeit
from typing import Dict, Iterator, List, Tuple

import magic
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from yara_scanner import YaraScanner

%load_ext memory_profiler

In [2]:
# Initialize the YARA scanner and load the PyPI malware-checks rules.
# The relative rule path is resolved against the current working directory.
pypi_malware_checks_rule_path = os.path.abspath("../scanners/pypi-malware-checks/setup_py_rules.yara")
scanner = YaraScanner()
scanner.track_yara_file(pypi_malware_checks_rule_path)
# Last expression of the cell, so its return value is what the notebook displays.
scanner.load_rules()

True

In [3]:
# Measure how long the PyPI malware checks take on each package
def calculate_runtimes(package_releases_path, rules_path="../scanners/pypi-malware-checks/setup_py_rules.yara"):
    """Time the external ``scan`` command on every package directory.

    Args:
        package_releases_path: Directory whose immediate sub-directories are
            the packages to scan. Plain files in it are ignored.
        rules_path: YARA rule file handed to ``scan -y``. Defaults to the
            PyPI malware-checks rules used throughout this notebook.

    Returns:
        dict mapping each sub-directory name to the elapsed time of its scan,
        in seconds.

    Raises:
        FileNotFoundError: if the ``scan`` executable is not on ``PATH``
            (raised by ``subprocess.call``).
    """
    # Close the scandir iterator deterministically and keep the entry name,
    # so the package name does not depend on the path separator.
    with os.scandir(package_releases_path) as entries:
        packages = [(entry.name, entry.path) for entry in entries if entry.is_dir()]

    runtimes = {}
    for package_name, package_dir in tqdm(packages):
        # perf_counter is monotonic; wall-clock time can jump during a long run.
        start = time.perf_counter()
        subprocess.call(
            ["scan", "-r", package_dir, "-y", rules_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT)
        runtimes[package_name] = time.perf_counter() - start
    return runtimes

In [4]:
# Scan the Python files of every package with the loaded YARA rules
def scanning_packages(package_releases_path) -> Dict:
    """Scan every ``.py`` file below each package directory and collect alerts.

    The package name is the first path component below
    ``package_releases_path``, so the result does not depend on where the
    dataset is checked out.

    Args:
        package_releases_path: Root directory containing one sub-directory
            per package.

    Returns:
        A ``defaultdict(list)`` keyed by package name. Each scanned file
        contributes either the entries of ``scanner.scan_results`` or, when
        nothing matched, one placeholder ``{"target": <file name>, "rule": ""}``.
        Files whose scan raised are reported on stdout and skipped.
    """
    root = Path(package_releases_path)
    results = defaultdict(list)
    for subdir, _dirs, files in tqdm(os.walk(package_releases_path)):
        relative_parts = Path(subdir).relative_to(root).parts
        if not relative_parts:
            # Files sitting directly in the dataset root belong to no package.
            continue
        package_name = relative_parts[0]
        for file in files:
            # Only scan Python files to reduce noise; some packages ship none at all.
            if Path(file).suffix != '.py':
                continue
            file_path = os.path.join(subdir, file)
            try:
                scanner.scan(file_path)
            except Exception as e:
                # Some .py files contain binary data that yara-scanner cannot handle.
                print(package_name, file_path, e)
                continue
            scan_results = scanner.scan_results
            if scan_results:
                results[package_name].extend(scan_results)
            else:
                # No alerts: keep a placeholder row so the file is still counted.
                results[package_name].append({"target": file, 'rule': ""})
    return results

In [None]:
# Parse the scan results into one row per result entry
def parse_analysis_results(packages_scanning_results: Dict[str, List[Dict]]) -> Iterator[Tuple[str, str, str, int]]:
    """Yield ``(package, target, rule, has_rule)`` for every result entry.

    Args:
        packages_scanning_results: Mapping of package name to a list of
            result dicts, each with at least the keys ``"target"`` and
            ``"rule"``.

    Yields:
        A 4-tuple per entry. ``has_rule`` is 1 when the entry's rule is
        truthy and 0 otherwise; a falsy rule is reported as ``""``.
    """
    for package, files in packages_scanning_results.items():
        for file_ in files:
            rule = file_['rule'] or ""
            yield (package, file_["target"], rule, 1 if rule else 0)

In [None]:
# Collect the matched subpatterns of the packages
def collect_subpatterns(packages_scanning_results: Dict[str, List[Dict]]) -> Iterator[Tuple]:
    """Yield ``(package, target, rule, subpattern)`` for every matched string.

    Args:
        packages_scanning_results: Mapping of package name to a list of
            result dicts. Only entries that carry a ``"strings"`` key
            contribute rows.

    Yields:
        One 4-tuple per item of an entry's ``"strings"`` list, using the
        item's element at index 1 as the subpattern.
    """
    for package, files in packages_scanning_results.items():
        for file_ in files:
            # Per the original author's note each item is
            # (line number, subpattern, line content) — TODO confirm against
            # the yara_scanner output; only index 1 is used here.
            for match in file_.get('strings', ()):
                yield (package, file_["target"], file_['rule'], match[1])

## Analyzing Malicious packages

In [None]:
# Path to the malicious packages dataset (relative path resolved against the current working directory)
malicious_packages_path = os.path.abspath("../dataset/malicious-packages/")

In [None]:
# Time the PyPI malware checks on every malicious package
# (the helper defined in this notebook is named calculate_runtimes)
malicious_running_times = calculate_runtimes(malicious_packages_path)

In [None]:
# One row per package with the elapsed time of its scan
malicious_packages_running_times_df = pd.DataFrame(list(malicious_running_times.items()), columns=['package', 'running_time'])
# Saving the runtime info
malicious_packages_running_times_df.to_csv("../results/running_times/pypi-malware-checks/malicious-packages.csv")
# Summary statistics of the running times (displayed as the cell output)
malicious_packages_running_times_df["running_time"].describe()

In [None]:
# Re-run the scan of the malicious packages; this takes a while.
# NOTE(review): the previous comment said "Uncomment to re-run", but this line is live and
# the following cell replaces its result with the pickled one — confirm which is intended.
malicious_packages_scanning_results = scanning_packages(malicious_packages_path)

In [None]:
# Load the existing results in case we do not want to rescan the packages.
# NOTE: pickle.load can execute arbitrary code; only load result files you produced yourself.
with open(os.path.abspath(os.path.join("..", "results", "pypi-malware-checks", "malicious_packages.pkl")), 'rb') as fp:
    malicious_packages_scanning_results = pickle.load(fp)

### Triggered rules and subpatterns in all Python files in malicious packages

In [None]:
# Turn the (package, target, rule, has_rule) rows into a DataFrame.
# The generator is consumed by the DataFrame constructor.
malicious_packages_rules = parse_analysis_results(malicious_packages_scanning_results)
malicious_packages_rules_df = pd.DataFrame(
    data=malicious_packages_rules,
    columns=["package", "target", "rule", "has_rule"],
)

In [None]:
# Number of rules per package: has_rule is 1 for a triggered rule and 0 for a clean file,
# so the per-package sum counts triggered rules (packages without alerts sum to 0)
malicious_packages_rules_groupby = malicious_packages_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
malicious_packages_rules_groupby.sum().describe()

In [None]:
# Keep only the rows that triggered a rule, i.e. packages with at least one alert
malicious_packages_has_rules_df = malicious_packages_rules_df.loc[
    malicious_packages_rules_df['has_rule'].ne(0)
]

In [None]:
# Number of rules per package, restricted to the rows that triggered a rule
malicious_packages_has_rules_groupby = malicious_packages_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_has_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
malicious_packages_has_rules_groupby.sum().describe()

In [None]:
# Turn the (package, target, rule, subpattern) rows into a DataFrame.
# The generator is consumed by the DataFrame constructor.
malicious_packages_subpatterns = collect_subpatterns(malicious_packages_scanning_results)
malicious_packages_subpatterns_df = pd.DataFrame(
    data=malicious_packages_subpatterns,
    columns=["package", "target", "rule", "subpattern"],
)

In [None]:
# Number of subpatterns per package (one row per matched subpattern, so count() tallies the matches)
malicious_packages_subpatterns_groupby = malicious_packages_subpatterns_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {malicious_packages_subpatterns_groupby.count().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
malicious_packages_subpatterns_groupby.count().describe()

### Triggered rules and subpatterns in all setup.py files in malicious packages

In [None]:
# Selecting only setup.py files; regex=False so the "." is matched literally
# instead of as a regular-expression wildcard.
# NOTE: this is still a substring test, so any target containing "setup.py" is kept.
malicious_packages_rules_setup_df = malicious_packages_rules_df[
    malicious_packages_rules_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Number of rules per package, counting setup.py targets only
malicious_packages_rules_setup_groupby = malicious_packages_rules_setup_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_rules_setup_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
malicious_packages_rules_setup_groupby.sum().describe()

In [None]:
# Packages whose setup.py triggered at least one alert
malicious_packages_setup_has_rules_df = malicious_packages_rules_setup_df.loc[
    malicious_packages_rules_setup_df['has_rule'].ne(0)
]

In [None]:
# Number of rules per package, restricted to setup.py rows that triggered a rule
malicious_packages_setup_has_rules_groupby = malicious_packages_setup_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_setup_has_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
malicious_packages_setup_has_rules_groupby.sum().describe()

In [None]:
# Subpattern rows for setup.py targets; regex=False so the "." is matched literally.
# NOTE: this is still a substring test, so any target containing "setup.py" is kept.
malicious_packages_subpatterns_setup_df = malicious_packages_subpatterns_df[
    malicious_packages_subpatterns_df['target'].str.contains('setup.py', regex=False)
]

## Analyzing Popular packages

In [None]:
# Path to the popular packages dataset (relative path resolved against the current working directory)
popular_packages_path = os.path.abspath("../dataset/popular-packages")

In [None]:
# Time the PyPI malware checks on every popular package
# (the helper defined in this notebook is named calculate_runtimes)
popular_running_times = calculate_runtimes(popular_packages_path)

In [None]:
# One row per package with the elapsed time of its scan
popular_packages_running_times_df = pd.DataFrame(list(popular_running_times.items()), columns=['package', 'running_time'])
# Summary statistics of the running times (displayed as the cell output)
popular_packages_running_times_df["running_time"].describe()

In [None]:
# Saving the runtime info of the popular packages
popular_packages_running_times_df.to_csv("../results/running_times/pypi-malware-checks/popular-packages.csv")

In [None]:
# Load the existing results in case we do not want to rescan the packages.
# NOTE: pickle.load can execute arbitrary code; only load result files you produced yourself.
with open(os.path.abspath(os.path.join("..", "results", "pypi-malware-checks", "popular_packages.pkl")), 'rb') as fp:
    popular_packages_scanning_results = pickle.load(fp)

### Triggered rules and subpatterns in all Python files in popular packages

In [None]:
# Turn the (package, target, rule, has_rule) rows into a DataFrame.
# The generator is consumed by the DataFrame constructor.
popular_packages_rules = parse_analysis_results(popular_packages_scanning_results)
popular_packages_rules_df = pd.DataFrame(
    data=popular_packages_rules,
    columns=["package", "target", "rule", "has_rule"],
)

In [None]:
# Number of rules per package: has_rule is 1 for a triggered rule and 0 for a clean file,
# so the per-package sum counts triggered rules (packages without alerts sum to 0)
popular_packages_rules_groupby = popular_packages_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
popular_packages_rules_groupby.sum().describe()

In [None]:
# Keep only the rows that triggered a rule, i.e. popular packages with at least one alert
popular_packages_has_rules_df = popular_packages_rules_df.loc[
    popular_packages_rules_df['has_rule'].ne(0)
]

In [None]:
# Number of rules per package, restricted to the rows that triggered a rule
popular_packages_has_rules_groupby = popular_packages_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_has_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
popular_packages_has_rules_groupby.sum().describe()

In [None]:
# Turn the (package, target, rule, subpattern) rows into a DataFrame.
# The generator is consumed by the DataFrame constructor.
popular_packages_subpatterns = collect_subpatterns(popular_packages_scanning_results)
popular_packages_subpatterns_df = pd.DataFrame(
    data=popular_packages_subpatterns,
    columns=["package", "target", "rule", "subpattern"],
)

In [None]:
# Number of subpatterns per package (one row per matched subpattern, so count() tallies the matches)
popular_packages_subpatterns_groupby = popular_packages_subpatterns_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {popular_packages_subpatterns_groupby.count().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
popular_packages_subpatterns_groupby.count().describe()

### Triggered rules and subpatterns in all setup.py files in popular packages

In [None]:
# Selecting only setup.py files; regex=False so the "." is matched literally
# instead of as a regular-expression wildcard.
# NOTE: this is still a substring test, so any target containing "setup.py" is kept.
popular_packages_rules_setup_df = popular_packages_rules_df[
    popular_packages_rules_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Number of rules per package, counting setup.py targets only
popular_packages_rules_setup_groupby = popular_packages_rules_setup_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_rules_setup_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
popular_packages_rules_setup_groupby.sum().describe()

In [None]:
# Popular packages whose setup.py triggered at least one alert
popular_packages_setup_has_rules_df = popular_packages_rules_setup_df.loc[
    popular_packages_rules_setup_df['has_rule'].ne(0)
]

In [None]:
# Number of rules per package, restricted to setup.py rows that triggered a rule
popular_packages_setup_has_rules_groupby = popular_packages_setup_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_setup_has_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
popular_packages_setup_has_rules_groupby.sum().describe()

In [None]:
# Ratio of true positives to false positives for the thresholds 1..5
# NOTE(review): get_tpr_fpr is not defined in the visible part of this notebook — confirm where it comes from.
thresholds = [1, 2, 3, 4, 5]
# Per-package rule counts for popular packages whose setup.py triggered at least one rule
scores = popular_packages_setup_has_rules_groupby.sum().to_list()
# NOTE(review): this list is never filled or read in the visible cells (the name also misspells "thresholds")
thesholds_tpr_fpr_ratio = []
for t in thresholds:
    tp, fp = get_tpr_fpr(scores, t)
    print(t, tp, fp)

## Analyzing Random packages

In [None]:
# Path to the random packages dataset (relative path resolved against the current working directory)
random_packages_path = os.path.abspath("../dataset/random-packages/")

In [None]:
# Time the PyPI malware checks on every random package
# (the helper defined in this notebook is named calculate_runtimes)
random_running_times = calculate_runtimes(random_packages_path)
# One row per package with the elapsed time of its scan
random_packages_running_times_df = pd.DataFrame(list(random_running_times.items()), columns=['package', 'running_time'])

In [None]:
# Saving the runtime info of the random packages
random_packages_running_times_df.to_csv("../results/running_times/pypi-malware-checks/random-packages.csv")

In [None]:
# Summary statistics of the running times (displayed as the cell output)
random_packages_running_times_df["running_time"].describe()

In [None]:
# Re-run the scan of the random packages; this takes a while.
# Uses the scanning_packages helper and random_packages_path defined earlier in this notebook.
random_packages_scanning_results = scanning_packages(random_packages_path)

### Triggered rules and subpatterns in all Python files in random packages

In [None]:
# Load the existing results.
# NOTE: pickle.load can execute arbitrary code; only load result files you produced yourself.
with open(os.path.abspath(os.path.join("..", "results", "pypi-malware-checks", "random_packages.pkl")), 'rb') as fp:
    random_packages_scanning_results = pickle.load(fp)

In [None]:
# Turn the (package, target, rule, has_rule) rows into a DataFrame.
# The generator is consumed by the DataFrame constructor.
random_packages_rules = parse_analysis_results(random_packages_scanning_results)
random_packages_rules_df = pd.DataFrame(
    data=random_packages_rules,
    columns=["package", "target", "rule", "has_rule"],
)

In [None]:
# Number of rules per package: has_rule is 1 for a triggered rule and 0 for a clean file,
# so the per-package sum counts triggered rules (packages without alerts sum to 0)
random_packages_rules_groupby = random_packages_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
random_packages_rules_groupby.sum().describe()

In [None]:
# Keep only the rows that triggered a rule, i.e. random packages with at least one alert
random_packages_has_rules_df = random_packages_rules_df.loc[
    random_packages_rules_df['has_rule'].ne(0)
]

In [None]:
# Number of rules per package, restricted to the rows that triggered a rule
random_packages_has_rules_groupby = random_packages_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_has_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
random_packages_has_rules_groupby.sum().describe()

In [None]:
# Turn the (package, target, rule, subpattern) rows into a DataFrame.
# The generator is consumed by the DataFrame constructor.
random_packages_subpatterns = collect_subpatterns(random_packages_scanning_results)
random_packages_subpatterns_df = pd.DataFrame(
    data=random_packages_subpatterns,
    columns=["package", "target", "rule", "subpattern"],
)

In [None]:
# Number of subpatterns per package (one row per matched subpattern, so count() tallies the matches)
random_packages_subpatterns_groupby = random_packages_subpatterns_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {random_packages_subpatterns_groupby.count().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
random_packages_subpatterns_groupby.count().describe()

### Triggered rules and subpatterns in all setup.py files in random packages

In [None]:
# Selecting only setup.py files; regex=False so the "." is matched literally
# instead of as a regular-expression wildcard.
# NOTE: this is still a substring test, so any target containing "setup.py" is kept.
random_packages_rules_setup_df = random_packages_rules_df[
    random_packages_rules_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Number of rules per package, counting setup.py targets only
random_packages_rules_setup_groupby = random_packages_rules_setup_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_rules_setup_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
random_packages_rules_setup_groupby.sum().describe()

In [None]:
# Random packages whose setup.py triggered at least one alert
random_packages_setup_has_rules_df = random_packages_rules_setup_df.loc[
    random_packages_rules_setup_df['has_rule'].ne(0)
]

In [None]:
# Number of rules per package, restricted to setup.py rows that triggered a rule
random_packages_rules_setup_has_rules_groupby = random_packages_setup_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_rules_setup_has_rules_groupby.sum().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
random_packages_rules_setup_has_rules_groupby.sum().describe()

In [None]:
# Subpattern rows for setup.py targets; regex=False so the "." is matched literally.
# NOTE: this is still a substring test, so any target containing "setup.py" is kept.
random_packages_subpatterns_setup_df = random_packages_subpatterns_df[
    random_packages_subpatterns_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Number of subpatterns per package, counting setup.py targets only
random_packages_subpatterns_setup_groupby = random_packages_subpatterns_setup_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {random_packages_subpatterns_setup_groupby.count().sum()}")
# Distribution of the per-package counts (displayed as the cell output)
random_packages_subpatterns_setup_groupby.count().describe()

In [None]:
# Saving the raw result for faster analysis next time
# NOTE(review): this writes to a different file name than the "random_packages.pkl" that the
# loading cell of this section reads — confirm which file later runs should use.
with open(os.path.abspath("../results/pypi-malware-checks/random_packages_scanning_results_pypi_malwarechecks.pkl"), 'wb') as fp:
    pickle.dump(random_packages_scanning_results, fp, protocol=pickle.HIGHEST_PROTOCOL)