In [1]:
import os
# Move to the project root: every relative path in this notebook (and,
# presumably, the `src.*` imports below) is resolved from there.
# The location is machine-specific, so it can be overridden by setting the
# M2V_PROJECT_ROOT environment variable; when it is unset the original
# author's path is used, exactly as before.
os.chdir(os.environ.get('M2V_PROJECT_ROOT',
                        '/home/rcgonzal/DSC180Malware/m2v-adversarial-hindroid/'))

import pandas as pd
import numpy as np

from src.model.model import M2VDroid
from src.model.hindroid import Hindroid
from src.data.hindroid_etl import make_models
from src.analysis.analysis import create_performance_table
from src.utils import find_apps

%load_ext autoreload
%autoreload 2

# Purpose
This notebook walks a user through how to use this package in some detail. Note: all paths are relative to the project directory unless they begin with the root indicator, i.e. `/`.

# Data Selection
We assume you already have access to Android apps decompiled into their Smali representations. If you have not done this, please look into how to use Apktool and Smali to decompile Android APKs (we may provide a script in the future). What we do provide is the `find_apps` function which, given a directory, will recursively look for decompiled apps and return a DataFrame with their locations. This is how the `app_list.csv` file begins. 

In [2]:
# Recursively discover the decompiled apps under the bundled test data and
# show the resulting table of app locations.
test_apps = find_apps('test/testdata/')
test_apps

Unnamed: 0_level_0,app_dir
app,Unnamed: 1_level_1
testapp1,test/testdata/testapp1
testapp2,test/testdata/testapp2


In some cases, like the file `data/out/all-apps/app_list.csv`, we add more columns to this table, such as which category an app is from and whether it is malware or not, so that we can label our examples.

In [3]:
# Load the master app list, indexed by app name. Every column is read as a
# string, so the `malware` flag holds '0'/'1' rather than integers.
all_apps = pd.read_csv(
    'data/out/all-apps/app_list.csv',
    index_col='app',
    dtype=str,
)
all_apps

Unnamed: 0_level_0,app_dir,category,malware
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
com.kaktus.hyungkaktus,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
com.wedup.duduamzaleg,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
com.dublin_mobile123.cheat_gta_5,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
com.appall.optimizationbox,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
live.wallpaper.t910001560,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
...,...,...,...
com.nytimes.android,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps,0
com.tinytouchtales.alchi,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps,0
com.mycelium.wallet,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps,0
com.aceviral.smashycity,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps,0


**Aside:** `all-apps` is a special folder in our `data/out` directory because it houses all apps -- and their API data in `app-data`! When parsed in our ETL, each app is extracted into its own `.csv` containing every API call made within it, making it easy to pick and choose which apps we want just by knowing their names (or md5s for malware). 

With that said, let's return to selecting our data. We want to split our data into stratified halves, each with equal amounts of benign apps and malware. We also have a category, `random-apps`, whose labels we do not know and which we must therefore drop from our dataset.

In [4]:
# Apps in the `random-apps` category have no known label, so leave them out
# of the split entirely.
all_apps = all_apps.loc[all_apps.category != 'random-apps']

# Stratified sample: draw half of each `malware` class with a fixed seed.
# `group_keys=False` keeps the original `app` index on the result, so the
# only clean-up needed afterwards is dropping the label column.
training_sample = (
    all_apps
    .groupby('malware', group_keys=False)
    .apply(lambda class_rows: class_rows.sample(frac=0.5, random_state=42))
    .drop(columns='malware')
)
training_sample

Unnamed: 0_level_0,app_dir,category
app,Unnamed: 1_level_1,Unnamed: 2_level_1
com.hcg.cok.gp,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.glu.wrestling,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.tmusic.christmassongs,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.han.dominoes,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.jetappfactory.jetaudio,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
...,...,...
7280d6d74716513369c3a8b8f1d94676,/teams/DSC180A_FA20_A00/a04malware/malware/Ban...,malware
5d59c7c74c7133d94b8a257d749c823a,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
3a54c9c23e49c0c67185d22ad2cbfc58,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
ba6633d214a4e85cb157acb8da9054c1,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware


In [5]:
# The test half is every remaining labelled app that was not drawn into the
# training sample (Index.difference returns the labels in sorted order).
held_out_apps = all_apps.index.difference(training_sample.index)
testing_sample = all_apps.loc[held_out_apps, ['app_dir', 'category']]
testing_sample

Unnamed: 0_level_0,app_dir,category
app,Unnamed: 1_level_1,Unnamed: 2_level_1
00268453be254779f0c7590de47db944,/teams/DSC180A_FA20_A00/a04malware/malware/Dro...,malware
002a7270ec52ec68ea3d979c85261308,/teams/DSC180A_FA20_A00/a04malware/malware/Ban...,malware
0030e0003b7226e9142683e49b41a423,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
00335946abb79777f9fe2d0d96651e03,/teams/DSC180A_FA20_A00/a04malware/malware/Vid...,malware
0038be31cfed95e13a33d87142eada70,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
...,...,...
org.edx.mobile,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
org.mozilla.firefox,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
org.videolan.vlc,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
pps.christmas.photo.frames,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps


In [6]:
# Each half gets its own output directory containing an `app_list.csv`.

# Training half.
os.makedirs('data/out/train-half', exist_ok=True)
training_sample.to_csv('data/out/train-half/app_list.csv')

# Testing half.
os.makedirs('data/out/test-half', exist_ok=True)
testing_sample.to_csv('data/out/test-half/app_list.csv')

Now we must train a model on the training set. To do that, we must run the ETL pipeline on that directory. Therefore, we set `config/etl-params/etl-params.json` as shown below and then execute `python run.py data`. *This may take a few hours to run, especially the random walks!*

```json
{
    "outfolder": "data/out/train-half",
    "parse_params": {
        "nprocs": 16
    },
    "feature_params": {
        "redo": false,
        "walk_args": {
            "nprocs": 16,
            "length": 60,
            "n": 3,
            "metapaths": [
                ["app", "api", "app"],
                ["app", "api", "method", "api", "app"],
                ["app", "api", "package", "api", "app"],
                ["app", "api", "package", "api", "method", "api", "app"],
                ["app", "api", "method", "api", "package", "api", "app"]
            ]
        },
        "w2v_args": {
            "size": 128,
            "window": 7,
            "min_count": 0,
            "negative": 5,
            "sg": 1,
            "workers": 16,
            "iter": 5
        }
    },
    "hindroid_params": {
        "redo": false
    }
}
```

In [14]:
# Run the ETL pipeline as a shell command (`python run.py data`), using the
# settings in config/etl-params/etl-params.json; %time reports how long it took.
%time !python run.py data

2021-02-16 01:24:54.385458: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-02-16 01:24:54.385500: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-02-16 01:24:56.891521: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-02-16 01:24:56.894470: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-02-16 01:24:56.923293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:61:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.582GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidt

In [7]:
# Fit the Hindroid kernel models on the training half. Per the output below,
# five kernels are fitted (AAT, ABAT, APAT, ABPBTAT, APBPTAT) and the run took
# about 1h 21min of wall time on the author's machine.
%time make_models('data/out/train-half/')

Fitting models:


  0%|          | 0/30 [00:00<?, ?it/s]

	Fitting AAT model...


100%|██████████| 30/30 [00:28<00:00,  1.07it/s]
  0%|          | 0/30 [00:00<?, ?it/s]

	Fitting ABAT model...


100%|██████████| 30/30 [06:26<00:00, 12.88s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

	Fitting APAT model...


100%|██████████| 30/30 [00:50<00:00,  1.70s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

	Fitting ABPBTAT model...


100%|██████████| 30/30 [45:00<00:00, 90.02s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

	Fitting APBPTAT model...


100%|██████████| 30/30 [28:04<00:00, 56.16s/it]


              acc    recall        f1
kernel                               
AAT      1.000000  1.000000  1.000000
ABAT     0.997603  0.999275  0.998732
APAT     1.000000  1.000000  1.000000
ABPBTAT  1.000000  1.000000  1.000000
APBPTAT  0.988699  0.998187  0.994042
CPU times: user 8h 6min 28s, sys: 38min 55s, total: 8h 45min 24s
Wall time: 1h 21min 26s


From here, we can create the models we will use. Note that we included `"hindroid_params"` in the config file; therefore, we also fitted a Hindroid model on the data. We will also describe how to use that class, though both models are largely the same.

In [27]:
# Build the M2VDroid model from the training-half output folder, passing the
# classifier settings through, then show the name the model reports.
m2vDroid = M2VDroid(
    'data/out/train-half/',
    classifier_args=dict(max_depth=3, n_jobs=-1),
)
m2vDroid.name

'train-half'

In [4]:
# Random-walk settings; same values as "walk_args" in the ETL config above.
walk_args = {
    "nprocs": 16,
    "length": 60,
    "n": 3,
    "metapaths": [
        ["app", "api", "app"],
        ["app", "api", "method", "api", "app"],
        ["app", "api", "package", "api", "app"],
        ["app", "api", "package", "api", "method", "api", "app"],
        ["app", "api", "method", "api", "package", "api", "app"],
    ],
}

# Word2Vec settings; same values as "w2v_args" in the ETL config above.
w2v_args = {
    "size": 128,
    "window": 7,
    "min_count": 0,
    "negative": 5,
    "sg": 1,
    "workers": 16,
    "iter": 5,
}

# Predict on the test half; besides returning the prediction table, this
# also saves it to a folder.
m2vDroid.fit_predict('data/out/test-half/', walk_args=walk_args, w2v_args=w2v_args)

Computing new edges
<stellargraph.core.graph.StellarGraph object at 0x7fc904946bb0>
Running random walk
Running Word2Vec
Fitting model


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


              precision    recall  f1-score   support

           0       1.00      0.73      0.85       162
           1       0.98      1.00      0.99      2758

    accuracy                           0.99      2920
   macro avg       0.99      0.87      0.92      2920
weighted avg       0.99      0.99      0.98      2920



Unnamed: 0_level_0,m2vDroid,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1
00268453be254779f0c7590de47db944,1,1
002a7270ec52ec68ea3d979c85261308,1,1
0030e0003b7226e9142683e49b41a423,1,1
00335946abb79777f9fe2d0d96651e03,1,1
0038be31cfed95e13a33d87142eada70,1,1
...,...,...
org.edx.mobile,0,0
org.mozilla.firefox,0,0
org.videolan.vlc,0,0
pps.christmas.photo.frames,0,0


In [4]:
# NOTE(review): `hindroid` is only created in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run; run the Hindroid cell first.
#
# Chain the products with the matrices' own `.dot` rather than `np.dot`:
# NumPy's `dot` is not aware of SciPy sparse matrices (the result shown below
# is sparse), whereas `.dot` always performs a proper matrix product.
hindroid.A.dot(hindroid.B).dot(hindroid.P)

<2920x2822990 sparse matrix of type '<class 'numpy.float32'>'
	with 4424651554 stored elements in Compressed Sparse Row format>

In [2]:
# Quick smoke test: load the Hindroid model trained on the training half and
# predict on the small `test-sample` folder (6 apps in the output below).
hindroid = Hindroid('data/out/train-half/')
hindroid.fit_predict('data/out/test-sample/')

Computing unique APIs per app


Building A-test matrix: 100%|██████████| 6/6 [00:00<00:00, 83.69it/s]


Making predictions


Predicting AAT, batch:   0%|          | 0/1 [00:00<?, ?it/s]




Predicting AAT, batch: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]
Predicting ABAT, batch:   0%|          | 0/1 [00:00<?, ?it/s]




Predicting ABAT, batch: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it]
Predicting APAT, batch:   0%|          | 0/1 [00:00<?, ?it/s]




Predicting APAT, batch: 100%|██████████| 1/1 [00:00<00:00,  1.52it/s]
Predicting ABPBTAT, batch:   0%|          | 0/1 [00:00<?, ?it/s]




Predicting ABPBTAT, batch: 100%|██████████| 1/1 [00:22<00:00, 22.30s/it]
Predicting APBPTAT, batch:   0%|          | 0/1 [00:00<?, ?it/s]




Predicting APBPTAT, batch: 100%|██████████| 1/1 [00:20<00:00, 20.06s/it]

AAT:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         5

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

ABAT:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         5

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

APAT:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         5

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

AB




Unnamed: 0_level_0,AAT,ABAT,APAT,ABPBTAT,APBPTAT,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
635a57a483cda858f78f14386e76aab4,1,1,1,1,1,1
912054e230f08f3747c2966d3f92944f,1,1,1,1,1,1
29b4865171cdfad2a6f011614a1a8038,1,1,1,1,1,1
2a92f33fa4b5af4e61d70eb15a28030d,1,1,1,1,1,1
35ecbda726e1e56467bf8b0e0dbe2c2a,1,1,1,1,1,1
com.sfeehha.bubble419,0,0,0,0,0,0


In [3]:
# Full evaluation of the Hindroid kernels on the held-out test half.
# This is slow: per the progress bars below, the ABPBTAT and APBPTAT kernels
# alone took roughly 1h45m and 1h07m respectively.
hindroid = Hindroid('data/out/train-half/')
%time hindroid.fit_predict('data/out/test-half/')
# Quick alternative on the tiny sample folder (left disabled):
# hindroid.fit_predict('data/out/test-sample/')

Computing unique APIs per app


Building A-test matrix: 100%|██████████| 2920/2920 [00:48<00:00, 59.69it/s] 
Batch:   0%|          | 0/6 [00:00<?, ?it/s]

Making predictions
Predicting AAT


Batch: 100%|██████████| 6/6 [00:11<00:00,  1.96s/it]
Batch:   0%|          | 0/6 [00:00<?, ?it/s]

Predicting ABAT


Batch: 100%|██████████| 6/6 [09:40<00:00, 96.80s/it] 
Batch:   0%|          | 0/6 [00:00<?, ?it/s]

Predicting APAT


Batch: 100%|██████████| 6/6 [00:31<00:00,  5.33s/it]
Batch:   0%|          | 0/6 [00:00<?, ?it/s]

Predicting ABPBTAT


Batch: 100%|██████████| 6/6 [1:45:40<00:00, 1056.67s/it]
Batch:   0%|          | 0/6 [00:00<?, ?it/s]

Predicting APBPTAT


Batch: 100%|██████████| 6/6 [1:07:24<00:00, 674.15s/it]


AAT:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       162
           1       1.00      1.00      1.00      2758

    accuracy                           1.00      2920
   macro avg       0.99      0.97      0.98      2920
weighted avg       1.00      1.00      1.00      2920

ABAT:
              precision    recall  f1-score   support

           0       0.65      0.92      0.76       162
           1       1.00      0.97      0.98      2758

    accuracy                           0.97      2920
   macro avg       0.82      0.95      0.87      2920
weighted avg       0.98      0.97      0.97      2920

APAT:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88       162
           1       0.99      1.00      0.99      2758

    accuracy                           0.99      2920
   macro avg       0.98      0.91      0.94      2920
weighted avg       0.99      0.99      0.99      2920

AB

Unnamed: 0_level_0,AAT,ABAT,APAT,ABPBTAT,APBPTAT,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00268453be254779f0c7590de47db944,1,1,1,1,1,1
002a7270ec52ec68ea3d979c85261308,1,1,1,1,1,1
0030e0003b7226e9142683e49b41a423,1,1,1,1,1,1
00335946abb79777f9fe2d0d96651e03,1,1,1,1,1,1
0038be31cfed95e13a33d87142eada70,1,1,1,1,1,1
...,...,...,...,...,...,...
org.edx.mobile,0,0,0,0,0,0
org.mozilla.firefox,0,0,1,0,0,0
org.videolan.vlc,0,0,0,0,0,0
pps.christmas.photo.frames,0,0,0,0,0,0


Now that the code has finished running, let us view the results.

In [2]:
# Run the Hindroid kernels over every app in `all-apps`.
# NOTE(review): `all-apps` is the list the training half was sampled from, so
# it includes the apps the model was trained on; these scores are therefore
# not a held-out estimate. It also appears to include the `random-apps`
# category that was dropped from the split for lacking labels -- confirm.
# Very slow: the ABPBTAT kernel alone took about 3h45m in the output below.
hindroid = Hindroid('data/out/train-half/')
%time hindroid.fit_predict('data/out/all-apps/')
# Quick alternative on the tiny sample folder (left disabled):
# hindroid.fit_predict('data/out/test-sample/')

Computing unique APIs per app


Building A-test matrix: 100%|██████████| 6072/6072 [02:09<00:00, 46.79it/s]  


Making predictions


Predicting AAT, batch:   0%|          | 0/13 [00:00<?, ?it/s]




Predicting AAT, batch: 100%|██████████| 13/13 [00:28<00:00,  2.21s/it]
Predicting ABAT, batch:   0%|          | 0/13 [00:00<?, ?it/s]




Predicting ABAT, batch: 100%|██████████| 13/13 [23:15<00:00, 107.38s/it]
Predicting APAT, batch:   0%|          | 0/13 [00:00<?, ?it/s]




Predicting APAT, batch: 100%|██████████| 13/13 [01:13<00:00,  5.69s/it]
Predicting ABPBTAT, batch:   0%|          | 0/13 [00:00<?, ?it/s]




Predicting ABPBTAT, batch: 100%|██████████| 13/13 [3:45:05<00:00, 1038.89s/it] 
Predicting APBPTAT, batch:   0%|          | 0/13 [00:00<?, ?it/s]




Predicting APBPTAT, batch: 100%|██████████| 13/13 [2:17:54<00:00, 636.46s/it] 


AAT:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92       556
           1       0.99      1.00      0.99      5516

    accuracy                           0.99      6072
   macro avg       0.99      0.93      0.96      6072
weighted avg       0.99      0.99      0.99      6072

ABAT:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       556
           1       0.99      0.99      0.99      5516

    accuracy                           0.98      6072
   macro avg       0.92      0.94      0.93      6072
weighted avg       0.98      0.98      0.98      6072

APAT:
              precision    recall  f1-score   support

           0       0.99      0.79      0.88       556
           1       0.98      1.00      0.99      5516

    accuracy                           0.98      6072
   macro avg       0.98      0.90      0.93      6072
weighted avg       0.98      0.98      0.98      6072

AB

Unnamed: 0_level_0,AAT,ABAT,APAT,ABPBTAT,APBPTAT,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
com.kaktus.hyungkaktus,0,0,0,0,0,0
com.wedup.duduamzaleg,0,1,0,0,0,0
com.dublin_mobile123.cheat_gta_5,0,0,0,0,0,0
com.appall.optimizationbox,1,0,1,0,1,0
live.wallpaper.t910001560,1,0,1,0,0,0
...,...,...,...,...,...,...
com.nytimes.android,0,0,0,0,0,0
com.tinytouchtales.alchi,0,0,0,0,0,0
com.mycelium.wallet,0,0,0,0,0,0
com.aceviral.smashycity,0,0,0,0,0,0


In [33]:
# Combine the saved m2vDroid and Hindroid predictions for the test half into
# one performance table. The third path is presumably where the table is
# written -- verify against `create_performance_table`.
m2v_predictions_csv = 'data/out/test-half/m2v-train-half/predictions.csv'
hindroid_predictions_csv = 'data/out/test-half/train-half_HD_predictions.csv'
performance_chart_csv = 'reports/assets/baseline_performance_chart.csv'

create_performance_table(
    m2v_predictions_csv,
    hindroid_predictions_csv,
    performance_chart_csv,
)

Unnamed: 0,ACC,TPR,F1,TP,TN,FP,FN
m2vDroid,0.985274,1.0,0.992265,2758,119,43,0
AAT,0.995548,0.99855,0.997645,2754,153,9,4
ABAT,0.968493,0.971356,0.983119,2679,149,13,79
APAT,0.988014,0.998187,0.993683,2753,132,30,5
ABPBTAT,0.995548,0.997099,0.997642,2750,157,5,8
APBPTAT,0.97089,0.974257,0.984429,2687,148,14,71


In [None]:
# NOTE(review): scratch cell that was never executed (no execution count).
# Neither `sparse` nor `features` is defined in the visible part of this
# notebook, so as written this raises NameError on a fresh kernel.
# Presumably `from scipy import sparse` and a feature matrix are expected --
# confirm, or remove this cell.
sparse.csr_matrix(features, dtype='i1')