In [1]:
from dask.distributed import Client, LocalCluster, performance_report
import dask.dataframe as dd
import os
import pandas as pd
import numpy as np
import pickle
import json
from stellargraph import StellarGraph, IndexedArray
from stellargraph.data import UniformRandomMetaPathWalk
from gensim.models import Word2Vec
from p_tqdm import p_umap, p_imap, p_map
from tqdm import tqdm
from scipy import sparse
from itertools import combinations, product
from functools import partial
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.utils.extmath import cartesian
import csv

In [2]:
os.chdir('/home/rcgonzal/DSC180Malware/m2v-adversarial-hindroid/')
!pwd

/home/rcgonzal/DSC180Malware/m2v-adversarial-hindroid


In [3]:
from src.data.hindroid import *

%load_ext autoreload
%autoreload 2

In [4]:
outfolder = "data/out/miniset-train"

In [19]:
edges_path = os.path.join(outfolder, 'edges.csv')
app_map_path = os.path.join(outfolder, 'app_map.csv')
api_map_path = os.path.join(outfolder, 'api_map.csv')

apis = pd.read_csv(api_map_path, index_col='api').uid.str.replace('api', '').astype(int).values
apps = pd.read_csv(app_map_path, index_col='app').uid.str.replace('app', '').astype(int).values

edges = dd.read_csv(edges_path, dtype=str)
edges['target'] = edges.target.str.replace('api', '').astype(int)
app_api_edges = edges[edges.source.str.startswith('app')]

num_apps = apps.size
num_apis = apis.size

# A matrix
print("Constructing A matrix...")
app_api_edges['source'] = app_api_edges.source.str.replace('app', '').astype(int)

mlb = MultiLabelBinarizer(sparse_output=True)
mlb.fit([(api,) for api in apis])

A_mat = mlb.transform(app_api_edges.groupby('source').target.unique().compute().sort_index())
sparse.save_npz(os.path.join(outfolder, 'hindroid', 'A_mat.npz'), A_mat)

FileNotFoundError: [Errno 2] No such file or directory: 'data/out/miniset-train/api_map.csv'

In [20]:
# B Matrix
print("Constructing B matrix...")
api_method_edges = edges[edges.source.str.startswith('method')]
api_sets = api_method_edges.groupby('source').target.unique()
B_edges = list(pd.Series(apis).apply(lambda x: (x,x)))

for combos in p_imap(partial(combinations, r=2), api_sets.compute()):
    B_edges.extend(combos)

B_mat = mlb.transform(pd.DataFrame(B_edges).groupby(0)[1].unique().sort_index())
B_mat

Constructing B matrix...


  0%|          | 0/2843658 [00:00<?, ?it/s]

<2547149x2547149 sparse matrix of type '<class 'numpy.int64'>'
	with 49501952 stored elements in Compressed Sparse Row format>

In [5]:
for api_list in tqdm(api_sets.compute()):
    P_edges.update(prep_edges(api_list))

P_mat = mlb.transform(pd.DataFrame(P_edges).groupby(0)[1].unique().sort_index())
P_mat

NameError: name 'api_sets' is not defined

In [10]:
%time build_matrices('data/out/test-sample')

Dask Cluster: LocalCluster('tcp://127.0.0.1:42591', workers=4, threads=8, memory=68.72 GB)
Dashboard port: 8787


100%|██████████| 16790/16790 [00:00<00:00, 300520.04it/s]

Constructing A matrix...
Constructing B matrix...



 52%|█████▏    | 150/286 [00:00<00:00, 1274.99it/s]

Constructing P matrix...


100%|██████████| 286/286 [00:00<00:00, 781.79it/s] 


CPU times: user 6.4 s, sys: 768 ms, total: 7.17 s
Wall time: 13.9 s


In [None]:
%time build_matrices(outfolder)

Dask Cluster: LocalCluster('tcp://127.0.0.1:46870', workers=4, threads=8, memory=68.72 GB)
Dashboard port: 8787
Constructing A matrix...
Constructing B matrix...


100%|██████████| 2843658/2843658 [00:11<00:00, 245104.59it/s]


Constructing P matrix...


 11%|█         | 27468/246035 [01:01<04:58, 732.23it/s] 

In [6]:
api_map_path = os.path.join(outfolder, 'api_map.csv')
apis = pd.read_csv(api_map_path, index_col='api').uid.str.replace('api', '').astype(int).values

In [13]:
def parse_P_edges(api_list, P_edges_file):
    P_edges_writer = csv.writer(P_edges_file)
    if len(api_list) >= 15000: # if combinations too large for memory
        temp_edges = []
        for idx1 in api_list:
            for idx2 in api_list:
                temp_edges.append((idx1, idx2))
                if len(temp_edges) > 50000:
                    P_edges_writer.writerows(temp_edges)
                    temp_edges = []
        P_edges_writer.writerows(temp_edges)
    else:
        P_edges_writer.writerows(combinations(api_list, r=2))

In [None]:
P_edges_filepath = os.path.join('data', 'temp', 'P_edges.csv')

pd.DataFrame([(api, api) for api in apis], columns=['source', 'target']).to_csv(P_edges_filepath, index=False)

with open(P_edges_filepath, 'a') as P_edges_file:
    p_umap(partial(parse_P_edges, P_edges_file=P_edges_file), pd.read_pickle('data/temp/package_api_sets.pkl'))
#     for api_list in tqdm():
#         parse_P_edges(api_list, P_edges_writer)

  0%|          | 0/246035 [00:00<?, ?it/s]

In [8]:
temp_edges = list(pd.Series(apis).apply(lambda x: (x,x)))
#     for combos in p_imap(prep_edges, pd.read_pickle('data/temp/package_api_sets.pkl')):
#         temp_edges.extend(combos)
for package, api_list in tqdm(pd.read_pickle('data/temp/package_api_sets.pkl').items()):
    if len(api_list) >= 15000: # if combinations too large for memory
        print(package, len(api_list))
        for idx1 in api_list:
            for idx2 in api_list:
                temp_edges.append((idx1, idx2))
    else:
        temp_edges.extend(combinations(api_list, r=2))
P_mat = mlb.transform(pd.DataFrame(temp_edges).groupby(0)[1].unique().sort_index())

27584it [00:32, 976.61it/s]  

package1269 33490


29891it [01:46, 280.88it/s]


KeyboardInterrupt: 

In [11]:
temp_edges = None

In [9]:
import sys
sys.getsizeof(temp_edges)

6540755792

In [10]:
import sys
sys.getsizeof((1,2))

56

In [12]:
package_api_sets = pd.read_pickle('data/temp/package_api_sets.pkl')
pkg_lens = package_api_sets.apply(len)
pkg_lens[pkg_lens>1000]

source
package0          1716
package101        3951
package102        1201
package1029      12128
package103        1314
                 ...  
package227284     7222
package227326     2003
package227601     1189
package227630     2287
package245813     1474
Name: target, Length: 254, dtype: int64

In [27]:
package_api_sets.items()

<zip at 0x7f274c9b51c0>

In [None]:
sys.getsizeof(list(combinations((i for i in range(174_318)), r=2)))

In [22]:
pkg_lens[pkg_lens>10000].index

source
package1029       12128
package119        14488
package1269       33490
package1294       25454
package133        10883
package139        17335
package141621     38315
package227134     27881
package2276       11418
package2363      106893
package251       174318
package2816       18863
package286        10675
package338        10943
package580        16923
Name: target, dtype: int64

In [23]:
pd.read_csv(os.path.join(outfolder, 'package_map.csv'), index_col='uid').package[pkg_lens[pkg_lens>10000].index]

source
package1029                              Lcom/google/protobuf
package119                         Landroidx/appcompat/widget
package1269      Lcom/google/android/gms/internal/measurement
package1294      Lcom/google/android/gms/measurement/internal
package133                             Landroidx/fragment/app
package139                      Landroidx/recyclerview/widget
package141621                                              LX
package227134                                            Lcal
package2276                             Lcom/vungle/publisher
package2363                  Lcom/google/android/gms/internal
package251               Lcom/google/android/gms/internal/ads
package2816                                                Lo
package286        Lcom/google/android/gms/common/api/internal
package338             Lcom/google/android/gms/games/internal
package580                         Landroid/support/v7/widget
Name: package, dtype: object

In [None]:
%time make_models(outfolder)

In [45]:
pd.read_csv(os.path.join(outfolder, 'app_list.csv'))

Unnamed: 0,app,app_dir,category
0,635a57a483cda858f78f14386e76aab4,/teams/DSC180A_FA20_A00/a04malware/malware/Ban...,malware
1,912054e230f08f3747c2966d3f92944f,/teams/DSC180A_FA20_A00/a04malware/malware/You...,malware
2,29b4865171cdfad2a6f011614a1a8038,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
3,2a92f33fa4b5af4e61d70eb15a28030d,/teams/DSC180A_FA20_A00/a04malware/malware/You...,malware
4,35ecbda726e1e56467bf8b0e0dbe2c2a,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
5,com.sfeehha.bubble419,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps


In [33]:
popular_apps = pd.read_csv('data/out/all-apps/app_list.csv', dtype=str)
popular_apps = popular_apps[popular_apps.category=='popular-apps']
popular_apps.sample(1)

Unnamed: 0,app,app_dir,category,malware
5871,com.sega.PhantasyStarII,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps,0


In [25]:
apps = (
    pd.read_csv(
        os.path.join(outfolder, 'app_map.csv'),
        dtype=str,
        index_col='app'
    ).join(
        pd.read_csv(
            os.path.join(outfolder, 'app_list.csv'),
            dtype=str,
            index_col='app'
        )
    )
)
apps = apps.reset_index().set_index(apps.uid.str.replace('app', '').astype(int)).sort_index()
apps

Unnamed: 0_level_0,app,uid,app_dir,category
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,635a57a483cda858f78f14386e76aab4,app0,/teams/DSC180A_FA20_A00/a04malware/malware/Ban...,malware
1,912054e230f08f3747c2966d3f92944f,app1,/teams/DSC180A_FA20_A00/a04malware/malware/You...,malware
2,29b4865171cdfad2a6f011614a1a8038,app2,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
3,2a92f33fa4b5af4e61d70eb15a28030d,app3,/teams/DSC180A_FA20_A00/a04malware/malware/You...,malware
4,35ecbda726e1e56467bf8b0e0dbe2c2a,app4,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware


In [3]:
edges_path = os.path.join(outfolder, 'edges.csv')
api_map_path = os.path.join(outfolder, 'api_map.csv')
app_map_path = os.path.join(outfolder, 'app_map.csv')
apis = pd.read_csv(api_map_path, index_col='api').uid.str.replace('api', '').astype(int).values
apps = pd.read_csv(app_map_path, index_col='app').uid.str.replace('app', '').astype(int).values
edges_path

'../data/out/test-sample/edges.csv'

In [4]:
edges = dd.read_csv(edges_path, dtype=str)
edges['target'] = edges.target.str.replace('api', '').astype(int)
app_api_edges = edges[edges.source.str.startswith('app')]

num_apps = apps.size
num_apis = apis.size

app_api_edges['source'] = app_api_edges.source.str.replace('app', '').astype(int)

In [31]:
app_api_edges.groupby('source').target.unique().compute()
mlb = MultiLabelBinarizer(sparse_output=True)
mlb.fit([(api,) for api in apis])
A_mat = mlb.transform(app_api_edges.groupby('source').target.unique().compute().sort_index())
A_mat

<5x15875 sparse matrix of type '<class 'numpy.int64'>'
	with 18900 stored elements in Compressed Sparse Row format>

In [29]:
api_method_edges = edges[edges.source.str.startswith('method')]
api_sets = api_method_edges.groupby('source').target.unique()
B_edges = set(pd.Series(apis).apply(lambda x: (x,x)))

for combos in p_imap(partial(combinations, r=2), api_sets.compute()):
    B_edges.update(combos)
    B_edges.update([(combo[1], combo[0]) for combo in combos])
    
B_mat = mlb.transform(pd.DataFrame(B_edges).groupby(0)[1].unique().sort_index())
B_mat

  0%|          | 0/13958 [00:00<?, ?it/s]

<15875x15875 sparse matrix of type '<class 'numpy.int64'>'
	with 204206 stored elements in Compressed Sparse Row format>

In [27]:
api_package_edges = edges[edges.source.str.startswith('package')]
api_sets = api_package_edges.groupby('source').target.unique()
P_edges = set(pd.Series(apis).apply(lambda x: (x,x)))

P_edges.update([(api, api) for api in apis])

for combos in p_imap(partial(combinations, r=2), api_sets.compute()):
    P_edges.update(combos)

P_mat = mlb.transform(pd.DataFrame(P_edges).groupby(0)[1].unique().sort_index())
P_mat

  0%|          | 0/250 [00:00<?, ?it/s]

<15875x15875 sparse matrix of type '<class 'numpy.int64'>'
	with 2800987 stored elements in Compressed Sparse Row format>