In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score
import pickle
from stellargraph import StellarGraph, IndexedArray
from stellargraph.data import UniformRandomMetaPathWalk
from p_tqdm import p_umap
from gensim.models import Word2Vec

In [2]:
import os
os.chdir('..')
!pwd

/datasets/home/home-00/10/410/rcgonzal/DSC180Malware/m2v-adversarial-hindroid


In [3]:
miniset_folder = os.path.join('data', 'out', 'miniset')

In [4]:
# create/load mini dataset
# all_apps = pd.read_csv('data/out/all-apps/all_apps.csv')
# popular_apps = all_apps[all_apps.category=='popular-apps']
# malware_sample = all_apps[all_apps.category=='malware'].sample(popular_apps.shape[0])
# miniset = pd.concat([popular_apps, malware_sample])
# miniset.to_csv(os.path.join(miniset_folder, "app_list.csv"), index=False)
miniset = pd.read_csv(os.path.join(miniset_folder, "app_list.csv"), index_col='app')
miniset

Unnamed: 0_level_0,app_dir,category
app,Unnamed: 1_level_1,Unnamed: 2_level_1
net.updategames.granny,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.gameloft.android.ANMP.GloftA8HM,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.devsisters.gb,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.huobi.cn,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.gretech.gomplayerko,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
...,...,...
11d425602d3c8311d1e18df35db1daa3,/teams/DSC180A_FA20_A00/a04malware/malware/RuM...,malware
864c4f8c355949226dad9b05ce530aad,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
83e8790a3318d76e3eb8150b4a235205,/teams/DSC180A_FA20_A00/a04malware/malware/Min...,malware
627603dd166e05c2cd500f71ab49609e,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware


After running the ETL and generating features for this dataset, measure the baseline performance.

In [5]:
baseline_features = pd.read_csv(os.path.join(miniset_folder, "features.csv"), index_col='app')
baseline_features['label'] = (miniset.category=='malware').astype(int)
X, y = baseline_features.loc[:,'0':'127'], baseline_features.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [6]:
model = RandomForestClassifier(n_estimators=50, max_depth=3,n_jobs=-1)  # probably overfit
model.fit(X_train, y_train)

print(classification_report(model.predict(X_train), y_train))
print(classification_report(model.predict(X_test), y_test))
f1_score(model.predict(X_test), y_test)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       142
           1       1.00      0.96      0.98       182

    accuracy                           0.98       324
   macro avg       0.98      0.98      0.98       324
weighted avg       0.98      0.98      0.98       324

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       169
           1       0.97      0.94      0.95       155

    accuracy                           0.96       324
   macro avg       0.96      0.96      0.96       324
weighted avg       0.96      0.96      0.96       324



0.9539473684210527

In [7]:
app_map = pd.read_csv(os.path.join(miniset_folder, "app_map.csv"), index_col='uid')
app_map['malware'] = app_map.app.map(miniset.category=='malware')
app_map

Unnamed: 0_level_0,app,malware
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
app0,net.updategames.granny,False
app1,com.gameloft.android.ANMP.GloftA8HM,False
app2,com.devsisters.gb,False
app3,com.huobi.cn,False
app4,com.gretech.gomplayerko,False
...,...,...
app643,11d425602d3c8311d1e18df35db1daa3,True
app644,864c4f8c355949226dad9b05ce530aad,True
app645,83e8790a3318d76e3eb8150b4a235205,True
app646,627603dd166e05c2cd500f71ab49609e,True


In [8]:
edges = pd.read_csv(os.path.join(miniset_folder, "edges.csv")).drop_duplicates()
edges

Unnamed: 0,source,target
0,app0,api0
1,app0,api1
2,app0,api2
3,app0,api3
4,app0,api4
...,...,...
29509951,package342559,api3312270
29509952,package342559,api3312271
29509953,package342559,api3312272
29509954,package342559,api3312273


In [9]:
app_api_edges = edges[edges.source.str.startswith('app')]#.set_index('source')
app_api_edges

Unnamed: 0,source,target
0,app0,api0
1,app0,api1
2,app0,api2
3,app0,api3
4,app0,api4
...,...,...
11506261,app647,api3135329
11506262,app647,api3135330
11506263,app647,api1888
11506264,app647,api3135336


In [10]:
# make pool of apis only used by benign apps
malignant_app_apis = app_api_edges[app_api_edges.source.map(app_map.malware)]
benign_apis = app_api_edges.drop(malignant_app_apis.index).target.unique()
print(benign_apis.size)
benign_apis

3127432


array(['api0', 'api1', 'api2', ..., 'api3127429', 'api3127430',
       'api3127431'], dtype=object)

In [11]:
num_samples = 10000
new_edges = []
for app_uid in baseline_features.uid[X_test.index]: # for each malignant app
    new_edges.append(
        pd.DataFrame().assign(
            source=[app_uid]*num_samples,
            target=pd.Series(benign_apis).sample(num_samples).reset_index(drop=True)
        )
    )
new_edges= pd.concat([
    edges, 
    pd.concat(new_edges, ignore_index=True)
], ignore_index=True)
new_edges

Unnamed: 0,source,target
0,app0,api0
1,app0,api1
2,app0,api2
3,app0,api3
4,app0,api4
...,...,...
32749951,app619,api2474052
32749952,app619,api1892200
32749953,app619,api1902628
32749954,app619,api690136


In [12]:
with open(os.path.join(miniset_folder,'graph.pkl'), 'rb') as file:
    g = pickle.load(file)
with open(os.path.join(miniset_folder,'graph.pkl'), 'wb') as file:
    pickle.dump(g, file)

In [13]:
nodes = {}
for node_type in ['app', 'api', 'method', 'package']:
    nodes[node_type] = IndexedArray(index=g.nodes()[g.nodes().str.startswith(node_type)])

In [14]:
g = StellarGraph(nodes, new_edges.drop_duplicates())

In [52]:
walk_args = {
    "nprocs": 8,
    "length": 60,
    "n": 3,
    "metapaths": [
        ["app", "api", "app"],
#         ["app", "api", "method", "api", "app"],
#         ["app", "api", "package", "api", "app"],
#         ["app", "api", "package", "api", "method", "api", "app"],
#         ["app", "api", "method", "api", "package", "api", "app"]
    ]
}
w2v_args = {
    "size": 128,
    "window": 5,
    "min_count": 0,
    "sg": 1,
    "workers": 16,
    "iter": 5
}

In [53]:
# random walk on all apps, save to metapath_walk.json
print('Performing random walks')
rw = UniformRandomMetaPathWalk(g)
app_nodes = list(g.nodes()[g.nodes().str.contains('app')])

def run_walks(metapath):
    return rw.run(app_nodes, n=1, length=walk_args['length'], metapaths=[metapath])

metapaths = [walk_args['metapaths'][i%len(walk_args['metapaths'])] for i in range(len(walk_args['metapaths'])*walk_args['n'])]
metapath_walks = np.concatenate(p_umap(run_walks, metapaths, num_cpus=walk_args['nprocs'])).tolist()
# with open(metapath_walk_outpath, 'w') as file:
#     json.dump(metapath_walks, file)

Performing random walks


  0%|          | 0/3 [00:00<?, ?it/s]

In [54]:
print('Running Word2vec')
w2v = Word2Vec(metapath_walks, **w2v_args)

features = pd.DataFrame(w2v.wv.vectors)
features['uid'] = w2v.wv.index2word
features['app'] = features['uid'].map(
    pd.read_csv(os.path.join(miniset_folder, 'app_map.csv'), index_col='uid').app
)
features = features[features.uid.str.contains('app')].set_index('app')
features

Running Word2vec


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,uid
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
com.vimage.android,-0.434181,0.904495,0.221625,0.453950,-0.037660,0.251947,0.213413,-0.265110,0.166257,0.080125,...,0.401366,0.094421,-0.019879,0.490105,0.411732,0.133142,-0.438637,1.482454,-0.241698,app104
com.zplayworld.popstar,-0.321782,0.658708,0.217133,0.457418,-0.384150,0.581817,0.313925,-0.821387,0.151057,0.560012,...,0.073903,0.208779,-0.354540,0.272968,0.624319,0.039289,0.012747,1.377997,-0.108074,app22
com.lulu.lulubox,-0.304102,0.723962,-0.040271,0.447959,-0.288282,0.596330,0.488771,-0.655010,0.148430,0.552671,...,0.030009,0.225452,-0.359683,0.336802,0.362210,0.038432,0.030904,1.189019,0.150337,app58
com.marker.samsara,-0.254549,0.684685,0.120190,0.418753,-0.261609,0.399931,0.135643,-0.614885,0.026061,0.407189,...,0.230456,0.154173,-0.579432,0.213709,0.793994,0.337976,-0.218079,1.438636,-0.101036,app247
com.qihoo.security,-0.200034,0.678715,0.159953,0.387708,-0.260823,0.499018,0.474530,-0.798367,0.163531,0.400919,...,0.094478,0.213551,-0.367226,0.336674,0.685901,0.318732,0.006208,1.278291,0.033621,app70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6d5f69facb4fb030fdc4402e80ec8bf4,-0.010568,0.023965,0.004698,0.013112,-0.013593,0.019785,0.012339,-0.027680,0.005966,0.011774,...,0.004195,0.015259,-0.010092,0.009737,0.024783,0.006813,0.002975,0.055303,0.000753,app383
947a81ff5fc75bc5d3120fcb2560b44a,-0.009337,0.032027,0.009047,0.021131,-0.012736,0.020324,0.012839,-0.026645,0.003341,0.014903,...,0.006778,0.015127,-0.014425,0.017712,0.027035,0.008259,-0.003955,0.069002,-0.002421,app420
b192c1bee97c5270f734bc59efad7cc4,-0.007492,0.026591,0.010206,0.014645,-0.017613,0.022259,0.020368,-0.029604,0.007487,0.012823,...,0.004809,0.011810,-0.011709,0.020079,0.030915,0.012302,0.003673,0.064613,-0.000503,app434
530f503ab61fcf1e83bc7c99aaf792d6,-0.015314,0.027821,0.004038,0.017082,-0.015717,0.021089,0.017045,-0.029893,0.005217,0.018604,...,0.008630,0.019734,-0.015124,0.010481,0.028630,0.011482,-0.005701,0.061374,0.001228,app436


In [55]:
features['label'] = (miniset.category=='malware').astype(int)
features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,uid,label
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
com.vimage.android,-0.434181,0.904495,0.221625,0.453950,-0.037660,0.251947,0.213413,-0.265110,0.166257,0.080125,...,0.094421,-0.019879,0.490105,0.411732,0.133142,-0.438637,1.482454,-0.241698,app104,0
com.zplayworld.popstar,-0.321782,0.658708,0.217133,0.457418,-0.384150,0.581817,0.313925,-0.821387,0.151057,0.560012,...,0.208779,-0.354540,0.272968,0.624319,0.039289,0.012747,1.377997,-0.108074,app22,0
com.lulu.lulubox,-0.304102,0.723962,-0.040271,0.447959,-0.288282,0.596330,0.488771,-0.655010,0.148430,0.552671,...,0.225452,-0.359683,0.336802,0.362210,0.038432,0.030904,1.189019,0.150337,app58,0
com.marker.samsara,-0.254549,0.684685,0.120190,0.418753,-0.261609,0.399931,0.135643,-0.614885,0.026061,0.407189,...,0.154173,-0.579432,0.213709,0.793994,0.337976,-0.218079,1.438636,-0.101036,app247,0
com.qihoo.security,-0.200034,0.678715,0.159953,0.387708,-0.260823,0.499018,0.474530,-0.798367,0.163531,0.400919,...,0.213551,-0.367226,0.336674,0.685901,0.318732,0.006208,1.278291,0.033621,app70,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6d5f69facb4fb030fdc4402e80ec8bf4,-0.010568,0.023965,0.004698,0.013112,-0.013593,0.019785,0.012339,-0.027680,0.005966,0.011774,...,0.015259,-0.010092,0.009737,0.024783,0.006813,0.002975,0.055303,0.000753,app383,1
947a81ff5fc75bc5d3120fcb2560b44a,-0.009337,0.032027,0.009047,0.021131,-0.012736,0.020324,0.012839,-0.026645,0.003341,0.014903,...,0.015127,-0.014425,0.017712,0.027035,0.008259,-0.003955,0.069002,-0.002421,app420,1
b192c1bee97c5270f734bc59efad7cc4,-0.007492,0.026591,0.010206,0.014645,-0.017613,0.022259,0.020368,-0.029604,0.007487,0.012823,...,0.011810,-0.011709,0.020079,0.030915,0.012302,0.003673,0.064613,-0.000503,app434,1
530f503ab61fcf1e83bc7c99aaf792d6,-0.015314,0.027821,0.004038,0.017082,-0.015717,0.021089,0.017045,-0.029893,0.005217,0.018604,...,0.019734,-0.015124,0.010481,0.028630,0.011482,-0.005701,0.061374,0.001228,app436,1


In [56]:
ptbd_X, ptbd_y = features.loc[:,0:127], features.label

In [61]:
ptbd_X_train, ptbd_y_train = ptbd_X.loc[X_train.index], ptbd_y.loc[X_train.index]
ptbd_X_test, ptbd_y_test = ptbd_X.loc[X_test.index], ptbd_y.loc[X_test.index]

In [63]:
model = RandomForestClassifier(n_estimators=50, max_depth=2, n_jobs=-1)  # probably overfit
model.fit(ptbd_X_train, ptbd_y_train) # train on normal data

print(classification_report(model.predict(ptbd_X_train), ptbd_y_train)) # performance with 
print(classification_report(model.predict(ptbd_X_test), ptbd_y_test))
f1_score(model.predict(ptbd_X_test), ptbd_y_test)

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       140
           1       0.99      0.94      0.96       184

    accuracy                           0.96       324
   macro avg       0.96      0.96      0.96       324
weighted avg       0.96      0.96      0.96       324

              precision    recall  f1-score   support

           0       0.98      0.70      0.82       243
           1       0.52      0.95      0.67        81

    accuracy                           0.77       324
   macro avg       0.75      0.83      0.74       324
weighted avg       0.86      0.77      0.78       324



0.6695652173913044