# Project Summary

# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# import data label
df_label = pd.read_csv("data/df_label.csv")
df_label.head()

Unnamed: 0,id,label
0,25478619,1.0
1,25871453,0.0
2,43982508,0.0
3,4452511,0.0
4,6271969,0.0


In [3]:
df_label.shape

(28959, 2)

In [4]:
df_label.describe()

# 18959 with labels. Can be used for train + test
# 10000 are heldout.

Unnamed: 0,id,label
count,28959.0,18959.0
mean,22614410.0,0.152012
std,13032510.0,0.359042
min,863.0,0.0
25%,11449670.0,0.0
50%,22660610.0,0.0
75%,33854150.0,0.0
max,45197180.0,1.0


# Feature Engineering

## Graph Feature Engineering

In [5]:
edge = pd.read_csv("data/dat_edge.txt", delimiter="\t")
edge.shape

(31255329, 3)

In [6]:
# I am limiting to 10K edges because of limited compute capacity. Ideally, we should use all edge information
edge = edge[:100000]
edge.head()

Unnamed: 0,from_id,to_id,info
0,10000019,23264041,2017-12:1_11
1,1000010,29753962,2017-12:1_27
2,10000189,15381095,2017-12:1_5
3,10000223,36347822,2017-11:1_24
4,1000023,17857485,2018-01:1_11


Note: the info column is a combination of three variables Date:num_weight. 
Next, I seperate them into different columns

In [7]:
# seperate the info column 
from_id = []
to_id = []
dates = []
nums = []
weights = []

edge['info'] = edge['info'].str.replace(':','_')

for i, row in edge.iterrows():
    for t in row.info.split(","):
        from_id.append(row.from_id)
        to_id.append(row.to_id)

        date, num, weight = t.split("_")

        dates.append(date)
        nums.append(num)
        weights.append(weight)

In [8]:
graph = pd.DataFrame({"from_id": from_id, "to_id": to_id, "date": dates, "num":nums, "weight":weights})
graph.to_csv("data/graph", index=False)
graph.head()

Unnamed: 0,from_id,to_id,date,num,weight
0,10000019,23264041,2017-12,1,11
1,1000010,29753962,2017-12,1,27
2,10000189,15381095,2017-12,1,5
3,10000223,36347822,2017-11,1,24
4,1000023,17857485,2018-01,1,11


In [9]:
# cast weight and num into int
graph['weight'] = graph['weight'].astype(int)
graph['num'] = graph['num'].astype(int)
graph.describe()

Unnamed: 0,from_id,to_id,num,weight
count,106673.0,106673.0,106673.0,106673.0
mean,9770217.0,22625700.0,1.629297,94.946088
std,3036584.0,13030020.0,3.44342,391.794497
min,10072.0,214.0,0.0,0.0
25%,10258880.0,11348670.0,1.0,15.0
50%,10707800.0,22663100.0,1.0,31.0
75%,11151340.0,33885020.0,1.0,72.0
max,11588490.0,45197140.0,206.0,35984.0


In [10]:
graph.weight = graph.weight + 1
graph.num = graph.num + 1
a = graph.groupby(["from_id", "to_id"]).weight.max()
a = a.reset_index()
a.head()

Unnamed: 0,from_id,to_id,weight
0,10072,4575965,10
1,10119,3615703,33
2,10149,13533889,19
3,10152,11902625,16
4,10152,29295999,56


### Node feature 1: embedding (deepwalk)

In [11]:
# prepare data for deepwalk
# ref: https://github.com/phanein/deepwalk
# format: edgelist 
# from_id to_id
# graph_for_emb.txt

with open("data/graph_for_emb.txt", "w") as f:
    for i, row in a.iterrows(): 
        #f.write("%d %d %d\n" % (row.from_id, row.to_id, row.weight))
        f.write("%d %d\n" % (row.from_id, row.to_id))

use deepwalk to generate embedding (terminal in anaconda environment)

```
deepwalk --format edgelist --input data/graph_for_emb.txt --number-walks 80 --representation-size 128 --walk-length 40 --window-size 10 --output data/deepwalk_128.emb
```

In [12]:
# save feature
graph_emb = pd.read_csv("data/deepwalk_128.emb", delimiter=" ", names=["id"] + ["dp_%d" % i for i in range(128)], skiprows=1)
graph_emb[graph_emb.id.isin(df_label.id)].to_csv("features/graph/deepwalk_128_filtered.emb", index=False)

### Node feature 2: pagerank

In [13]:
# use networkx to generate other features for node
import networkx as nx

In [14]:
graph = pd.read_csv("data/graph")
a = graph.groupby(["from_id", "to_id"]).weight.sum()
a = a.reset_index()
a.head()

Unnamed: 0,from_id,to_id,weight
0,10072,4575965,9
1,10119,3615703,32
2,10149,13533889,18
3,10152,11902625,15
4,10152,29295999,55


In [15]:
with open("data/graph_for_pagerank.txt", "w") as f:
    for i, row in a.iterrows(): 
        f.write("%d %d %d\n" % (row.from_id, row.to_id, row.weight))

In [16]:
# build graph for nx
G = nx.DiGraph()
with open("data/graph_for_pagerank.txt", "r") as f:
    for line in f:
        from_id, to_id, weight = line.strip().split()
        G.add_edge(int(from_id), int(to_id), weight=int(weight))

In [17]:
pr = nx.pagerank(G)

In [18]:
with open("data/edge/pagerank.pkl", "wb") as f:
    pickle.dump(pr, f)

### Node feature 3: centrality

In [19]:
dc = nx.algorithms.centrality.degree_centrality(G)

In [20]:
with open("data/edge/degree_centrality.pkl", "wb") as f:
    pickle.dump(dc, f)

### Node feature 4: hits (hub and authority)

In [21]:
h,a = nx.hits(G)

In [22]:
with open("data/edge/h.pkl", "wb") as f:
    pickle.dump(h, f)

with open("data/edge/a.pkl", "wb") as f:
    pickle.dump(a, f)

### Node feature 5: in degree, out degree
Lastly, I compute indegree and outdegree

In [23]:
# for graph, remove nodes not in the df_label
graph_filter = graph[graph.from_id.isin(df_label.id) | graph.to_id.isin(df_label.id)]
graph_filter.to_csv("data/graph_filter.csv", index=False)

In [24]:
graph_filter.date = pd.to_datetime(graph_filter.date)
graph_filter.head()

Unnamed: 0,from_id,to_id,date,num,weight
128,1000221,21681306,2017-11-01,2,34
198,10003349,20429275,2017-11-01,1,40
221,10003968,31945112,2017-12-01,3,266
222,10003968,7210426,2017-11-01,6,230
518,10008418,11675834,2017-11-01,4,370


In [25]:
# basic indegree and outdegree
out_degree = graph_filter.groupby("from_id").to_id.count().reset_index().rename(columns={"from_id":"id", "to_id": "out_degree"})
in_degree = graph_filter.groupby("to_id").from_id.count().reset_index().rename(columns={"to_id": "id", "from_id":"in_degree"})

In [26]:
# indegree and outdegree weighted by num
out_num = graph_filter.groupby("from_id").num.sum().reset_index().rename(columns={"from_id":"id", "num": "out_sum"})
in_num = graph_filter.groupby("to_id").num.sum().reset_index().rename(columns={"to_id": "id", "num":"in_sum"})

In [27]:
# indegree and outdegreee weighted by weight
in_weight = graph_filter.groupby("to_id").weight.sum().reset_index().rename(columns={"to_id": "id", "weight":"in_weight"})
out_weight = graph_filter.groupby("from_id").weight.sum().reset_index().rename(columns={"from_id":"id", "weight": "out_weight"})

In [28]:
# indegree and outdegree weighted by unique node
in_unique = graph_filter.groupby("to_id").from_id.nunique().reset_index().rename(columns={"to_id":"id", "from_id": "in_nunique"})
out_unique = graph_filter.groupby("from_id").to_id.nunique().reset_index().rename(columns={"from_id":"id", "to_id": "out_nunique"})

In [29]:
graph_info = df_label[['id']]
graph_info = graph_info.merge(out_degree, on="id")
graph_info = graph_info.merge(in_degree, on="id")

graph_info = graph_info.merge(out_num, on="id")
graph_info = graph_info.merge(in_num, on="id")

graph_info = graph_info.merge(out_weight, on="id")
graph_info = graph_info.merge(in_weight, on="id")

graph_info = graph_info.merge(out_unique, on="id")
graph_info = graph_info.merge(in_unique, on="id")

In [30]:
# build dict and get the number of connected node to a certain node
from_dict = {}
for i in df_label.id:
    from_dict[i] = set(graph_filter[graph_filter.from_id == i].to_id.values)
    
to_dict = {}
for i in df_label.id:
    to_dict[i] = set(graph_filter[graph_filter.to_id == i].from_id.values)
    
common_id = {}
for i in df_label.id:
    common_id[i] = from_dict[i] & to_dict[i]

graph_info['common_num'] = graph_info.id.apply(lambda x: len(common_id[x]))
graph_info.to_csv("features/graph/graph_info.csv", index=False)

## Categorical Features

Next, I do feature engineering for the three categorical features.

### Categorical feature 1: risk

Risk feature list five risk-related behaviors (a-e).

In [31]:
risk = pd.read_csv("data/dat_risk.txt", delimiter="\t")
risk.head()

Unnamed: 0,id,a_cnt,b_cnt,c_cnt,d_cnt,e_cnt
0,16,1,1,0,0,0
1,29,1,1,0,0,0
2,62,1,1,0,0,0
3,63,1,1,0,0,0
4,87,1,1,0,0,0


For this feature, I just tranform count into percentage, so that the pcts for each id add up to 1.

In [32]:
risk['total'] = risk[["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt"]].sum(axis=1)
for c in ["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt"]:
    risk[c[:1] + "_pct"] = risk[c] / risk.total
risk.head()

Unnamed: 0,id,a_cnt,b_cnt,c_cnt,d_cnt,e_cnt,total,a_pct,b_pct,c_pct,d_pct,e_pct
0,16,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
1,29,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
2,62,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
3,63,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
4,87,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0


In [33]:
risk.to_csv("features/risk/risk.csv", index=False)

### Categorical feature 2:  apps
This feature contains all the apps the borrower had installed on the phone.

In [34]:
app = pd.read_csv("data/dat_app.txt", delimiter="\t", header=None, names=["id", "app_list"])
app.head()

Unnamed: 0,id,app_list
0,155,"234884,404900,322191,353350,365633,372053,3580..."
1,295,"374989,224028,233710,43891,43861,245685,238780..."
2,390,"365633,247448,242120,11285,208393,199718,38411..."
3,665,"450490,62347,188342,444688,347009,416584,33362..."
4,725,"374989,367185,407398,442665,391809,414377,1296..."


In [35]:
app.shape

(2759440, 2)

In [36]:
# filter df to include only people in the dataset
app_filter = app[app.id.isin(df_label.id)]
app_filter.shape

(8716, 2)

**feature engineering based on app frequency**

In [37]:
# replace "," to " "
app_filter['app_list'] = app_filter.app_list.apply(lambda x: " ".join(x.split(",")))
app_filter.head()

Unnamed: 0,id,app_list
305,26565,375584 138567 183407 138591 393410 11273 19098...
615,52560,107292 101827 265646 317321 317709 302935 3136...
747,64275,420303 138626 164604 138535 138511 108512 1384...
1312,110280,102307 87089 309400 294831 260802 384111 44512...
1540,127910,105635 107292 104921 431282 69587 70701 165880...


In [38]:
# get app counts
from collections import Counter

app_dict = Counter()

for i, row in app_filter.iterrows():
    app_dict += Counter(row.app_list.split(" "))

In [39]:
# incorporate app_count data as feature

app_filter['app_num'] = app_filter.app_list.apply(lambda x: len(x.split(" ")))
app_filter["app_freq_sum"] = app_filter.app_list.apply(lambda x: sum([app_dict[i] for i in x.split(" ")]))
app_filter['app_freq_mean'] = app_filter.app_freq_sum / app_filter.app_num
app_filter['app_freq_max'] = app_filter.app_list.apply(lambda x: max([app_dict[i] for i in x.split(" ")]))
app_filter['app_freq_min'] = app_filter.app_list.apply(lambda x: min([app_dict[i] for i in x.split(" ")]))
app_filter['app_freq_median'] = app_filter.app_list.apply(lambda x: np.median([app_dict[i] for i in x.split(" ")]))
app_filter['app_freq_var'] = app_filter.app_list.apply(lambda x: np.var([app_dict[i] for i in x.split(" ")]))
app_filter.head()

Unnamed: 0,id,app_list,app_num,app_freq_sum,app_freq_mean,app_freq_max,app_freq_min,app_freq_median,app_freq_var
305,26565,375584 138567 183407 138591 393410 11273 19098...,262,788842,3010.847328,8680,1,2170.0,6060709.0
615,52560,107292 101827 265646 317321 317709 302935 3136...,215,592857,2757.474419,8680,1,1724.0,6976492.0
747,64275,420303 138626 164604 138535 138511 108512 1384...,232,783921,3378.969828,8680,1,2181.0,5992619.0
1312,110280,102307 87089 309400 294831 260802 384111 44512...,223,668667,2998.506726,8680,1,2146.0,6832176.0
1540,127910,105635 107292 104921 431282 69587 70701 165880...,213,765371,3593.29108,8680,1,2479.0,5795815.0


In [40]:
# save frequency data to feature

app_filter[["id", "app_num", "app_freq_sum", "app_freq_mean", "app_freq_median", "app_freq_var"]].to_csv("features/app/app_info.csv", index=False)

**feature engineering based on app clustering (PCA)**

In [41]:
# select popular apps (top 4000)

a = pd.DataFrame({"app":list(app_dict.keys()), "count": list(app_dict.values())})
vocab = a.sort_values("count", ascending=False).head(4000).app.tolist()
vectorizer = CountVectorizer(vocabulary=vocab)
vector = vectorizer.fit_transform(app_filter.app_list)

In [42]:
# use PCA to reduce dimension to 16 

dim = 16
pca = PCA(n_components=dim)
pca_res = pca.fit_transform(vector.toarray())
app_pca = pd.DataFrame(pca_res, columns=["pca_%d" % i for i in range(dim)])
app_pca["id"] = app_filter.id.values
app_pca.head()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14,pca_15,id
0,8.04505,-2.396375,-0.949143,0.754789,-3.644863,3.034809,1.907565,-0.663493,3.19602,-0.83648,0.080516,0.781275,0.590063,-1.087342,-0.524868,0.103102,26565
1,-6.598469,-6.141521,-3.209506,-0.759967,-1.609088,-0.041314,1.142994,0.529841,-0.267484,1.103027,-0.173972,-0.026799,-0.078157,-0.192175,0.297714,-0.794623,52560
2,8.279521,-2.004904,-1.274391,-0.254639,-1.615846,-0.821964,0.331524,-1.911739,0.685686,0.205524,-0.604758,-1.226829,-0.898609,2.133924,1.354326,-0.36485,64275
3,-2.114944,6.768632,-1.057747,-2.082287,3.471473,0.779111,3.434587,1.0976,0.620684,-1.594046,-0.008621,-0.329222,1.027825,0.914006,0.558195,-0.233642,110280
4,8.205612,-1.830258,-1.265317,-0.07825,-1.384911,-0.699451,0.102365,-2.134963,0.818034,0.148375,-0.742956,-1.385353,-1.099959,1.595822,1.389499,-1.249613,127910


In [43]:
app_pca.to_csv("features/app/app_pca_16.csv", index=False)

### Categorical feature 3:  type

This feature contains the type of company lends money. The pattern in the symbol columns is [L1type_L2type]
L1type contains large category.
L2type contains sub-category within the L1type.

In [44]:
dat_type = pd.read_csv("data/dat_symbol.txt", delimiter="\t")
dat_type.rename(columns = {"symbol":"type"}, inplace = True)
dat_type.head()

Unnamed: 0,id,type
0,2,其他公司类_其他
1,7,其他公司类_其他
2,40,其他公司类_其他
3,53,"互金公司_p2p,贷款类_其他"
4,60,其他公司类_其他


In [45]:
# Extract L1 type into
dat_type['cat_count'] = dat_type.type.apply(lambda x: len(x.split(","))) 
dat_type['l2'] = dat_type.type.apply(lambda x:" ".join([i for i in x.split(",")])) 
dat_type['l1'] = dat_type.type.apply(lambda x:" ".join([i.split("_")[0] for i in x.split(" ")]))
dat_type.head()

Unnamed: 0,id,type,cat_count,l2,l1
0,2,其他公司类_其他,1,其他公司类_其他,其他公司类
1,7,其他公司类_其他,1,其他公司类_其他,其他公司类
2,40,其他公司类_其他,1,其他公司类_其他,其他公司类
3,53,"互金公司_p2p,贷款类_其他",2,互金公司_p2p 贷款类_其他,互金公司
4,60,其他公司类_其他,1,其他公司类_其他,其他公司类


In [46]:
# use sklearn's CountVectorizer to generate dummies for L2 types
vectorizer = CountVectorizer()
a = vectorizer.fit_transform(dat_type.l2)
l2_type = pd.DataFrame(a.toarray(), columns=["l2_" + str(i) for i in range(44)])
l2_type.head()

Unnamed: 0,l2_0,l2_1,l2_2,l2_3,l2_4,l2_5,l2_6,l2_7,l2_8,l2_9,...,l2_34,l2_35,l2_36,l2_37,l2_38,l2_39,l2_40,l2_41,l2_42,l2_43
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# similarly, use CountVectorize to generate dummies for L1 types
b = vectorizer.fit_transform(dat_type.l1)
l1_type = pd.DataFrame(b.toarray(), columns=["l1_" + str(i) for i in range(24)])
l1_type.head()

Unnamed: 0,l1_0,l1_1,l1_2,l1_3,l1_4,l1_5,l1_6,l1_7,l1_8,l1_9,...,l1_14,l1_15,l1_16,l1_17,l1_18,l1_19,l1_20,l1_21,l1_22,l1_23
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
dat_type = dat_type.join(l1_type)
dat_type = dat_type.join(l2_type)
dat_type.drop(["type", "l1","l2"], axis=1, inplace=True)
dat_type.head()

Unnamed: 0,id,cat_count,l1_0,l1_1,l1_2,l1_3,l1_4,l1_5,l1_6,l1_7,...,l2_34,l2_35,l2_36,l2_37,l2_38,l2_39,l2_40,l2_41,l2_42,l2_43
0,2,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,40,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,53,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
dat_type.to_csv("features/type/type.csv", index=False)

## Feature Engineering based on 1-Degree Contacts Features

### helper function

Create a helper function to calculate 1-degree contact's features. Note that we can also extend to n-degree contacts, but the time complexity will be extremely large O(N-node**N-degree).

In [50]:
graph_filter.head()

Unnamed: 0,from_id,to_id,date,num,weight
128,1000221,21681306,2017-11-01,2,34
198,10003349,20429275,2017-11-01,1,40
221,10003968,31945112,2017-12-01,3,266
222,10003968,7210426,2017-11-01,6,230
518,10008418,11675834,2017-11-01,4,370


In [51]:
def feature_with_graph(graph_filter, other_df, feature_cols, to_dir, new_col_name, func, weight_type, ids=df_label.id):
    task_name = "%s%s_%s" % (new_col_name, weight_type, func)
    
    to_df = graph_filter.rename(columns={"to_id": "id"}).merge(other_df, on="id", how="left").drop("id", axis=1).rename(columns={"from_id":"id"})
    from_df = graph_filter.rename(columns={"from_id": "id"}).merge(other_df, on="id", how="left").drop("id", axis=1).rename(columns={"to_id":"id"})
    
    to_df = to_df.merge(to_df.groupby("id")["num", "weight"].sum().reset_index().rename(columns={"num":"num_sum_total", "weight":"weight_sum_total"}), on="id", how="left")
    from_df = from_df.merge(from_df.groupby("id")["num", "weight"].sum().reset_index().rename(columns={"num":"num_sum_total", "weight":"weight_sum_total"}), on="id", how="left")
    
    if weight_type == "_num":
        for f in feature_cols:
            to_df[f] = to_df[f] * to_df["num"]
            from_df[f] = from_df[f] * from_df["num"]
    elif weight_type == "_weight":
        for f in feature_cols:
            to_df[f] = to_df[f] * to_df["weight"]
            from_df[f] = from_df[f] * from_df["weight"]

    if weight_type in ["_num", "_weight"] and func == "mean":
        to_df[f] /= to_df[weight_type[1:] + "_sum_total"]
        from_df[f] /= from_df[weight_type[1:] + "_sum_total"]
        a = to_df.groupby("id")[feature_cols].agg("sum").reset_index()
        b = from_df.groupby("id")[feature_cols].agg("sum").reset_index()
    else:
        a = to_df.groupby("id")[feature_cols].agg(func).reset_index()
        b = from_df.groupby("id")[feature_cols].agg(func).reset_index()
    
    if new_col_name == "type":
        a['to_%s%s_count' % (new_col_name, weight_type)] = a[[c for c in a.columns if c != "id"]].sum(axis=1)
        b['from_%s%s_count' % (new_col_name, weight_type)] = b[[c for c in b.columns if c != "id"]].sum(axis=1)

    a.columns = ["id"] + ["to_%s%s_%s_%d" % (new_col_name, weight_type, func, i) for i in range(1, len(a.columns))]
    b.columns = ["id"] + ["from_%s%s_%s_%d" % (new_col_name, weight_type, func, i) for i in range(1, len(a.columns))]

    a[a.id.isin(ids)].to_csv("features/%s/to_%s%s_%s.csv" % (to_dir, new_col_name, weight_type, func), index=False)
    b[b.id.isin(ids)].to_csv("features/%s/from_%s%s_%s.csv" % (to_dir, new_col_name, weight_type, func), index=False)
    
    
    print(task_name + ' completed')

### 1-degree contact's non-graph features (risk, apps, type)

#### Risk

In [52]:
risk = pd.read_csv("features/risk/risk.csv")
risk_f = ["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt", "total"]
risk_pct_f = ["a_pct", "b_pct", "c_pct", "d_pct", "e_pct"]

In [53]:
risk_args_list = [
    (graph_filter, risk, risk_f, "risk_graph", "risk", "mean", ""),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "mean", "_num"),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "mean", "_weight"),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "sum", ""),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "sum", "_num"),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "sum", "_weight"),
    (graph_filter, risk, risk_pct_f, "risk_graph", "risk_pct", "mean", ""),
    (graph_filter, risk, risk_pct_f, "risk_graph", "risk_pct", "mean", "_num"),
    (graph_filter, risk, risk_pct_f, "risk_graph", "risk_pct", "mean", "_weight")
]

In [54]:
for arg in risk_args_list:
    feature_with_graph(*arg)

risk_mean completed
risk_num_mean completed
risk_weight_mean completed
risk_sum completed
risk_num_sum completed
risk_weight_sum completed
risk_pct_mean completed
risk_pct_num_mean completed
risk_pct_weight_mean completed


#### Apps

In [55]:
app_pca = pd.read_csv("features/app/app_pca_16.csv")

In [56]:
app_args_list = [
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", ""),
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", "_num"),
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", "_weight")
]

In [57]:
for arg in app_args_list:
    feature_with_graph(*arg)

app_pca_mean completed
app_pca_num_mean completed
app_pca_weight_mean completed


#### Type

In [58]:
dat_type = pd.read_csv("features/type/type.csv")
lev_f = []
for f in dat_type.columns:
# here we are using only l1 level types
    if f[:2] == "l1":
        lev_f.append(f)

In [59]:
type_args_list = [
    (graph_filter, dat_type, lev_f, "type_graph", "type", "sum", ""),
    (graph_filter, dat_type, lev_f, "type_graph", "type", "sum", "_num"),
    (graph_filter, dat_type, lev_f, "type_graph", "type", "sum", "_weight"),
    (graph_filter, dat_type, lev_f, "type_graph", "type", "mean", ""),
    (graph_filter, dat_type, lev_f, "type_graph", "type", "mean", "_num"),
    (graph_filter, dat_type, lev_f, "type_graph", "type", "mean", "_weight"),
]

In [60]:
for arg in type_args_list:
    feature_with_graph(*arg)

type_sum completed
type_num_sum completed
type_weight_sum completed
type_mean completed
type_num_mean completed
type_weight_mean completed


### 1-degree contact's graph features 

#### pagerank

In [61]:
graph = pd.read_csv("data/graph")

graph_filter_ids = set(graph_filter.to_id.tolist()) | set(graph_filter.from_id.tolist())

In [62]:
with open("data/edge/pagerank.pkl", "rb") as f:
    pr = pickle.load(f)

pr_df = pd.DataFrame({"id": list(pr.keys()), "pr":list(pr.values())})
pr_df[pr_df.id.isin(df_label.id)].to_csv("features/graph/pagerank.csv", index=False)

In [63]:
pr_f = ['pr']
pr_args_list = [
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "", ),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "_weight", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "_num", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "_weight", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "_num", graph_filter_ids),
]

In [64]:
for arg in pr_args_list:
    feature_with_graph(*arg)

pagerank_sum completed
pagerank_weight_sum completed
pagerank_num_sum completed
pagerank_mean completed
pagerank_weight_mean completed
pagerank_num_mean completed


#### centrality

In [65]:
with open("data/edge/degree_centrality.pkl", "rb") as f:
    a = pickle.load(f)

a_df = pd.DataFrame({"id": list(a.keys()), "a":list(a.values())})
a_df[a_df.id.isin(df_label.id)].to_csv("features/graph/dc.csv", index=False)

In [66]:
f = ['a']
dc_args_list = [
    (graph_filter, a_df, f, "graph", "dc", "sum", ""),
    (graph_filter, a_df, f, "graph", "dc", "sum", "_weight"),
    (graph_filter, a_df, f, "graph", "dc", "sum", "_num"),
    (graph_filter, a_df, f, "graph", "dc", "mean", ""),
    (graph_filter, a_df, f, "graph", "dc", "mean", "_weight"),
    (graph_filter, a_df, f, "graph", "dc", "mean", "_num"),
]

In [67]:
for arg in dc_args_list:
    feature_with_graph(*arg)

dc_sum completed
dc_weight_sum completed
dc_num_sum completed
dc_mean completed
dc_weight_mean completed
dc_num_mean completed


#### hits

In [68]:
with open("data/edge/a.pkl", "rb") as f:
    a = pickle.load(f)

a_df = pd.DataFrame({"id": list(a.keys()), "a":list(a.values())})

with open("data/edge/h.pkl", "rb") as f:
    h = pickle.load(f)

h_df = pd.DataFrame({"id": list(a.keys()), "h":list(h.values())})


hits = a_df.merge(h_df, on="id")
hits[a_df.id.isin(df_label.id)].to_csv("features/graph/hits.csv", index=False)

In [69]:
hits.head()

Unnamed: 0,id,a,h
0,10072,5.088252e-241,0.0
1,4575965,0.0,0.0
2,10119,0.0,0.0
3,3615703,0.0,0.0
4,10149,0.0,0.0


In [70]:
hits_f = ['a', "h"]
hits_args_list = [
    (graph_filter, hits, hits_f, "graph", "hits", "sum", ""),
    (graph_filter, hits, hits_f, "graph", "hits", "sum", "_weight"),
    (graph_filter, hits, hits_f, "graph", "hits", "sum", "_num"),
    (graph_filter, hits, hits_f, "graph", "hits", "mean", ""),
    (graph_filter, hits, hits_f, "graph", "hits", "mean", "_weight"),
    (graph_filter, hits, hits_f, "graph", "hits", "mean", "_num"),
]

In [71]:
for arg in hits_args_list:
    feature_with_graph(*arg)

hits_sum completed
hits_weight_sum completed
hits_num_sum completed
hits_mean completed
hits_weight_mean completed
hits_num_mean completed
