Cross Validation 3

size =  121655

test size 4k+

epoch-5

apply sigmoid (prediction probabilities are logits)

groups-9

target_list = ['Camera', 'Location', 'Microphone', 'Contacts', 'Storage', 'Phone', 'SMS', 'Call_Log', 'Calendar']

threshold-tuning = yes

df_2k = df[(df['Rating'] >= 4.0) & (df['Maximum_Installs'] >= 5000)]

accuracy score : F1 score (micro) and ROC_AUC score

## 1. Setup

In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
print(tf.__version__)

2.8.0


In [3]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## 2. Load Dataset

In [4]:
## import libraries

import itertools
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from sklearn import preprocessing
%matplotlib inline

In [5]:
## uploading csv files on drive (to avoid uploading on colab in every session)

from google.colab import drive
drive.mount("/content/drive/")

## drive path
train_path = "/content/drive/MyDrive/MetadataCSV/data_121655/CV_df_train_3.csv"
val_path = "/content/drive/MyDrive/MetadataCSV/data_121655/CV_df_val_3.csv"
test_path = "/content/drive/MyDrive/MetadataCSV/test_dataset.csv"

Mounted at /content/drive/


In [6]:
df_train = pd.read_csv(train_path) 
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(97324, 23)
(24331, 23)
(4624, 23)


In [7]:
df_train.head(2)

Unnamed: 0,App_Name,App_Id,Category,Rating,Maximum_Installs,Editors_Choice,Description,Privacy_Policy,Sensors,Camera,...,Contacts,SMS,Storage,Phone,Get_Accounts,Call_Log,desc_length,Clean_Description,clean_desc_length,token_length
0,Science Kids Learning - Be Super Scientist!,com.sciencegames.ScienceKidsLearning,Educational,4.2,78871,False,Science is all about inspiring curiosity and d...,https://nutboltgames.com/privacy-policy,0,0,...,0,0,1,0,0,0,1846,science is all about inspiring curiosity and d...,1807,343
1,Phuti kismat se Bhagyashali,com.photi.bhgysl,Lifestyle,4.4,13681,False,This year has in its fold mixed fate to offer ...,https://labtest12.wixsite.com/privacypolicy,0,0,...,0,0,0,0,0,0,1513,this year has in its fold mixed fate to offer ...,1443,350


In [8]:
target_list = ['Camera', 'Location', 'Microphone', 'Contacts', 'Storage', 'Phone', 'SMS', 'Call_Log', 'Calendar']

In [9]:
# getting number of nonzeros in each column
df_train[target_list].astype(bool).sum(axis=0)

Camera        16328
Location      16248
Microphone     8750
Contacts      10476
Storage       48050
Phone         15031
SMS             331
Call_Log        223
Calendar       1788
dtype: int64

In [10]:
df_val[target_list].astype(bool).sum(axis=0)

Camera         4125
Location       4094
Microphone     2291
Contacts       2644
Storage       12208
Phone          3846
SMS              82
Call_Log         43
Calendar        453
dtype: int64

In [11]:
df_test[target_list].astype(bool).sum(axis=0)

Camera         745
Location       716
Microphone     436
Contacts       500
Storage       2402
Phone          652
SMS             11
Call_Log         6
Calendar        90
dtype: int64

## 3. Data Preprocess

#### 3.1 Clean Text

In [12]:
import nltk

nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]

    return " ".join(text)

In [14]:
df_train["Clean_Description"] = df_train["Clean_Description"].map(remove_stopwords)
df_val["Clean_Description"] = df_val["Clean_Description"].map(remove_stopwords)
df_test["Clean_Description"] = df_test["Clean_Description"].map(remove_stopwords)

In [15]:
import string

def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

In [16]:
df_train["Clean_Description"] = df_train["Clean_Description"].map(lambda x: remove_punct(x))
df_val["Clean_Description"] = df_val["Clean_Description"].map(lambda x: remove_punct(x))
df_test["Clean_Description"] = df_test["Clean_Description"].map(lambda x: remove_punct(x))

#### 3.2 Create Corpus

In [17]:
from keras.layers import *

In [18]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [19]:
from nltk.tokenize import word_tokenize
import nltk


def create_corpus_tk(df):
    corpus = []
    for text in df["Clean_Description"]:
        words = [word.lower() for word in word_tokenize(text)]
        corpus.append(words)
    return corpus

In [20]:
df_T = df_train.append(df_val)
df_T.reset_index(drop=True,inplace=True)

In [21]:
corpus = create_corpus_tk(df_T)

In [22]:
num_words = len(corpus)
print(num_words)

121655


In [23]:
corpus[0]

['science',
 'inspiring',
 'curiosity',
 'discovering',
 'world',
 'learn',
 'science',
 'fun',
 'educational',
 'games',
 'science',
 'games',
 'sure',
 'fulfill',
 'sense',
 'curiosity',
 'introduce',
 'basic',
 'science',
 'concepts',
 'approachable',
 'manner',
 'reinforcing',
 'comprehension',
 'engaging',
 'stimulating',
 'activities',
 'using',
 'interactive',
 'science',
 'games',
 'creates',
 'unique',
 'learning',
 'process',
 'combined',
 'conventional',
 'topics',
 'food',
 'chain',
 'learn',
 'food',
 'chain',
 'food',
 'web',
 'energy',
 'cycles',
 'transfers',
 'living',
 'organisms',
 'lesson',
 'learn',
 'simple',
 'food',
 'chains',
 'energy',
 'passed',
 'producers',
 'consumers',
 'decomposers',
 'eats',
 'what',
 'energy',
 'life',
 'needs',
 'survive',
 'come',
 'from',
 'learn',
 'producers',
 'primary',
 'consumers',
 'secondary',
 'tertiary',
 'consumers',
 'apex',
 'predators',
 'decomposers',
 'roles',
 'play',
 'food',
 'chains',
 'microorganisms',
 'ever',


##4. Train Validation Split

In [None]:
# split the data into a training set and a validation set

In [24]:
train_inputs = df_train["Clean_Description"]
validation_inputs = df_val["Clean_Description"]
train_labels = df_train[target_list]
validation_labels = df_val[target_list]

## 5. Tokenization

In [25]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_inputs)

In [26]:
MAX_SEQUENCE_LENGTH = 600

In [27]:
train_sequences = tokenizer.texts_to_sequences(train_inputs)

In [28]:
train_inputs

0        science inspiring curiosity discovering world ...
1        year fold mixed fate offer fellow human beings...
2        realpi provides best interesting pi calculatio...
3        regularly helps keep track repeating tasks fix...
4        harri live highly intuitive useful team schedu...
                               ...                        
97319    stylist wallpapers keyboard beautiful keyboard...
97320    consists precise easy understand meanings phys...
97321    may light guide dark places pendulum artifact ...
97322    leo browser made minimalism comfortable web br...
97323    boltnet vpn protects internet session helps ac...
Name: Clean_Description, Length: 97324, dtype: object

In [29]:
train_sequences

[[579,
  3114,
  6309,
  6332,
  92,
  31,
  579,
  96,
  466,
  71,
  579,
  71,
  416,
  4930,
  2215,
  6309,
  3294,
  389,
  579,
  1429,
  22529,
  1935,
  19322,
  3973,
  2414,
  8631,
  624,
  40,
  630,
  579,
  71,
  2828,
  192,
  75,
  535,
  3076,
  5985,
  382,
  427,
  3541,
  31,
  427,
  3541,
  427,
  338,
  995,
  4551,
  1793,
  1676,
  8035,
  1767,
  31,
  47,
  427,
  6685,
  995,
  4807,
  8595,
  4503,
  58159,
  13545,
  5864,
  995,
  81,
  423,
  6215,
  514,
  1369,
  31,
  8595,
  2317,
  4503,
  3171,
  20735,
  4503,
  1768,
  14010,
  58159,
  6310,
  67,
  427,
  6685,
  19732,
  493,
  6449,
  3280,
  136,
  12853,
  34084,
  34084,
  34084,
  19323,
  324,
  1851,
  2157,
  156,
  31,
  385,
  1676,
  3901,
  8035,
  18,
  10950,
  5060,
  13397,
  21838,
  36747,
  1209,
  356,
  291,
  2197,
  31,
  1676,
  539,
  1425,
  406,
  997,
  3,
  408,
  427,
  687,
  179,
  1209,
  41,
  406,
  997,
  398,
  9088,
  523,
  145,
  92,
  579,
  31,
  52,


In [30]:
train_padded = pad_sequences(
    train_sequences, maxlen=MAX_SEQUENCE_LENGTH, truncating="post", padding="post"
)

In [31]:
train_padded

array([[  579,  3114,  6309, ...,     0,     0,     0],
       [  277,  7589,  3403, ...,     0,     0,     0],
       [40146,   148,    15, ...,     0,     0,     0],
       ...,
       [  167,   296,   473, ...,     0,     0,     0],
       [ 4522,   602,   260, ...,     0,     0,     0],
       [  213,  3230,    82, ...,     0,     0,     0]], dtype=int32)

In [32]:
validation_sequences = tokenizer.texts_to_sequences(validation_inputs)
validation_padded = pad_sequences(
    validation_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [33]:
validation_padded

array([[ 7368,   138,   129, ...,     0,     0,     0],
       [33502,    17,  1645, ...,     0,     0,     0],
       [   47,  4170,     1, ...,     0,     0,     0],
       ...,
       [ 4293,     8,   993, ...,     0,     0,     0],
       [   58,  5745,    32, ...,     0,     0,     0],
       [74094,   365, 11940, ...,     0,     0,     0]], dtype=int32)

In [34]:
print(df_train.Clean_Description[0])
print(train_sequences[0])

science inspiring curiosity discovering world learn science fun educational games science games sure fulfill sense curiosity introduce basic science concepts approachable manner reinforcing comprehension engaging stimulating activities using interactive science games creates unique learning process combined conventional topics food chain learn food chain food web energy cycles transfers living organisms lesson learn simple food chains energy passed producers consumers decomposers eats what energy life needs survive come from learn producers primary consumers secondary tertiary consumers apex predators decomposers roles play food chains microorganisms ever wondered happens look microscope trillions trillions trillions microbes around earth maybe more learn types living micro organisms like bacteria viruses fungi algae protozoa human body health growth learn living things grow stay healthy use water food exercise provide human need stay healthy super scientist enter amazing world science

In [35]:
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))

Number of unique words: 196371


In [36]:
word_index

{'app': 1,
 'free': 2,
 'use': 3,
 'keyboard': 4,
 'wallpaper': 5,
 'theme': 6,
 'phone': 7,
 'application': 8,
 'features': 9,
 'new': 10,
 'wallpapers': 11,
 'get': 12,
 'time': 13,
 'also': 14,
 'best': 15,
 'screen': 16,
 'one': 17,
 'like': 18,
 'make': 19,
 'download': 20,
 'android': 21,
 'easy': 22,
 'launcher': 23,
 'us': 24,
 'english': 25,
 'live': 26,
 'mobile': 27,
 'find': 28,
 'share': 29,
 'help': 30,
 'learn': 31,
 'hd': 32,
 'set': 33,
 'love': 34,
 'device': 35,
 'please': 36,
 'video': 37,
 'want': 38,
 'apps': 39,
 'using': 40,
 'need': 41,
 'images': 42,
 'information': 43,
 'support': 44,
 'themes': 45,
 'home': 46,
 'simple': 47,
 'many': 48,
 'enjoy': 49,
 'friends': 50,
 'available': 51,
 'different': 52,
 'data': 53,
 'text': 54,
 'language': 55,
 'day': 56,
 'access': 57,
 'beautiful': 58,
 'game': 59,
 'easily': 60,
 'add': 61,
 'user': 62,
 'online': 63,
 'way': 64,
 'choose': 65,
 'you': 66,
 'play': 67,
 'create': 68,
 'words': 69,
 'every': 70,
 'games'

In [37]:
word_index["reason"]

2477

In [38]:
print(validation_sequences[0])

[7368, 138, 129, 732, 263, 69, 3495, 1017, 6157, 1503, 2041, 324, 92, 7368, 51, 21, 210, 712, 70, 3495, 150, 5478, 2972, 119, 108, 238, 916, 1783, 238, 1100, 35, 51, 144, 3818, 158, 636, 4388, 692, 2248, 308, 526, 3282, 1725, 910, 152, 247, 1038, 216, 1306, 821, 5227, 273, 1621, 4388, 14932, 845, 7368, 574, 324, 721, 1028, 3801, 3190, 1853, 56, 1076, 607, 4078, 2619, 329, 3495, 284, 7368, 4662, 69, 464, 1335, 88, 954, 1473, 1550, 1898, 31, 539, 31, 539, 31, 539, 1011, 90, 1497, 4448, 2458, 1695, 553, 566, 1894, 1648, 69, 759, 1002, 174, 2139, 1523, 3495, 615, 441, 241, 1844, 692, 2248, 577, 519, 1621, 4388, 7368, 254, 1348, 687, 55, 1776, 38, 219, 2654, 10, 1212, 1280, 1348, 234, 357, 3495, 25, 573, 12622, 10, 69, 247, 782, 425, 25, 3495, 1302, 5039, 498, 733, 250, 4931, 782, 289, 8045, 2145, 6584, 777, 14753, 614, 134, 4273, 110, 1911, 19, 255, 4005, 289, 939, 69, 56, 456, 430, 158, 13415, 6705, 5109, 56, 805, 351, 7274, 4718, 38, 31, 13, 4308, 253, 246, 57, 393, 1, 3495, 25, 269, 184

In [39]:
word_index["listen"]

392

## 6. Create the Embedding dictionary

In [40]:
embedding_dict = {}
with open("/content/drive/MyDrive/glove6B/glove.6B.300d.txt", "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
f.close()

In [41]:
embedding_dict

Output hidden; open in https://colab.research.google.com to view.

In [42]:
embedding_dim = 300

In [43]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i < num_words:
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i] = emb_vec

In [44]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.73247999, -0.078309  ,  0.018521  , ...,  0.21988   ,
        -0.13121   ,  0.043819  ],
       [-0.30414   , -0.37029999, -0.15881   , ..., -0.37830999,
        -0.42910001,  0.0030023 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.62230998, -0.40289   ,  0.11949   , ...,  0.93154001,
         0.22446001,  0.15016   ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [45]:
word_index["reason"]

2477

In [46]:
embedding_dict.get("reason")

array([ 1.3197e-01, -1.2591e-01,  4.3864e-02,  3.6321e-02,  9.6646e-02,
       -1.3829e-01,  3.8637e-01,  7.6962e-02, -1.1306e-01, -1.6083e+00,
        2.0062e-02, -5.2665e-02, -1.6597e-01,  1.2171e-01,  2.8945e-01,
       -1.7289e-01,  5.3035e-02, -2.7842e-01,  8.2376e-02, -1.1980e-02,
        3.7228e-02,  2.1867e-01,  1.5267e-01, -8.4361e-02, -3.1292e-01,
       -3.2093e-02,  2.0281e-01, -3.5910e-01,  1.6873e-02, -2.2996e-01,
        2.6044e-02,  3.5910e-01, -3.2431e-01, -5.4194e-01, -9.7742e-01,
        3.9198e-02, -1.7794e-01,  7.4200e-02, -4.1251e-02, -7.8917e-02,
        2.0646e-01, -6.6538e-02,  6.7401e-02,  1.4965e-01,  5.9107e-02,
       -3.7585e-02, -3.4672e-02,  5.5291e-02, -8.5636e-02,  9.1743e-02,
        4.9125e-01,  7.5606e-03, -3.0860e-01,  5.8902e-04, -8.6975e-02,
        3.9904e-01, -1.2695e-01,  2.2471e-01,  2.3658e-01,  3.0489e-01,
       -6.7363e-02,  3.5839e-01,  4.9703e-01,  4.1895e-01, -3.8494e-01,
       -2.6257e-01,  1.6049e-01, -1.0992e-01,  2.7477e-02,  1.49

In [47]:
(embedding_matrix[2363] == embedding_dict.get("reason")).all()

False

In [48]:
print(train_padded.shape)
print(train_labels.shape)

(97324, 600)
(97324, 9)


In [49]:
print(validation_padded.shape)
print(validation_labels.shape)

(24331, 600)
(24331, 9)


## 7. Build CNN Model

In [50]:
num_permissions = 9 #11
drop = 0.2

batch_size = 32
max_train_epochs = 300
validation_split = 6

early_stopping_patience = 16
early_stopping_delta = 0.02  # 2%

max_description_embeddings = 600
embedding_dim = 300  # +1 for flag

#downloaded_embedding_file = data_folder + "/word_embeddings/glove.6B.300d.txt"


conv_filters_num = 1024
conv_filters_sizes = [1, 2, 3]
dense_layers = [5000, 2500]
dropout = 0.2

heatmap_threshold = 0.49

In [51]:
# Function for class weights

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer


def generate_class_weights(class_series, multi_class=True, one_hot_encoded=False):
  """
  Method to generate class weights given a set of multi-class or multi-label labels, both one-hot-encoded or not.
  Some examples of different formats of class_series and their outputs are:
    - generate_class_weights(['mango', 'lemon', 'banana', 'mango'], multi_class=True, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 1.3333333333333333, 'mango': 0.6666666666666666}
    - generate_class_weights([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]], multi_class=True, one_hot_encoded=True)
    {0: 0.6666666666666666, 1: 1.3333333333333333, 2: 1.3333333333333333}
    - generate_class_weights([['mango', 'lemon'], ['mango'], ['lemon', 'banana'], ['lemon']], multi_class=False, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 0.4444444444444444, 'mango': 0.6666666666666666}
    - generate_class_weights([[0, 1, 1], [0, 0, 1], [1, 1, 0], [0, 1, 0]], multi_class=False, one_hot_encoded=True)
    {0: 1.3333333333333333, 1: 0.4444444444444444, 2: 0.6666666666666666}
  The output is a dictionary in the format { class_label: class_weight }. In case the input is one hot encoded, the class_label would be index
  of appareance of the label when the dataset was processed. 
  In multi_class this is np.unique(class_series) and in multi-label np.unique(np.concatenate(class_series)).
  Author: Angel Igareta (angel@igareta.com)
  """
  if multi_class:
    # If class is one hot encoded, transform to categorical labels to use compute_class_weight   
    if one_hot_encoded:
      class_series = np.argmax(class_series, axis=1)
  
    # Compute class weights with sklearn method
    class_labels = np.unique(class_series)
    class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=class_series)

    print(class_series)
    print(class_labels)

    return dict(zip(class_labels, class_weights))
  else:
    # It is neccessary that the multi-label values are one-hot encoded
    mlb = None
    if not one_hot_encoded:
      mlb = MultiLabelBinarizer()
      class_series = mlb.fit_transform(class_series)

    n_samples = len(class_series)
    n_classes = len(class_series[0])
    print(n_samples)
    print(n_classes)

    # Count each class frequency
    class_count = [0] * n_classes
    for classes in class_series:
        for index in range(n_classes):
            if classes[index] != 0:
                class_count[index] += 1
    
    # Compute class weights using balanced method
    class_weights = [n_samples / (n_classes * freq) if freq > 0 else 1 for freq in class_count]
    class_labels = range(len(class_weights)) if mlb is None else mlb.classes_
    return dict(zip(class_labels, class_weights))
    #return class_weights

In [52]:
class_series = np.array(train_labels)
class_wt = generate_class_weights(class_series, multi_class=False, one_hot_encoded=True)
print(class_wt)
#class_wt = torch.tensor(class_wt)
#print(class_wt)

97324
9
{0: 0.6622842833033916, 1: 0.6655451611138465, 2: 1.2358603174603175, 3: 1.0322430104789784, 4: 0.22505260723783096, 5: 0.7194316930196113, 6: 32.67002349781806, 7: 48.49227703039362, 8: 6.04797414864529}


In [53]:
def model_multiconv_1d(num_permissions):
    #embedding_dim = embedding_dim
    sequence_length = max_description_embeddings

    input_layer = Input(shape=(None,))

    conv_layers = []
    for filter_size in conv_filters_sizes:
        conv_layer_i = Embedding(num_words,
                                 output_dim=embedding_dim,
                                 input_length=sequence_length,
                                 weights=[embedding_matrix],
                                 trainable=False)(input_layer)
        conv_layer_i = Conv1D(filters=conv_filters_num,
                              kernel_size=filter_size,
                              padding='same',
                              activation='relu')(conv_layer_i)
        conv_layer_i = GlobalMaxPooling1D()(conv_layer_i)

        conv_layers.append(conv_layer_i)

    if len(conv_layers) == 1:
        previous_layer = conv_layers[0]
    else:
        concatenated_layer = concatenate(conv_layers, axis=-1)
        previous_layer = concatenated_layer

    for n_neurons in dense_layers:
        previous_layer = Dense(n_neurons, activation='relu')(previous_layer)
        previous_layer = Dropout(dropout)(previous_layer)

    output_layer = Dense(num_permissions, activation='sigmoid')(previous_layer)

    return keras.Model(inputs=input_layer, outputs=output_layer)

In [54]:
model = model_multiconv_1d(num_permissions)

In [55]:
model.compile(loss="binary_crossentropy", optimizer=Adam(0.0001), metrics=['accuracy'])
              
#metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])
train_metric = 'val_fb_macro'

In [56]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    58911600    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 300)    58911600    ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 300)    58911600    ['input_1[0][0]']                
                                                                                              

In [57]:
checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')

model.fit(train_padded, train_labels, 
          batch_size=batch_size, epochs=10, class_weight=class_wt,
          verbose=1, callbacks=[checkpoint], validation_data=(validation_padded, validation_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f10df4f0e50>

## 8. Performance on Test Set

In [58]:
test_labels = df_test[target_list]
test_input = df_test["Clean_Description"]

In [59]:
test_sequences = tokenizer.texts_to_sequences(test_input)
test_padded = pad_sequences(
    test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [60]:
print(df_test.Clean_Description[0])
print(test_sequences[0])

poetry lover find best poetry poets urdu problem solved app app contains time best urdu poetry famous urdu poets offline urdu poetry collection best urdu poetry famous urdu poets like mohsin naqvi mirza ghalib saghir siddiqui others also contains romantic urdu poetry sad urdu poetry urdu ghazals offline urdu shayari urdu shayari app contains best ever poetry famous poets times app contains best variety urdu poetry categories romantic poetry sad poetry attitude poetry ghazals inspirational poetry main features offline poetry collection gallery view poetry collection zooming feature save share poetry images app contains offline poetry urdu ghazals urdu poetry poetry famous poets urdu shayari much more
[2676, 1933, 28, 15, 2676, 9198, 444, 680, 2447, 1, 1, 197, 13, 15, 444, 2676, 1020, 444, 9198, 144, 444, 2676, 124, 15, 444, 2676, 1020, 444, 9198, 18, 26223, 37069, 13184, 14482, 30374, 331, 14, 197, 556, 444, 2676, 1682, 444, 2676, 444, 10274, 144, 444, 639, 444, 639, 1, 197, 15, 493, 26

In [61]:
print(test_labels)

      Camera  Location  Microphone  Contacts  Storage  Phone  SMS  Call_Log  \
0          0         0           0         0        1      0    0         0   
1          0         0           0         0        1      0    0         0   
2          0         0           0         0        1      0    0         0   
3          0         0           0         0        0      0    0         0   
4          1         0           0         0        0      1    0         0   
...      ...       ...         ...       ...      ...    ...  ...       ...   
4619       0         1           0         0        0      0    0         0   
4620       0         0           0         0        0      0    0         0   
4621       0         0           0         0        1      0    0         0   
4622       0         1           1         0        1      0    0         0   
4623       0         0           0         0        0      0    0         0   

      Calendar  
0            0  
1            0  


In [62]:
print(test_padded.shape)
print(test_labels.shape)

(4624, 600)
(4624, 9)


In [63]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

predictions = model.predict(test_padded)
print("predictions shape:", predictions.shape)

predictions shape: (4624, 9)


In [64]:
predictions[0:10]

array([[1.52012435e-04, 3.31086310e-04, 3.36040102e-06, 2.41869361e-06,
        6.12020016e-01, 3.77426128e-04, 4.59488801e-11, 1.03585319e-14,
        3.77623127e-10],
       [2.01721250e-05, 8.50324359e-05, 1.09611283e-04, 6.62798629e-05,
        6.81042112e-03, 3.44099186e-04, 1.33109711e-06, 9.38076616e-10,
        9.95450705e-07],
       [8.84955943e-01, 2.93402404e-01, 1.36270881e-01, 3.06787133e-01,
        9.88987625e-01, 2.35018700e-01, 2.21617025e-04, 4.13450944e-05,
        1.34447159e-03],
       [8.96905419e-07, 1.05495434e-04, 1.87649539e-06, 4.47243598e-04,
        1.72627717e-02, 8.93141783e-04, 4.10105883e-07, 6.23041951e-10,
        3.25768184e-07],
       [5.76294549e-02, 3.48743743e-05, 2.00436334e-04, 1.32547307e-03,
        7.44317695e-02, 2.44714413e-02, 9.93447702e-06, 3.61696308e-08,
        1.71520696e-07],
       [5.11535883e-01, 3.92195536e-03, 2.39587072e-02, 4.61845547e-02,
        9.83274639e-01, 2.73931008e-02, 6.28846283e-06, 3.59104860e-07,
        2.2

In [65]:
true_labels = test_labels.to_numpy()
true_labels[0:10]

array([[0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0]])

In [66]:
np.save("predictions.npy", predictions)
loaded_predictions = np.load("predictions.npy")
print(loaded_predictions)

[[1.5201244e-04 3.3108631e-04 3.3604010e-06 ... 4.5948880e-11
  1.0358532e-14 3.7762313e-10]
 [2.0172125e-05 8.5032436e-05 1.0961128e-04 ... 1.3310971e-06
  9.3807662e-10 9.9545070e-07]
 [8.8495594e-01 2.9340240e-01 1.3627088e-01 ... 2.2161703e-04
  4.1345094e-05 1.3444716e-03]
 ...
 [9.7202629e-01 8.0283590e-02 7.3179948e-01 ... 3.0272655e-04
  1.5743701e-04 9.6832355e-03]
 [1.5140799e-03 9.4847471e-01 9.9889636e-01 ... 9.8710679e-15
  2.8865920e-17 4.4724670e-06]
 [1.1791209e-05 1.0423958e-04 7.0151546e-06 ... 6.2604677e-08
  4.5762585e-11 5.3439848e-08]]


In [67]:
np.save("true_labels.npy", true_labels)
loaded_true_labels = np.load("true_labels.npy")
print(loaded_true_labels)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## 9. Threshold Calculation

In [68]:
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for data simulation
from sklearn.datasets import make_classification     # Create a synthetic dataframe
from sklearn.linear_model import LogisticRegression  # Classification model
from sklearn.model_selection import train_test_split # Split the dataframe
from sklearn.metrics import roc_curve                # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve   # Calculate the Precision-Recall curve
from sklearn.metrics import f1_score                 # Calculate the F-score
# Import module for data visualization
from plotnine import *
import plotnine

In [69]:
thresh_f = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
thresh_roc = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [70]:
n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("predictions.npy")
  true_labels = np.load("true_labels.npy")

  # Array for finding the optimal threshold
  thresholds = np.arange(0.0, 1.0, 0.0001)
  fscore = np.zeros(shape=(len(thresholds)))
  print('Length of sequence: {}'.format(len(thresholds)))

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Fit the model
  for index, elem in enumerate(thresholds):
    # Corrected probabilities
    y_pred_prob = (pred > elem).astype('int')
    # Calculate the f-score
    fscore[index] = f1_score(labels, y_pred_prob)

  # Find the optimal threshold
  index = np.argmax(fscore)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  fscoreOpt = round(fscore[index], ndigits = 4)
  thresh_f[i] = thresholdOpt
  print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, fscoreOpt))

print("-------------------------------------")
print("optimal threshold tuning for f-score")
print(thresh_f)

permission  0
Length of sequence: 10000
Best Threshold: 0.2941 with F-Score: 0.6795
permission  1
Length of sequence: 10000
Best Threshold: 0.0684 with F-Score: 0.5586
permission  2
Length of sequence: 10000
Best Threshold: 0.1487 with F-Score: 0.5724
permission  3
Length of sequence: 10000
Best Threshold: 0.1646 with F-Score: 0.516
permission  4
Length of sequence: 10000
Best Threshold: 0.064 with F-Score: 0.7908
permission  5
Length of sequence: 10000
Best Threshold: 0.0464 with F-Score: 0.5134
permission  6
Length of sequence: 10000
Best Threshold: 0.0554 with F-Score: 0.0909
permission  7
Length of sequence: 10000
Best Threshold: 0.0324 with F-Score: 0.5333
permission  8
Length of sequence: 10000
Best Threshold: 0.0261 with F-Score: 0.2968
-------------------------------------
optimal threshold tuning for f-score
[0.2941, 0.0684, 0.1487, 0.1646, 0.064, 0.0464, 0.0554, 0.0324, 0.0261]


In [71]:
##for roc curve with g-mean

n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("predictions.npy")
  true_labels = np.load("true_labels.npy")

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Create the ROC curve
  fpr, tpr, thresholds = roc_curve(labels, pred)

  df_fpr_tpr = pd.DataFrame({'FPR':fpr, 'TPR':tpr, 'Threshold':thresholds})

  # Calculate the G-mean
  gmean = np.sqrt(tpr * (1 - fpr))

  # Find the optimal threshold
  index = np.argmax(gmean)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  gmeanOpt = round(gmean[index], ndigits = 4)
  fprOpt = round(fpr[index], ndigits = 4)
  tprOpt = round(tpr[index], ndigits = 4)

  thresh_roc[i] = thresholdOpt
  print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
  print('FPR: {}, TPR: {}'.format(fprOpt, tprOpt))

print("-------------------------------------")
print("ROC curve with G-mean threshold tuning")
print(thresh_roc)

permission  0
Best Threshold: 0.039900001138448715 with G-Mean: 0.8417
FPR: 0.1431, TPR: 0.8268
permission  1
Best Threshold: 0.010400000028312206 with G-Mean: 0.7722
FPR: 0.2034, TPR: 0.7486
permission  2
Best Threshold: 0.009600000455975533 with G-Mean: 0.8132
FPR: 0.1289, TPR: 0.7592
permission  3
Best Threshold: 0.006500000134110451 with G-Mean: 0.7912
FPR: 0.2015, TPR: 0.784
permission  4
Best Threshold: 0.3393999934196472 with G-Mean: 0.7755
FPR: 0.2129, TPR: 0.7639
permission  5
Best Threshold: 0.010700000450015068 with G-Mean: 0.7562
FPR: 0.2329, TPR: 0.7454
permission  6
Best Threshold: 0.0 with G-Mean: 0.8296
FPR: 0.3117, TPR: 1.0
permission  7
Best Threshold: 0.0 with G-Mean: 0.8632
FPR: 0.2549, TPR: 1.0
permission  8
Best Threshold: 9.999999747378752e-05 with G-Mean: 0.8056
FPR: 0.1888, TPR: 0.8
-------------------------------------
ROC curve with G-mean threshold tuning
[0.0399, 0.0104, 0.0096, 0.0065, 0.3394, 0.0107, 0.0, 0.0, 1e-04]


## 10. Performance Score

In [None]:
#Fscore micro for different thresholds-

In [72]:
#predictions = np.load("predictions.npy")
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [73]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import f1_score

def f_at_1(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_f[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_f[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_f[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_f[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_f[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_f[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_f[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_f[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_f[8]

   
    acc[0] = f1_score(labels[:, 0], preds_th[:, 0])
    acc[1] = f1_score(labels[:, 1], preds_th[:, 1])
    acc[2] = f1_score(labels[:, 2], preds_th[:, 2])
    acc[3] = f1_score(labels[:, 3], preds_th[:, 3])
    acc[4] = f1_score(labels[:, 4], preds_th[:, 4])
    acc[5] = f1_score(labels[:, 5], preds_th[:, 5])
    acc[6] = f1_score(labels[:, 6], preds_th[:, 6])
    acc[7] = f1_score(labels[:, 7], preds_th[:, 7])
    acc[8] = f1_score(labels[:, 8], preds_th[:, 8])

   #f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    #print(acc)
    return acc

In [74]:
eval_accuracy = f_at_1(predictions, true_labels)

np.save("F1_CV3_N121k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.
print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")
avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 score: {0:.4f}".format(avg_score))

  Camera    : 0.6795
  Location  : 0.5586
  Microphone: 0.5724
  Contacts  : 0.5160
  Storage   : 0.7908
  Phone     : 0.5134
  SMS       : 0.0909
  Call_Log  : 0.5333
  Calendar  : 0.2968

  Average F1 score: 0.5058


In [None]:
#Fscore micro for different thresholds-

In [75]:
#predictions = np.load("predictions.npy")
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [76]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import f1_score

def f1micro_accuracy(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_f[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_f[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_f[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_f[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_f[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_f[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_f[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_f[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_f[8]

    acc[0] = f1_score(labels[:, 0], preds_th[:, 0], average='micro')
    acc[1] = f1_score(labels[:, 1], preds_th[:, 1], average='micro')
    acc[2] = f1_score(labels[:, 2], preds_th[:, 2], average='micro')
    acc[3] = f1_score(labels[:, 3], preds_th[:, 3], average='micro')
    acc[4] = f1_score(labels[:, 4], preds_th[:, 4], average='micro')
    acc[5] = f1_score(labels[:, 5], preds_th[:, 5], average='micro')
    acc[6] = f1_score(labels[:, 6], preds_th[:, 6], average='micro')
    acc[7] = f1_score(labels[:, 7], preds_th[:, 7], average='micro')
    acc[8] = f1_score(labels[:, 8], preds_th[:, 8], average='micro')
    
   #f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    #print(acc)
    return acc

In [77]:
eval_accuracy = f1micro_accuracy(predictions, true_labels)

np.save("F1Mic_CV3_N121k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 (micro) score: {0:.4f}".format(avg_score))

  Camera    : 0.8919
  Location  : 0.8486
  Microphone: 0.9208
  Contacts  : 0.8953
  Storage   : 0.7591
  Phone     : 0.8430
  SMS       : 0.9957
  Call_Log  : 0.9985
  Calendar  : 0.9764

  Average F1 (micro) score: 0.9033


In [None]:
#roc-auc score for different thresholds-

In [78]:
import numpy as np
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [79]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import roc_auc_score

def roc_auc(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_roc[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_roc[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_roc[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_roc[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_roc[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_roc[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_roc[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_roc[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_roc[8]


    acc[0] = roc_auc_score(labels[:, 0], preds_th[:, 0])
    acc[1] = roc_auc_score(labels[:, 1], preds_th[:, 1])
    acc[2] = roc_auc_score(labels[:, 2], preds_th[:, 2])
    acc[3] = roc_auc_score(labels[:, 3], preds_th[:, 3])
    acc[4] = roc_auc_score(labels[:, 4], preds_th[:, 4])
    acc[5] = roc_auc_score(labels[:, 5], preds_th[:, 5])
    acc[6] = roc_auc_score(labels[:, 6], preds_th[:, 6])
    acc[7] = roc_auc_score(labels[:, 7], preds_th[:, 7])
    acc[8] = roc_auc_score(labels[:, 8], preds_th[:, 8])
   

    #print(acc)
    return acc

In [80]:
#roc-auc score

eval_accuracy = roc_auc(predictions, true_labels)

np.save("ROC_CV3_N121k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average ROC_AUC score: {0:.4f}".format(avg_score))

  Camera    : 0.8412
  Location  : 0.7712
  Microphone: 0.8140
  Contacts  : 0.7912
  Storage   : 0.7753
  Phone     : 0.7561
  SMS       : 0.5000
  Call_Log  : 0.5000
  Calendar  : 0.7894

  Average ROC_AUC score: 0.7265


# ACNET performance

#### a) Data Preparation

In [81]:
## drive path
acnet_path = "/content/drive/MyDrive/MetadataCSV/acnet_dataset_preprocess.csv"

In [82]:
df_acnet = pd.read_csv(acnet_path) 
print(df_acnet.shape)

(1417, 14)


In [83]:
df_acnet = df_acnet.dropna(subset=['Clean_Description'])

In [84]:
df_acnet.head()

Unnamed: 0,app_id,description,Storage,Contacts,Location,Camera,Microphone,SMS,Call_Log,Phone,Calendar,Settings,Tasks,Clean_Description
0,0,ROOT is REQUIRED for automatic synchronization...,1,0,1,0,0,0,0,0,0,0,0,root is required for automatic synchronization...
1,1,This app delivers short scriptures containing ...,0,0,0,0,0,1,0,0,0,0,0,this app delivers short scriptures containing ...
2,2,This game is surprisingly simple and very addi...,0,0,0,0,0,0,0,0,0,0,0,this game is surprisingly simple and very addi...
3,3,It is an online RPG game based on LBS location...,0,0,1,0,0,0,0,0,0,0,0,it is an online rpg game based on lbs location...
4,4,Christmas is in the air. Get yourself in the h...,0,1,0,0,0,0,0,0,0,1,1,christmas is in the air. get yourself in the h...


In [85]:
df_acnet["Clean_Description"] = df_acnet["Clean_Description"].map(remove_stopwords)

In [86]:
df_acnet["Clean_Description"] = df_acnet["Clean_Description"].map(lambda x: remove_punct(x))

In [87]:
acnet_labels = df_acnet[target_list]
acnet_input = df_acnet["Clean_Description"]

In [88]:
acnet_sequences = tokenizer.texts_to_sequences(acnet_input)
acnet_padded = pad_sequences(
    acnet_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [89]:
print(acnet_padded.shape)
print(acnet_labels.shape)

(1414, 600)
(1414, 9)


#### b) Get Predictions

In [90]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

predictions = model.predict(acnet_padded)
print("predictions shape:", predictions.shape)

predictions shape: (1414, 9)


In [91]:
acnet_labels = acnet_labels.to_numpy()

In [92]:
np.save("acnet_predictions.npy", predictions)
loaded_predictions = np.load("acnet_predictions.npy")

np.save("acnet_labels.npy", acnet_labels)
loaded_true_labels = np.load("acnet_labels.npy")

#### c) Threshold Calculation

In [93]:
thresh_f = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
thresh_roc = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [94]:
n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("acnet_predictions.npy")
  true_labels = np.load("acnet_labels.npy")

  # Array for finding the optimal threshold
  thresholds = np.arange(0.0, 1.0, 0.0001)
  fscore = np.zeros(shape=(len(thresholds)))
  print('Length of sequence: {}'.format(len(thresholds)))

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Fit the model
  for index, elem in enumerate(thresholds):
    # Corrected probabilities
    y_pred_prob = (pred > elem).astype('int')
    # Calculate the f-score
    fscore[index] = f1_score(labels, y_pred_prob)

  # Find the optimal threshold
  index = np.argmax(fscore)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  fscoreOpt = round(fscore[index], ndigits = 4)
  thresh_f[i] = thresholdOpt
  print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, fscoreOpt))

print("-------------------------------------")
print("optimal threshold tuning for f-score")
print(thresh_f)

permission  0
Length of sequence: 10000
Best Threshold: 0.7481 with F-Score: 0.4623
permission  1
Length of sequence: 10000
Best Threshold: 0.0574 with F-Score: 0.5547
permission  2
Length of sequence: 10000
Best Threshold: 0.2244 with F-Score: 0.4967
permission  3
Length of sequence: 10000
Best Threshold: 0.0435 with F-Score: 0.6279
permission  4
Length of sequence: 10000
Best Threshold: 0.0077 with F-Score: 0.5854
permission  5
Length of sequence: 10000
Best Threshold: 0.3556 with F-Score: 0.439
permission  6
Length of sequence: 10000
Best Threshold: 0.0015 with F-Score: 0.5077
permission  7
Length of sequence: 10000
Best Threshold: 0.0031 with F-Score: 0.38
permission  8
Length of sequence: 10000
Best Threshold: 0.0024 with F-Score: 0.381
-------------------------------------
optimal threshold tuning for f-score
[0.7481, 0.0574, 0.2244, 0.0435, 0.0077, 0.3556, 0.0015, 0.0031, 0.0024]


In [95]:
##for roc curve with g-mean

n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("acnet_predictions.npy")
  true_labels = np.load("acnet_labels.npy")

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Create the ROC curve
  fpr, tpr, thresholds = roc_curve(labels, pred)

  df_fpr_tpr = pd.DataFrame({'FPR':fpr, 'TPR':tpr, 'Threshold':thresholds})

  # Calculate the G-mean
  gmean = np.sqrt(tpr * (1 - fpr))

  # Find the optimal threshold
  index = np.argmax(gmean)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  gmeanOpt = round(gmean[index], ndigits = 4)
  fprOpt = round(fpr[index], ndigits = 4)
  tprOpt = round(tpr[index], ndigits = 4)

  thresh_roc[i] = thresholdOpt
  print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
  print('FPR: {}, TPR: {}'.format(fprOpt, tprOpt))

print("-------------------------------------")
print("ROC curve with G-mean threshold tuning")
print(thresh_roc)

permission  0
Best Threshold: 0.024399999529123306 with G-Mean: 0.7196
FPR: 0.3485, TPR: 0.7948
permission  1
Best Threshold: 0.01730000041425228 with G-Mean: 0.735
FPR: 0.2982, TPR: 0.7697
permission  2
Best Threshold: 0.010900000110268593 with G-Mean: 0.7587
FPR: 0.2393, TPR: 0.7568
permission  3
Best Threshold: 0.043800000101327896 with G-Mean: 0.7312
FPR: 0.2298, TPR: 0.6943
permission  4
Best Threshold: 0.34290000796318054 with G-Mean: 0.5543
FPR: 0.4934, TPR: 0.6066
permission  5
Best Threshold: 0.05469999834895134 with G-Mean: 0.767
FPR: 0.2631, TPR: 0.7982
permission  6
Best Threshold: 9.999999747378752e-05 with G-Mean: 0.7733
FPR: 0.2837, TPR: 0.8349
permission  7
Best Threshold: 0.0 with G-Mean: 0.7798
FPR: 0.305, TPR: 0.875
permission  8
Best Threshold: 0.00039999998989515007 with G-Mean: 0.7709
FPR: 0.2275, TPR: 0.7692
-------------------------------------
ROC curve with G-mean threshold tuning
[0.0244, 0.0173, 0.0109, 0.0438, 0.3429, 0.0547, 1e-04, 0.0, 0.0004]


#### d) Accuracy Score

In [None]:
#Fscore micro for different thresholds-

In [96]:
#predictions = np.load("predictions.npy")
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [97]:
eval_accuracy = f_at_1(predictions, true_labels)

np.save("ACNET_F1_CV3_N121k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.
print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")
avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 score: {0:.4f}".format(avg_score))

  Camera    : 0.4623
  Location  : 0.5547
  Microphone: 0.4967
  Contacts  : 0.6279
  Storage   : 0.5854
  Phone     : 0.4390
  SMS       : 0.5077
  Call_Log  : 0.3800
  Calendar  : 0.3810

  Average F1 score: 0.4927


In [None]:
#Fscore micro for different thresholds-

In [98]:
#predictions = np.load("predictions.npy")
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [99]:
eval_accuracy = f1micro_accuracy(predictions, true_labels)

np.save("ACNET_F1Mic_CV3_N121k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 (micro) score: {0:.4f}".format(avg_score))

  Camera    : 0.8437
  Location  : 0.7638
  Microphone: 0.8925
  Contacts  : 0.7468
  Storage   : 0.4540
  Phone     : 0.9024
  SMS       : 0.8423
  Call_Log  : 0.9123
  Calendar  : 0.8529

  Average F1 (micro) score: 0.8012


In [None]:
#roc-auc score for different thresholds-

In [100]:
import numpy as np
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [101]:
#roc-auc score

eval_accuracy = roc_auc(predictions, true_labels)

np.save("ACNET_ROC_CV3_N121k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average ROC_AUC score: {0:.4f}".format(avg_score))

  Camera    : 0.7209
  Location  : 0.7353
  Microphone: 0.7583
  Contacts  : 0.7322
  Storage   : 0.5566
  Phone     : 0.7676
  SMS       : 0.7659
  Call_Log  : 0.5000
  Calendar  : 0.7691

  Average ROC_AUC score: 0.7007
