Cross Validation 5

size =  20247

test size 4k+

epoch-5

apply sigmoid (prediction probabilities are logits)

groups-9

target_list = ['Camera', 'Location', 'Microphone', 'Contacts', 'Storage', 'Phone', 'SMS', 'Call_Log', 'Calendar']

threshold-tuning = yes

df_2k = df[(df['Rating'] >= 4.5) & (df['Maximum_Installs'] >= 20000)]

accuracy score : F1 score (micro) and ROC_AUC score

## 1. Setup

In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
print(tf.__version__)

2.8.0


In [3]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


## 2. Load Dataset

In [4]:
## import libraries

import itertools
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from sklearn import preprocessing
%matplotlib inline

In [5]:
## uploading csv files on drive (to avoid uploading on colab in every session)

from google.colab import drive
drive.mount("/content/drive/")

## drive path
train_path = "/content/drive/MyDrive/MetadataCSV/data_20247/CV_df_train_5.csv"
val_path = "/content/drive/MyDrive/MetadataCSV/data_20247/CV_df_val_5.csv"
test_path = "/content/drive/MyDrive/MetadataCSV/test_dataset.csv"

Mounted at /content/drive/


In [6]:
df_train = pd.read_csv(train_path) 
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(16197, 23)
(4050, 23)
(4624, 23)


In [7]:
df_train.head(2)

Unnamed: 0,App_Name,App_Id,Category,Rating,Maximum_Installs,Editors_Choice,Description,Privacy_Policy,Sensors,Camera,...,Contacts,SMS,Storage,Phone,Get_Accounts,Call_Log,desc_length,Clean_Description,clean_desc_length,token_length
0,Maritime Schedule,com.dreambrunomsn.escalamaritimo,Tools,4.8,21456,False,Maritime Schedule allows you to create a work ...,,0,0,...,0,0,0,0,0,0,1836,maritime schedule allows you to create a work ...,1798,380
1,FREE Polish by Nemo,com.nemoapps.android.polish,Education,4.5,115431,False,Nemo is designed to start you speaking the mos...,http://nemoapps.com/privacy,0,0,...,0,0,0,0,0,0,2833,nemo is designed to start you speaking the mos...,2762,540


In [8]:
target_list = ['Camera', 'Location', 'Microphone', 'Contacts', 'Storage', 'Phone', 'SMS', 'Call_Log', 'Calendar']

In [9]:
# getting number of nonzeros in each column
df_train[target_list].astype(bool).sum(axis=0)

Camera        2507
Location      2483
Microphone    1500
Contacts      1656
Storage       8228
Phone         2443
SMS             57
Call_Log        34
Calendar       294
dtype: int64

In [10]:
df_val[target_list].astype(bool).sum(axis=0)

Camera         591
Location       597
Microphone     350
Contacts       394
Storage       2025
Phone          556
SMS             14
Call_Log         9
Calendar        58
dtype: int64

In [11]:
df_test[target_list].astype(bool).sum(axis=0)

Camera         745
Location       716
Microphone     436
Contacts       500
Storage       2402
Phone          652
SMS             11
Call_Log         6
Calendar        90
dtype: int64

## 3. Data Preprocess

#### 3.1 Clean Text

In [12]:
import nltk

nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]

    return " ".join(text)

In [14]:
df_train["Clean_Description"] = df_train["Clean_Description"].map(remove_stopwords)
df_val["Clean_Description"] = df_val["Clean_Description"].map(remove_stopwords)
df_test["Clean_Description"] = df_test["Clean_Description"].map(remove_stopwords)

In [15]:
import string

def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

In [16]:
df_train["Clean_Description"] = df_train["Clean_Description"].map(lambda x: remove_punct(x))
df_val["Clean_Description"] = df_val["Clean_Description"].map(lambda x: remove_punct(x))
df_test["Clean_Description"] = df_test["Clean_Description"].map(lambda x: remove_punct(x))

#### 3.2 Create Corpus

In [17]:
from keras.layers import *

In [18]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [19]:
from nltk.tokenize import word_tokenize
import nltk


def create_corpus_tk(df):
    corpus = []
    for text in df["Clean_Description"]:
        words = [word.lower() for word in word_tokenize(text)]
        corpus.append(words)
    return corpus

In [20]:
df_T = df_train.append(df_val)
df_T.reset_index(drop=True,inplace=True)

In [21]:
corpus = create_corpus_tk(df_T)

In [22]:
num_words = len(corpus)
print(num_words)

20247


In [23]:
corpus[0]

['maritime',
 'schedule',
 'allows',
 'create',
 'work',
 'schedule',
 'simple',
 'fast',
 'way',
 'making',
 'easier',
 'control',
 'days',
 'work',
 'easy',
 'remember',
 'date',
 'next',
 'boarding',
 'disembark',
 'would',
 'already',
 'know',
 'going',
 'home',
 'childrens',
 'birthdays',
 'christmas',
 'special',
 'date',
 'need',
 'enter',
 'schedule',
 'boarding',
 'date',
 'app',
 'generate',
 'work',
 'schedule',
 'next',
 'years',
 'also',
 'count',
 'calendar',
 'appointment',
 'register',
 'appointments',
 'never',
 'forget',
 'special',
 'date',
 'like',
 'wedding',
 'anniversary',
 'dentists',
 'appointment',
 'next',
 'updates',
 'added',
 'function',
 'alert',
 'appointments',
 'function',
 'allow',
 'save',
 'compare',
 'friends',
 'work',
 'schedules',
 'plan',
 'meeting',
 'barbecue',
 'board',
 'current',
 'version',
 'create',
 'work',
 'schedule',
 'easy',
 'access',
 'boarding',
 'disembark',
 'days',
 'days',
 'off',
 'applies',
 'you',
 'pre',
 'boarding',
 'm

##4. Train Validation Split

In [None]:
# split the data into a training set and a validation set

In [24]:
train_inputs = df_train["Clean_Description"]
validation_inputs = df_val["Clean_Description"]
train_labels = df_train[target_list]
validation_labels = df_val[target_list]

## 5. Tokenization

In [25]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_inputs)

In [26]:
MAX_SEQUENCE_LENGTH = 600

In [27]:
train_sequences = tokenizer.texts_to_sequences(train_inputs)

In [28]:
train_inputs

0        maritime schedule allows create work schedule ...
1        nemo designed start speaking useful words poli...
2        wps app check security wi fi networks testing ...
3        cadenas parts solid edge app download service ...
4        woman calendar  every woman needs  user interf...
                               ...                        
16192    time workouts plans hybrid interval timer app ...
16193    ever twitch play loudest gachi want express lo...
16194      synonyms relationships entertainment birds v...
16195    colorful lion theme gives best keyboard backgr...
16196    keep track weekly k run results parkrunner ana...
Name: Clean_Description, Length: 16197, dtype: object

In [29]:
train_sequences

[[12867,
  756,
  216,
  73,
  118,
  756,
  52,
  74,
  72,
  480,
  421,
  343,
  278,
  118,
  25,
  798,
  277,
  209,
  11024,
  448,
  800,
  131,
  635,
  39,
  2936,
  3333,
  379,
  301,
  277,
  44,
  699,
  756,
  11024,
  277,
  1,
  1329,
  118,
  756,
  209,
  673,
  17,
  1215,
  232,
  2755,
  1280,
  2992,
  183,
  606,
  301,
  277,
  19,
  1761,
  2485,
  13695,
  2755,
  209,
  475,
  401,
  537,
  1177,
  2992,
  537,
  670,
  85,
  1435,
  63,
  118,
  2460,
  432,
  2394,
  8889,
  1060,
  303,
  202,
  73,
  118,
  756,
  25,
  60,
  11024,
  278,
  278,
  2199,
  5023,
  66,
  1442,
  11024,
  2394,
  477,
  52,
  908,
  72,
  129,
  42,
  42,
  1,
  269,
  73,
  2460,
  823,
  278,
  540,
  2460,
  759,
  823,
  42,
  1426,
  17,
  53,
  278,
  278,
  1329,
  809,
  118,
  756,
  1053,
  202,
  800,
  49,
  18,
  73,
  498,
  7,
  1,
  232,
  2992,
  1029,
  118,
  756,
  42,
  69,
  201,
  5024,
  1151,
  50,
  3233,
  445,
  3856,
  2811,
  515,
  1762,
  9,

In [30]:
train_padded = pad_sequences(
    train_sequences, maxlen=MAX_SEQUENCE_LENGTH, truncating="post", padding="post"
)

In [31]:
train_padded

array([[12867,   756,   216, ...,     0,     0,     0],
       [ 2766,   157,   127, ...,     0,     0,     0],
       [ 4916,     1,    99, ...,     0,     0,     0],
       ...,
       [ 3922,  2587,   959, ...,     0,     0,     0],
       [  217,   757,     2, ...,     0,     0,     0],
       [  121,   197,   735, ...,     0,     0,     0]], dtype=int32)

In [32]:
validation_sequences = tokenizer.texts_to_sequences(validation_inputs)
validation_padded = pad_sequences(
    validation_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [33]:
validation_padded

array([[   37,    30,   515, ...,     0,     0,     0],
       [   18,  1892,  1028, ...,     0,     0,     0],
       [  153,     1,  2196, ...,     0,     0,     0],
       ...,
       [    6,   566,   922, ...,     0,     0,     0],
       [   62,     5,  1670, ...,     0,     0,     0],
       [  173,    42, 11307, ...,     0,     0,     0]], dtype=int32)

In [34]:
print(df_train.Clean_Description[0])
print(train_sequences[0])

maritime schedule allows create work schedule simple fast way making easier control days work easy remember date next boarding disembark would already know going home childrens birthdays christmas special date need enter schedule boarding date app generate work schedule next years also count calendar appointment register appointments never forget special date like wedding anniversary dentists appointment next updates added function alert appointments function allow save compare friends work schedules plan meeting barbecue board current version create work schedule easy access boarding disembark days days off applies you pre boarding meeting everything simple intuitive way differing status day collors day app initiates option create schedules format days common schedules edit format day months also configurate different woking days days generate customized work schedule second version already available one create events use app calendar appointments inside work schedule collors day easi

In [35]:
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))

Number of unique words: 66708


In [36]:
word_index

{'app': 1,
 'theme': 2,
 'keyboard': 3,
 'free': 4,
 'wallpaper': 5,
 'phone': 6,
 'use': 7,
 'wallpapers': 8,
 'new': 9,
 'features': 10,
 'application': 11,
 'get': 12,
 'time': 13,
 'make': 14,
 'launcher': 15,
 'android': 16,
 'also': 17,
 'one': 18,
 'like': 19,
 'hd': 20,
 'download': 21,
 'screen': 22,
 'best': 23,
 'us': 24,
 'easy': 25,
 'english': 26,
 'live': 27,
 'learn': 28,
 'themes': 29,
 'find': 30,
 'love': 31,
 'help': 32,
 'share': 33,
 'mobile': 34,
 'please': 35,
 'set': 36,
 'want': 37,
 'apps': 38,
 'home': 39,
 'support': 40,
 'device': 41,
 'day': 42,
 'words': 43,
 'need': 44,
 'enjoy': 45,
 'language': 46,
 'using': 47,
 'images': 48,
 'available': 49,
 'many': 50,
 'note': 51,
 'simple': 52,
 'different': 53,
 'beautiful': 54,
 'add': 55,
 'text': 56,
 'apply': 57,
 'information': 58,
 'etc': 59,
 'access': 60,
 'daily': 61,
 'background': 62,
 'friends': 63,
 'every': 64,
 'stickers': 65,
 'you': 66,
 'video': 67,
 'install': 68,
 'easily': 69,
 'life': 70,

In [37]:
word_index["reason"]

2414

In [38]:
print(validation_sequences[0])

[37, 30, 515, 1, 398, 19, 137, 2114, 2114, 6091, 1362, 167, 1362, 46, 52, 72, 18797, 120, 52, 7, 337, 6, 122, 259, 855, 59, 11, 1491, 959, 1036, 2056, 5104, 3679, 515]


In [39]:
word_index["listen"]

296

## 6. Create the Embedding dictionary

In [40]:
embedding_dict = {}
with open("/content/drive/MyDrive/glove6B/glove.6B.300d.txt", "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
f.close()

In [41]:
embedding_dict

Output hidden; open in https://colab.research.google.com to view.

In [42]:
embedding_dim = 300

In [43]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i < num_words:
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i] = emb_vec

In [44]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.73247999, -0.078309  ,  0.018521  , ...,  0.21988   ,
        -0.13121   ,  0.043819  ],
       [-0.25198001, -0.15583999, -0.4535    , ...,  0.40935999,
        -0.49140999, -0.096712  ],
       ...,
       [-0.12616999,  0.047188  , -0.045178  , ...,  0.11071   ,
         0.35317001, -0.20630001],
       [-0.16889   , -0.16912   ,  0.47323   , ..., -0.091788  ,
        -0.14264999, -0.68147999],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [45]:
word_index["reason"]

2414

In [46]:
embedding_dict.get("reason")

array([ 1.3197e-01, -1.2591e-01,  4.3864e-02,  3.6321e-02,  9.6646e-02,
       -1.3829e-01,  3.8637e-01,  7.6962e-02, -1.1306e-01, -1.6083e+00,
        2.0062e-02, -5.2665e-02, -1.6597e-01,  1.2171e-01,  2.8945e-01,
       -1.7289e-01,  5.3035e-02, -2.7842e-01,  8.2376e-02, -1.1980e-02,
        3.7228e-02,  2.1867e-01,  1.5267e-01, -8.4361e-02, -3.1292e-01,
       -3.2093e-02,  2.0281e-01, -3.5910e-01,  1.6873e-02, -2.2996e-01,
        2.6044e-02,  3.5910e-01, -3.2431e-01, -5.4194e-01, -9.7742e-01,
        3.9198e-02, -1.7794e-01,  7.4200e-02, -4.1251e-02, -7.8917e-02,
        2.0646e-01, -6.6538e-02,  6.7401e-02,  1.4965e-01,  5.9107e-02,
       -3.7585e-02, -3.4672e-02,  5.5291e-02, -8.5636e-02,  9.1743e-02,
        4.9125e-01,  7.5606e-03, -3.0860e-01,  5.8902e-04, -8.6975e-02,
        3.9904e-01, -1.2695e-01,  2.2471e-01,  2.3658e-01,  3.0489e-01,
       -6.7363e-02,  3.5839e-01,  4.9703e-01,  4.1895e-01, -3.8494e-01,
       -2.6257e-01,  1.6049e-01, -1.0992e-01,  2.7477e-02,  1.49

In [47]:
(embedding_matrix[2363] == embedding_dict.get("reason")).all()

False

In [48]:
print(train_padded.shape)
print(train_labels.shape)

(16197, 600)
(16197, 9)


In [49]:
print(validation_padded.shape)
print(validation_labels.shape)

(4050, 600)
(4050, 9)


## 7. Build CNN Model

In [50]:
num_permissions = 9 #11
drop = 0.2

batch_size = 32
max_train_epochs = 300
validation_split = 6

early_stopping_patience = 16
early_stopping_delta = 0.02  # 2%

max_description_embeddings = 600
embedding_dim = 300  # +1 for flag

#downloaded_embedding_file = data_folder + "/word_embeddings/glove.6B.300d.txt"


conv_filters_num = 1024
conv_filters_sizes = [1, 2, 3]
dense_layers = [5000, 2500]
dropout = 0.2

heatmap_threshold = 0.49

In [51]:
# Function for class weights

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer


def generate_class_weights(class_series, multi_class=True, one_hot_encoded=False):
  """
  Method to generate class weights given a set of multi-class or multi-label labels, both one-hot-encoded or not.
  Some examples of different formats of class_series and their outputs are:
    - generate_class_weights(['mango', 'lemon', 'banana', 'mango'], multi_class=True, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 1.3333333333333333, 'mango': 0.6666666666666666}
    - generate_class_weights([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]], multi_class=True, one_hot_encoded=True)
    {0: 0.6666666666666666, 1: 1.3333333333333333, 2: 1.3333333333333333}
    - generate_class_weights([['mango', 'lemon'], ['mango'], ['lemon', 'banana'], ['lemon']], multi_class=False, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 0.4444444444444444, 'mango': 0.6666666666666666}
    - generate_class_weights([[0, 1, 1], [0, 0, 1], [1, 1, 0], [0, 1, 0]], multi_class=False, one_hot_encoded=True)
    {0: 1.3333333333333333, 1: 0.4444444444444444, 2: 0.6666666666666666}
  The output is a dictionary in the format { class_label: class_weight }. In case the input is one hot encoded, the class_label would be index
  of appareance of the label when the dataset was processed. 
  In multi_class this is np.unique(class_series) and in multi-label np.unique(np.concatenate(class_series)).
  Author: Angel Igareta (angel@igareta.com)
  """
  if multi_class:
    # If class is one hot encoded, transform to categorical labels to use compute_class_weight   
    if one_hot_encoded:
      class_series = np.argmax(class_series, axis=1)
  
    # Compute class weights with sklearn method
    class_labels = np.unique(class_series)
    class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=class_series)

    print(class_series)
    print(class_labels)

    return dict(zip(class_labels, class_weights))
  else:
    # It is neccessary that the multi-label values are one-hot encoded
    mlb = None
    if not one_hot_encoded:
      mlb = MultiLabelBinarizer()
      class_series = mlb.fit_transform(class_series)

    n_samples = len(class_series)
    n_classes = len(class_series[0])
    print(n_samples)
    print(n_classes)

    # Count each class frequency
    class_count = [0] * n_classes
    for classes in class_series:
        for index in range(n_classes):
            if classes[index] != 0:
                class_count[index] += 1
    
    # Compute class weights using balanced method
    class_weights = [n_samples / (n_classes * freq) if freq > 0 else 1 for freq in class_count]
    class_labels = range(len(class_weights)) if mlb is None else mlb.classes_
    return dict(zip(class_labels, class_weights))
    #return class_weights

In [52]:
class_series = np.array(train_labels)
class_wt = generate_class_weights(class_series, multi_class=False, one_hot_encoded=True)
print(class_wt)
#class_wt = torch.tensor(class_wt)
#print(class_wt)

16197
9
{0: 0.7178566679962771, 1: 0.7247952745334945, 2: 1.1997777777777778, 3: 1.086755233494364, 4: 0.21872467995462647, 5: 0.7366625733387911, 6: 31.573099415204677, 7: 52.931372549019606, 8: 6.121315192743764}


In [53]:
def model_multiconv_1d(num_permissions):
    #embedding_dim = embedding_dim
    sequence_length = max_description_embeddings

    input_layer = Input(shape=(None,))

    conv_layers = []
    for filter_size in conv_filters_sizes:
        conv_layer_i = Embedding(num_words,
                                 output_dim=embedding_dim,
                                 input_length=sequence_length,
                                 weights=[embedding_matrix],
                                 trainable=False)(input_layer)
        conv_layer_i = Conv1D(filters=conv_filters_num,
                              kernel_size=filter_size,
                              padding='same',
                              activation='relu')(conv_layer_i)
        conv_layer_i = GlobalMaxPooling1D()(conv_layer_i)

        conv_layers.append(conv_layer_i)

    if len(conv_layers) == 1:
        previous_layer = conv_layers[0]
    else:
        concatenated_layer = concatenate(conv_layers, axis=-1)
        previous_layer = concatenated_layer

    for n_neurons in dense_layers:
        previous_layer = Dense(n_neurons, activation='relu')(previous_layer)
        previous_layer = Dropout(dropout)(previous_layer)

    output_layer = Dense(num_permissions, activation='sigmoid')(previous_layer)

    return keras.Model(inputs=input_layer, outputs=output_layer)

In [54]:
model = model_multiconv_1d(num_permissions)

In [55]:
model.compile(loss="binary_crossentropy", optimizer=Adam(0.0001), metrics=['accuracy'])
              
#metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])
train_metric = 'val_fb_macro'

In [56]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    20012700    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 300)    20012700    ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 300)    20012700    ['input_1[0][0]']                
                                                                                              

In [57]:
checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')

model.fit(train_padded, train_labels, 
          batch_size=batch_size, epochs=10, class_weight=class_wt,
          verbose=1, callbacks=[checkpoint], validation_data=(validation_padded, validation_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd686307910>

## 8. Performance on Test Set

In [58]:
test_labels = df_test[target_list]
test_input = df_test["Clean_Description"]

In [59]:
test_sequences = tokenizer.texts_to_sequences(test_input)
test_padded = pad_sequences(
    test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [60]:
print(df_test.Clean_Description[0])
print(test_sequences[0])

poetry lover find best poetry poets urdu problem solved app app contains time best urdu poetry famous urdu poets offline urdu poetry collection best urdu poetry famous urdu poets like mohsin naqvi mirza ghalib saghir siddiqui others also contains romantic urdu poetry sad urdu poetry urdu ghazals offline urdu shayari urdu shayari app contains best ever poetry famous poets times app contains best variety urdu poetry categories romantic poetry sad poetry attitude poetry ghazals inspirational poetry main features offline poetry collection gallery view poetry collection zooming feature save share poetry images app contains offline poetry urdu ghazals urdu poetry poetry famous poets urdu shayari much more
[2236, 1875, 30, 23, 2236, 9155, 385, 783, 2059, 1, 1, 212, 13, 23, 385, 2236, 1146, 385, 9155, 123, 385, 2236, 167, 23, 385, 2236, 1146, 385, 9155, 19, 19455, 19456, 407, 17, 212, 439, 385, 2236, 1977, 385, 2236, 385, 10212, 123, 385, 812, 385, 812, 1, 212, 23, 539, 2236, 1146, 9155, 211, 

In [61]:
print(test_labels)

      Camera  Location  Microphone  Contacts  Storage  Phone  SMS  Call_Log  \
0          0         0           0         0        1      0    0         0   
1          0         0           0         0        1      0    0         0   
2          0         0           0         0        1      0    0         0   
3          0         0           0         0        0      0    0         0   
4          1         0           0         0        0      1    0         0   
...      ...       ...         ...       ...      ...    ...  ...       ...   
4619       0         1           0         0        0      0    0         0   
4620       0         0           0         0        0      0    0         0   
4621       0         0           0         0        1      0    0         0   
4622       0         1           1         0        1      0    0         0   
4623       0         0           0         0        0      0    0         0   

      Calendar  
0            0  
1            0  


In [62]:
print(test_padded.shape)
print(test_labels.shape)

(4624, 600)
(4624, 9)


In [63]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

predictions = model.predict(test_padded)
print("predictions shape:", predictions.shape)

predictions shape: (4624, 9)


In [64]:
predictions[0:10]

array([[8.36591862e-05, 2.13497265e-06, 8.33477588e-06, 3.78633886e-05,
        1.65141210e-01, 1.16224773e-03, 7.21997964e-08, 1.12417124e-08,
        2.84451346e-07],
       [1.40011743e-05, 4.50364954e-04, 8.22213565e-07, 8.32406877e-05,
        3.17390147e-03, 2.88878544e-03, 4.87750810e-07, 2.28584440e-07,
        7.45186162e-06],
       [1.62858456e-01, 1.81284472e-02, 1.00508975e-02, 8.13456655e-01,
        9.40704286e-01, 5.01726747e-01, 3.26744304e-03, 7.14632159e-04,
        5.05006919e-03],
       [5.18173510e-06, 8.41982604e-04, 1.72889671e-07, 1.95692070e-02,
        4.45868522e-02, 1.15161845e-02, 6.98058557e-06, 2.61238938e-06,
        6.17099504e-05],
       [2.23437644e-04, 1.16761810e-04, 3.49048186e-08, 2.01755036e-02,
        1.25137228e-03, 1.11485971e-02, 2.45435258e-05, 9.07334197e-06,
        5.87515060e-05],
       [1.42946467e-02, 4.66137222e-04, 2.58008396e-04, 3.93300235e-01,
        8.99538279e-01, 4.96397279e-02, 1.67655147e-04, 2.57652082e-05,
        4.6

In [65]:
true_labels = test_labels.to_numpy()
true_labels[0:10]

array([[0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0]])

In [66]:
np.save("predictions.npy", predictions)
loaded_predictions = np.load("predictions.npy")
print(loaded_predictions)

[[8.36591862e-05 2.13497265e-06 8.33477588e-06 ... 7.21997964e-08
  1.12417124e-08 2.84451346e-07]
 [1.40011743e-05 4.50364954e-04 8.22213565e-07 ... 4.87750810e-07
  2.28584440e-07 7.45186162e-06]
 [1.62858456e-01 1.81284472e-02 1.00508975e-02 ... 3.26744304e-03
  7.14632159e-04 5.05006919e-03]
 ...
 [9.37701046e-01 1.40994610e-02 3.80530268e-01 ... 2.20023561e-03
  3.69404326e-04 3.08303721e-02]
 [1.29595472e-04 9.98853087e-01 9.99597013e-01 ... 2.28435546e-08
  1.84590920e-08 3.28264432e-03]
 [6.35269657e-07 1.04955340e-07 1.50127391e-07 ... 5.61769475e-08
  1.52969086e-08 5.73971010e-06]]


In [67]:
np.save("true_labels.npy", true_labels)
loaded_true_labels = np.load("true_labels.npy")
print(loaded_true_labels)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## 9. Threshold Calculation

In [68]:
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for data simulation
from sklearn.datasets import make_classification     # Create a synthetic dataframe
from sklearn.linear_model import LogisticRegression  # Classification model
from sklearn.model_selection import train_test_split # Split the dataframe
from sklearn.metrics import roc_curve                # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve   # Calculate the Precision-Recall curve
from sklearn.metrics import f1_score                 # Calculate the F-score
# Import module for data visualization
from plotnine import *
import plotnine

In [69]:
thresh_f = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
thresh_roc = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [70]:
n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("predictions.npy")
  true_labels = np.load("true_labels.npy")

  # Array for finding the optimal threshold
  thresholds = np.arange(0.0, 1.0, 0.0001)
  fscore = np.zeros(shape=(len(thresholds)))
  print('Length of sequence: {}'.format(len(thresholds)))

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Fit the model
  for index, elem in enumerate(thresholds):
    # Corrected probabilities
    y_pred_prob = (pred > elem).astype('int')
    # Calculate the f-score
    fscore[index] = f1_score(labels, y_pred_prob)

  # Find the optimal threshold
  index = np.argmax(fscore)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  fscoreOpt = round(fscore[index], ndigits = 4)
  thresh_f[i] = thresholdOpt
  print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, fscoreOpt))

print("-------------------------------------")
print("optimal threshold tuning for f-score")
print(thresh_f)

permission  0
Length of sequence: 10000
Best Threshold: 0.0762 with F-Score: 0.6904
permission  1
Length of sequence: 10000
Best Threshold: 0.0216 with F-Score: 0.5889
permission  2
Length of sequence: 10000
Best Threshold: 0.0096 with F-Score: 0.5663
permission  3
Length of sequence: 10000
Best Threshold: 0.2847 with F-Score: 0.5302
permission  4
Length of sequence: 10000
Best Threshold: 0.081 with F-Score: 0.7958
permission  5
Length of sequence: 10000
Best Threshold: 0.1319 with F-Score: 0.5107
permission  6
Length of sequence: 10000
Best Threshold: 0.0638 with F-Score: 0.0784
permission  7
Length of sequence: 10000
Best Threshold: 0.1512 with F-Score: 0.2857
permission  8
Length of sequence: 10000
Best Threshold: 0.0749 with F-Score: 0.3758
-------------------------------------
optimal threshold tuning for f-score
[0.0762, 0.0216, 0.0096, 0.2847, 0.081, 0.1319, 0.0638, 0.1512, 0.0749]


In [71]:
##for roc curve with g-mean

n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("predictions.npy")
  true_labels = np.load("true_labels.npy")

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Create the ROC curve
  fpr, tpr, thresholds = roc_curve(labels, pred)

  df_fpr_tpr = pd.DataFrame({'FPR':fpr, 'TPR':tpr, 'Threshold':thresholds})

  # Calculate the G-mean
  gmean = np.sqrt(tpr * (1 - fpr))

  # Find the optimal threshold
  index = np.argmax(gmean)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  gmeanOpt = round(gmean[index], ndigits = 4)
  fprOpt = round(fpr[index], ndigits = 4)
  tprOpt = round(tpr[index], ndigits = 4)

  thresh_roc[i] = thresholdOpt
  print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
  print('FPR: {}, TPR: {}'.format(fprOpt, tprOpt))

print("-------------------------------------")
print("ROC curve with G-mean threshold tuning")
print(thresh_roc)

permission  0
Best Threshold: 0.006399999838322401 with G-Mean: 0.8418
FPR: 0.1498, TPR: 0.8336
permission  1
Best Threshold: 0.002899999963119626 with G-Mean: 0.7797
FPR: 0.1661, TPR: 0.7291
permission  2
Best Threshold: 0.0003000000142492354 with G-Mean: 0.8075
FPR: 0.1712, TPR: 0.7867
permission  3
Best Threshold: 0.03350000083446503 with G-Mean: 0.8066
FPR: 0.2105, TPR: 0.824
permission  4
Best Threshold: 0.30480000376701355 with G-Mean: 0.7843
FPR: 0.1818, TPR: 0.7519
permission  5
Best Threshold: 0.017799999564886093 with G-Mean: 0.752
FPR: 0.2382, TPR: 0.7423
permission  6
Best Threshold: 0.00039999998989515007 with G-Mean: 0.9214
FPR: 0.1511, TPR: 1.0
permission  7
Best Threshold: 0.0003000000142492354 with G-Mean: 0.8667
FPR: 0.0985, TPR: 0.8333
permission  8
Best Threshold: 0.0017999999690800905 with G-Mean: 0.8143
FPR: 0.1595, TPR: 0.7889
-------------------------------------
ROC curve with G-mean threshold tuning
[0.0064, 0.0029, 0.0003, 0.0335, 0.3048, 0.0178, 0.0004, 0.00

## 10. Performance Score

In [None]:
#Fscore micro for different thresholds-

In [72]:
#predictions = np.load("predictions.npy")
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [73]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import f1_score

def f_at_1(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_f[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_f[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_f[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_f[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_f[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_f[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_f[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_f[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_f[8]

   
    acc[0] = f1_score(labels[:, 0], preds_th[:, 0])
    acc[1] = f1_score(labels[:, 1], preds_th[:, 1])
    acc[2] = f1_score(labels[:, 2], preds_th[:, 2])
    acc[3] = f1_score(labels[:, 3], preds_th[:, 3])
    acc[4] = f1_score(labels[:, 4], preds_th[:, 4])
    acc[5] = f1_score(labels[:, 5], preds_th[:, 5])
    acc[6] = f1_score(labels[:, 6], preds_th[:, 6])
    acc[7] = f1_score(labels[:, 7], preds_th[:, 7])
    acc[8] = f1_score(labels[:, 8], preds_th[:, 8])

   #f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    #print(acc)
    return acc

In [74]:
eval_accuracy = f_at_1(predictions, true_labels)

np.save("F1_CV5_N20k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.
print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")
avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 score: {0:.4f}".format(avg_score))

  Camera    : 0.6904
  Location  : 0.5889
  Microphone: 0.5663
  Contacts  : 0.5302
  Storage   : 0.7958
  Phone     : 0.5107
  SMS       : 0.0784
  Call_Log  : 0.2857
  Calendar  : 0.3758

  Average F1 score: 0.4914


In [None]:
#Fscore micro for different thresholds-

In [75]:
#predictions = np.load("predictions.npy")
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [76]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import f1_score

def f1micro_accuracy(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_f[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_f[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_f[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_f[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_f[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_f[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_f[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_f[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_f[8]

    acc[0] = f1_score(labels[:, 0], preds_th[:, 0], average='micro')
    acc[1] = f1_score(labels[:, 1], preds_th[:, 1], average='micro')
    acc[2] = f1_score(labels[:, 2], preds_th[:, 2], average='micro')
    acc[3] = f1_score(labels[:, 3], preds_th[:, 3], average='micro')
    acc[4] = f1_score(labels[:, 4], preds_th[:, 4], average='micro')
    acc[5] = f1_score(labels[:, 5], preds_th[:, 5], average='micro')
    acc[6] = f1_score(labels[:, 6], preds_th[:, 6], average='micro')
    acc[7] = f1_score(labels[:, 7], preds_th[:, 7], average='micro')
    acc[8] = f1_score(labels[:, 8], preds_th[:, 8], average='micro')
    
   #f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    #print(acc)
    return acc

In [77]:
eval_accuracy = f1micro_accuracy(predictions, true_labels)

np.save("F1Mic_CV5_N20k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 (micro) score: {0:.4f}".format(avg_score))

  Camera    : 0.8945
  Location  : 0.8681
  Microphone: 0.9115
  Contacts  : 0.8789
  Storage   : 0.7788
  Phone     : 0.8471
  SMS       : 0.9898
  Call_Log  : 0.9989
  Calendar  : 0.9777

  Average F1 (micro) score: 0.9050


In [None]:
#roc-auc score for different thresholds-

In [78]:
import numpy as np
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [79]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import roc_auc_score

def roc_auc(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_roc[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_roc[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_roc[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_roc[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_roc[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_roc[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_roc[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_roc[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_roc[8]


    acc[0] = roc_auc_score(labels[:, 0], preds_th[:, 0])
    acc[1] = roc_auc_score(labels[:, 1], preds_th[:, 1])
    acc[2] = roc_auc_score(labels[:, 2], preds_th[:, 2])
    acc[3] = roc_auc_score(labels[:, 3], preds_th[:, 3])
    acc[4] = roc_auc_score(labels[:, 4], preds_th[:, 4])
    acc[5] = roc_auc_score(labels[:, 5], preds_th[:, 5])
    acc[6] = roc_auc_score(labels[:, 6], preds_th[:, 6])
    acc[7] = roc_auc_score(labels[:, 7], preds_th[:, 7])
    acc[8] = roc_auc_score(labels[:, 8], preds_th[:, 8])
   

    #print(acc)
    return acc

In [80]:
#roc-auc score

eval_accuracy = roc_auc(predictions, true_labels)

np.save("ROC_CV5_N20k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average ROC_AUC score: {0:.4f}".format(avg_score))

  Camera    : 0.8419
  Location  : 0.7794
  Microphone: 0.8068
  Contacts  : 0.8068
  Storage   : 0.7850
  Phone     : 0.7518
  SMS       : 0.9224
  Call_Log  : 0.8658
  Calendar  : 0.8139

  Average ROC_AUC score: 0.8193


# ACNET performance

#### a) Data Preparation

In [81]:
## drive path
acnet_path = "/content/drive/MyDrive/MetadataCSV/acnet_dataset_preprocess.csv"

In [82]:
df_acnet = pd.read_csv(acnet_path) 
print(df_acnet.shape)

(1417, 14)


In [83]:
df_acnet = df_acnet.dropna(subset=['Clean_Description'])

In [84]:
df_acnet.head()

Unnamed: 0,app_id,description,Storage,Contacts,Location,Camera,Microphone,SMS,Call_Log,Phone,Calendar,Settings,Tasks,Clean_Description
0,0,ROOT is REQUIRED for automatic synchronization...,1,0,1,0,0,0,0,0,0,0,0,root is required for automatic synchronization...
1,1,This app delivers short scriptures containing ...,0,0,0,0,0,1,0,0,0,0,0,this app delivers short scriptures containing ...
2,2,This game is surprisingly simple and very addi...,0,0,0,0,0,0,0,0,0,0,0,this game is surprisingly simple and very addi...
3,3,It is an online RPG game based on LBS location...,0,0,1,0,0,0,0,0,0,0,0,it is an online rpg game based on lbs location...
4,4,Christmas is in the air. Get yourself in the h...,0,1,0,0,0,0,0,0,0,1,1,christmas is in the air. get yourself in the h...


In [85]:
df_acnet["Clean_Description"] = df_acnet["Clean_Description"].map(remove_stopwords)

In [86]:
df_acnet["Clean_Description"] = df_acnet["Clean_Description"].map(lambda x: remove_punct(x))

In [87]:
acnet_labels = df_acnet[target_list]
acnet_input = df_acnet["Clean_Description"]

In [88]:
acnet_sequences = tokenizer.texts_to_sequences(acnet_input)
acnet_padded = pad_sequences(
    acnet_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [89]:
print(acnet_padded.shape)
print(acnet_labels.shape)

(1414, 600)
(1414, 9)


#### b) Get Predictions

In [90]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

predictions = model.predict(acnet_padded)
print("predictions shape:", predictions.shape)

predictions shape: (1414, 9)


In [91]:
acnet_labels = acnet_labels.to_numpy()

In [92]:
np.save("acnet_predictions.npy", predictions)
loaded_predictions = np.load("acnet_predictions.npy")

np.save("acnet_labels.npy", acnet_labels)
loaded_true_labels = np.load("acnet_labels.npy")

#### c) Threshold Calculation

In [93]:
thresh_f = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
thresh_roc = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [94]:
n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("acnet_predictions.npy")
  true_labels = np.load("acnet_labels.npy")

  # Array for finding the optimal threshold
  thresholds = np.arange(0.0, 1.0, 0.0001)
  fscore = np.zeros(shape=(len(thresholds)))
  print('Length of sequence: {}'.format(len(thresholds)))

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Fit the model
  for index, elem in enumerate(thresholds):
    # Corrected probabilities
    y_pred_prob = (pred > elem).astype('int')
    # Calculate the f-score
    fscore[index] = f1_score(labels, y_pred_prob)

  # Find the optimal threshold
  index = np.argmax(fscore)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  fscoreOpt = round(fscore[index], ndigits = 4)
  thresh_f[i] = thresholdOpt
  print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, fscoreOpt))

print("-------------------------------------")
print("optimal threshold tuning for f-score")
print(thresh_f)

permission  0
Length of sequence: 10000
Best Threshold: 0.0698 with F-Score: 0.4479
permission  1
Length of sequence: 10000
Best Threshold: 0.0039 with F-Score: 0.5222
permission  2
Length of sequence: 10000
Best Threshold: 0.0463 with F-Score: 0.5217
permission  3
Length of sequence: 10000
Best Threshold: 0.1402 with F-Score: 0.6073
permission  4
Length of sequence: 10000
Best Threshold: 0.0002 with F-Score: 0.5844
permission  5
Length of sequence: 10000
Best Threshold: 0.5799 with F-Score: 0.3411
permission  6
Length of sequence: 10000
Best Threshold: 0.0008 with F-Score: 0.4964
permission  7
Length of sequence: 10000
Best Threshold: 0.0006 with F-Score: 0.3478
permission  8
Length of sequence: 10000
Best Threshold: 0.0298 with F-Score: 0.5263
-------------------------------------
optimal threshold tuning for f-score
[0.0698, 0.0039, 0.0463, 0.1402, 0.0002, 0.5799, 0.0008, 0.0006, 0.0298]


In [95]:
##for roc curve with g-mean

n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("acnet_predictions.npy")
  true_labels = np.load("acnet_labels.npy")

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Create the ROC curve
  fpr, tpr, thresholds = roc_curve(labels, pred)

  df_fpr_tpr = pd.DataFrame({'FPR':fpr, 'TPR':tpr, 'Threshold':thresholds})

  # Calculate the G-mean
  gmean = np.sqrt(tpr * (1 - fpr))

  # Find the optimal threshold
  index = np.argmax(gmean)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  gmeanOpt = round(gmean[index], ndigits = 4)
  fprOpt = round(fpr[index], ndigits = 4)
  tprOpt = round(tpr[index], ndigits = 4)

  thresh_roc[i] = thresholdOpt
  print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
  print('FPR: {}, TPR: {}'.format(fprOpt, tprOpt))

print("-------------------------------------")
print("ROC curve with G-mean threshold tuning")
print(thresh_roc)

permission  0
Best Threshold: 0.013100000098347664 with G-Mean: 0.6954
FPR: 0.2667, TPR: 0.6594
permission  1
Best Threshold: 0.0019000000320374966 with G-Mean: 0.7199
FPR: 0.3324, TPR: 0.7763
permission  2
Best Threshold: 0.0006000000284984708 with G-Mean: 0.7492
FPR: 0.2583, TPR: 0.7568
permission  3
Best Threshold: 0.14090000092983246 with G-Mean: 0.7151
FPR: 0.2584, TPR: 0.6897
permission  4
Best Threshold: 0.25920000672340393 with G-Mean: 0.5538
FPR: 0.4504, TPR: 0.5581
permission  5
Best Threshold: 0.09239999949932098 with G-Mean: 0.7067
FPR: 0.2792, TPR: 0.693
permission  6
Best Threshold: 0.00039999998989515007 with G-Mean: 0.7614
FPR: 0.2271, TPR: 0.75
permission  7
Best Threshold: 9.999999747378752e-05 with G-Mean: 0.7692
FPR: 0.2527, TPR: 0.7917
permission  8
Best Threshold: 0.010700000450015068 with G-Mean: 0.8577
FPR: 0.1206, TPR: 0.8365
-------------------------------------
ROC curve with G-mean threshold tuning
[0.0131, 0.0019, 0.0006, 0.1409, 0.2592, 0.0924, 0.0004, 1e-

#### d) Accuracy Score

In [None]:
#Fscore micro for different thresholds-

In [96]:
#predictions = np.load("predictions.npy")
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [97]:
eval_accuracy = f_at_1(predictions, true_labels)

np.save("ACNET_F1_CV5_N20k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.
print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")
avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 score: {0:.4f}".format(avg_score))

  Camera    : 0.4479
  Location  : 0.5222
  Microphone: 0.5217
  Contacts  : 0.6073
  Storage   : 0.5844
  Phone     : 0.3411
  SMS       : 0.4964
  Call_Log  : 0.3478
  Calendar  : 0.5263

  Average F1 score: 0.4884


In [None]:
#Fscore micro for different thresholds-

In [98]:
#predictions = np.load("predictions.npy")
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [99]:
eval_accuracy = f1micro_accuracy(predictions, true_labels)

np.save("ACNET_F1Mic_CV5_N20k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 (micro) score: {0:.4f}".format(avg_score))

  Camera    : 0.8013
  Location  : 0.7256
  Microphone: 0.9066
  Contacts  : 0.7256
  Storage   : 0.4257
  Phone     : 0.8798
  SMS       : 0.8006
  Call_Log  : 0.8409
  Calendar  : 0.9173

  Average F1 (micro) score: 0.7804


In [None]:
#roc-auc score for different thresholds-

In [100]:
import numpy as np
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [101]:
#roc-auc score

eval_accuracy = roc_auc(predictions, true_labels)

np.save("ACNET_ROC_CV5_N20k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average ROC_AUC score: {0:.4f}".format(avg_score))

  Camera    : 0.6964
  Location  : 0.7219
  Microphone: 0.7492
  Contacts  : 0.7156
  Storage   : 0.5530
  Phone     : 0.7025
  SMS       : 0.7598
  Call_Log  : 0.7566
  Calendar  : 0.8532

  Average ROC_AUC score: 0.7231
