Cross Validation 2

size =  72684

test size 4k+

epoch-5

apply sigmoid (prediction probabilities are logits)

groups-9

target_list = ['Camera', 'Location', 'Microphone', 'Contacts', 'Storage', 'Phone', 'SMS', 'Call_Log', 'Calendar']

threshold-tuning = yes

df_2k = df[(df['Rating'] >= 4.0) & (df['Maximum_Installs'] >= 20000)]

accuracy score : F1 score (micro) and ROC_AUC score

## 1. Setup

In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
print(tf.__version__)

2.8.0


In [3]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


## 2. Load Dataset

In [4]:
## import libraries

import itertools
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from sklearn import preprocessing
%matplotlib inline

In [5]:
## uploading csv files on drive (to avoid uploading on colab in every session)

from google.colab import drive
drive.mount("/content/drive/")

## drive path
train_path = "/content/drive/MyDrive/MetadataCSV/data_72684/CV_df_train_2.csv"
val_path = "/content/drive/MyDrive/MetadataCSV/data_72684/CV_df_val_2.csv"
test_path = "/content/drive/MyDrive/MetadataCSV/test_dataset.csv"

Mounted at /content/drive/


In [6]:
df_train = pd.read_csv(train_path) 
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(58147, 23)
(14537, 23)
(4624, 23)


In [7]:
df_train.head(2)

Unnamed: 0,App_Name,App_Id,Category,Rating,Maximum_Installs,Editors_Choice,Description,Privacy_Policy,Sensors,Camera,...,Contacts,SMS,Storage,Phone,Get_Accounts,Call_Log,desc_length,Clean_Description,clean_desc_length,token_length
0,Face Chat,com.facechat.live,Social,4.1,7306795,False,Enjoy chatting? Social and Video Chat on Face ...,https://sites.google.com/view/xender-chat-term...,0,1,...,0,0,1,1,0,0,1090,enjoy chatting? social and video chat on face ...,1025,197
1,Running - Calorie Counter,com.sdgcode.runningcaloriecounter,Health & Fitness,4.1,286133,False,"Running Counter - Calorie Counter app, your wa...",http://app.sdgcode.com/privacy-policy/,0,0,...,0,0,0,0,0,0,845,"running counter calorie counter app, your way ...",816,162


In [8]:
target_list = ['Camera', 'Location', 'Microphone', 'Contacts', 'Storage', 'Phone', 'SMS', 'Call_Log', 'Calendar']

In [9]:
# getting number of nonzeros in each column
df_train[target_list].astype(bool).sum(axis=0)

Camera         9208
Location       9278
Microphone     5391
Contacts       6381
Storage       28434
Phone          9009
SMS             285
Call_Log        180
Calendar       1031
dtype: int64

In [10]:
df_val[target_list].astype(bool).sum(axis=0)

Camera        2329
Location      2244
Microphone    1395
Contacts      1553
Storage       7174
Phone         2227
SMS             67
Call_Log        47
Calendar       264
dtype: int64

In [11]:
df_test[target_list].astype(bool).sum(axis=0)

Camera         745
Location       716
Microphone     436
Contacts       500
Storage       2402
Phone          652
SMS             11
Call_Log         6
Calendar        90
dtype: int64

## 3. Data Preprocess

#### 3.1 Clean Text

In [12]:
import nltk

nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]

    return " ".join(text)

In [14]:
df_train["Clean_Description"] = df_train["Clean_Description"].map(remove_stopwords)
df_val["Clean_Description"] = df_val["Clean_Description"].map(remove_stopwords)
df_test["Clean_Description"] = df_test["Clean_Description"].map(remove_stopwords)

In [15]:
import string

def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

In [16]:
df_train["Clean_Description"] = df_train["Clean_Description"].map(lambda x: remove_punct(x))
df_val["Clean_Description"] = df_val["Clean_Description"].map(lambda x: remove_punct(x))
df_test["Clean_Description"] = df_test["Clean_Description"].map(lambda x: remove_punct(x))

#### 3.2 Create Corpus

In [17]:
from keras.layers import *

In [18]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [19]:
from nltk.tokenize import word_tokenize
import nltk


def create_corpus_tk(df):
    corpus = []
    for text in df["Clean_Description"]:
        words = [word.lower() for word in word_tokenize(text)]
        corpus.append(words)
    return corpus

In [20]:
df_T = df_train.append(df_val)
df_T.reset_index(drop=True,inplace=True)

In [21]:
corpus = create_corpus_tk(df_T)

In [22]:
num_words = len(corpus)
print(num_words)

72684


In [23]:
corpus[0]

['enjoy',
 'chatting',
 'social',
 'video',
 'chat',
 'face',
 'chat',
 'face',
 'chat',
 'opens',
 'new',
 'doors',
 'video',
 'chatting',
 'people',
 'start',
 'chatting',
 'text',
 'audio',
 'video',
 'options',
 'anytime',
 'anywhere',
 'novel',
 'fun',
 'features',
 'waiting',
 'discover',
 'enjoy',
 'time',
 'face',
 'chat',
 'now',
 'face',
 'chat',
 'chat',
 'people',
 'social',
 'friends',
 'real',
 'time',
 'translation',
 'private',
 'secure',
 'video',
 'chat',
 'quick',
 'call',
 'easy',
 'login',
 'instant',
 'messages',
 'im',
 'beauty',
 'effect',
 'vip',
 'membership',
 'benefits',
 'unlock',
 'text',
 'photos',
 'audio',
 'messages',
 'enjoy',
 'app',
 'without',
 'ads',
 'recommended',
 'users',
 'vip',
 'member',
 'unlock',
 'instant',
 'video',
 'audio',
 'calls',
 'feature',
 'video',
 'audio',
 'chat',
 'users',
 'purchasing',
 'gems',
 'gems',
 'benefits',
 'gems',
 'used',
 'video',
 'audio',
 'call',
 'gems',
 'used',
 'buy',
 'different',
 'gifts',
 'privacy'

##4. Train Validation Split

In [None]:
# split the data into a training set and a validation set

In [24]:
train_inputs = df_train["Clean_Description"]
validation_inputs = df_val["Clean_Description"]
train_labels = df_train[target_list]
validation_labels = df_val[target_list]

## 5. Tokenization

In [25]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_inputs)

In [26]:
MAX_SEQUENCE_LENGTH = 600

In [27]:
train_sequences = tokenizer.texts_to_sequences(train_inputs)

In [28]:
train_inputs

0        enjoy chatting social video chat face chat fac...
1        running counter calorie counter app way perfec...
2        fake mask neon angel one fake mask neon angel ...
3        live wallpaper app shirokuma days  set app hom...
4        control airsoft gun smartphone tablet android ...
                               ...                        
58142    free messages sms messages instant messaging a...
58143    best online study app competitive exams rasrps...
58144    anime aesthetic wallpapers cool app brings bes...
58145    report corruption report corruption initiative...
58146    speak app designed aid visually impaired other...
Name: Clean_Description, Length: 58147, dtype: object

In [29]:
train_sequences

[[46,
  1303,
  122,
  36,
  165,
  429,
  165,
  429,
  165,
  4217,
  9,
  5320,
  36,
  1303,
  113,
  119,
  1303,
  50,
  241,
  36,
  294,
  511,
  289,
  5665,
  79,
  10,
  720,
  512,
  46,
  12,
  429,
  165,
  207,
  429,
  165,
  165,
  113,
  122,
  45,
  111,
  12,
  360,
  365,
  262,
  36,
  165,
  239,
  84,
  22,
  969,
  529,
  175,
  2968,
  765,
  448,
  3825,
  2158,
  833,
  532,
  50,
  106,
  241,
  175,
  46,
  1,
  101,
  327,
  1368,
  94,
  3825,
  1339,
  532,
  529,
  36,
  241,
  344,
  124,
  36,
  241,
  165,
  94,
  3147,
  4814,
  4814,
  833,
  4814,
  95,
  36,
  241,
  84,
  4814,
  95,
  462,
  53,
  1691,
  179,
  535,
  26371,
  41,
  2092,
  54,
  200,
  627,
  38,
  140,
  6737,
  5208,
  2601,
  54,
  4815,
  17626,
  15471,
  7072,
  11863,
  113,
  1279,
  8763],
 [680,
  1071,
  2482,
  1071,
  1,
  66,
  199,
  323,
  1293,
  766,
  530,
  595,
  453,
  680,
  2482,
  1071,
  2,
  680,
  1,
  54,
  505,
  41,
  418,
  2330,
  323,
  530,

In [30]:
train_padded = pad_sequences(
    train_sequences, maxlen=MAX_SEQUENCE_LENGTH, truncating="post", padding="post"
)

In [31]:
train_padded

array([[   46,  1303,   122, ...,     0,     0,     0],
       [  680,  1071,  2482, ...,     0,     0,     0],
       [  894,  2379,   596, ...,     0,     0,     0],
       ...,
       [  540,  2187,    13, ...,     0,     0,     0],
       [  747, 14131,   747, ...,     0,     0,     0],
       [  655,     1,   154, ...,     0,     0,     0]], dtype=int32)

In [32]:
validation_sequences = tokenizer.texts_to_sequences(validation_inputs)
validation_padded = pad_sequences(
    validation_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [33]:
validation_padded

array([[   49,    33,  1802, ...,     0,     0,     0],
       [14732,   427,  1030, ...,     0,     0,     0],
       [ 3604,    78,  2682, ...,     0,     0,     0],
       ...,
       [67733,   261,   474, ...,     0,     0,     0],
       [  170,   224, 25455, ...,     0,     0,     0],
       [   25,    91,  1508, ...,     0,     0,     0]], dtype=int32)

In [34]:
print(df_train.Clean_Description[0])
print(train_sequences[0])

enjoy chatting social video chat face chat face chat opens new doors video chatting people start chatting text audio video options anytime anywhere novel fun features waiting discover enjoy time face chat now face chat chat people social friends real time translation private secure video chat quick call easy login instant messages im beauty effect vip membership benefits unlock text photos audio messages enjoy app without ads recommended users vip member unlock instant video audio calls feature video audio chat users purchasing gems gems benefits gems used video audio call gems used buy different gifts privacy policy facechat need responsible information provide party please take caution delivering sensitive information sexual pornographic nude behaviors forbidden people rules banned
[46, 1303, 122, 36, 165, 429, 165, 429, 165, 4217, 9, 5320, 36, 1303, 113, 119, 1303, 50, 241, 36, 294, 511, 289, 5665, 79, 10, 720, 512, 46, 12, 429, 165, 207, 429, 165, 165, 113, 122, 45, 111, 12, 360, 3

In [35]:
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))

Number of unique words: 141114


In [36]:
word_index

{'app': 1,
 'free': 2,
 'use': 3,
 'keyboard': 4,
 'phone': 5,
 'wallpaper': 6,
 'theme': 7,
 'application': 8,
 'new': 9,
 'features': 10,
 'get': 11,
 'time': 12,
 'wallpapers': 13,
 'screen': 14,
 'also': 15,
 'best': 16,
 'like': 17,
 'one': 18,
 'make': 19,
 'download': 20,
 'android': 21,
 'easy': 22,
 'us': 23,
 'english': 24,
 'live': 25,
 'find': 26,
 'share': 27,
 'launcher': 28,
 'help': 29,
 'mobile': 30,
 'learn': 31,
 'love': 32,
 'apps': 33,
 'set': 34,
 'device': 35,
 'video': 36,
 'want': 37,
 'please': 38,
 'using': 39,
 'hd': 40,
 'need': 41,
 'support': 42,
 'game': 43,
 'home': 44,
 'friends': 45,
 'enjoy': 46,
 'images': 47,
 'simple': 48,
 'many': 49,
 'text': 50,
 'themes': 51,
 'language': 52,
 'different': 53,
 'information': 54,
 'games': 55,
 'available': 56,
 'day': 57,
 'data': 58,
 'access': 59,
 'beautiful': 60,
 'play': 61,
 'choose': 62,
 'easily': 63,
 'add': 64,
 'lock': 65,
 'way': 66,
 'create': 67,
 'you': 68,
 'every': 69,
 'words': 70,
 'save': 

In [37]:
word_index["reason"]

2433

In [38]:
print(validation_sequences[0])

[49, 33, 1802, 1818, 857, 15, 2146, 872, 313, 1, 1074, 53, 345, 1198, 872, 408, 122, 603, 74, 1, 1074, 53, 131, 63, 598, 1, 1, 146, 1, 1067, 75, 1, 49, 10, 1067, 63, 598, 52, 1067, 15, 96, 49, 518, 313, 15, 27, 49, 283, 45, 1, 49, 63128, 58, 228, 7747, 2094, 58, 22, 598, 1, 872, 122, 603, 1818, 663, 1, 474, 146, 1, 22, 3, 21, 30, 1, 42, 21, 107, 1067, 22, 96, 559, 1, 474, 2, 1, 1067, 15, 96, 49, 857, 122, 603, 978, 453, 1818, 313, 872, 408, 29, 1109, 785, 465, 69, 663, 2, 96, 1, 46, 2278, 10, 26, 2934, 489, 674, 2189, 22, 380, 294, 88, 2586, 534, 30, 5, 1326, 636, 1807, 857, 1067, 63, 4513, 2934, 49, 674, 1818, 313, 22, 2253, 674, 677, 1067, 63, 96, 663, 1, 1067, 4489, 96, 559, 1, 49, 489, 56, 1, 489, 836, 2462, 228, 489, 6293, 489, 1380, 10066, 489, 14667, 12884, 489, 2383, 1645, 840, 489, 113, 16749, 489, 38669, 630, 489, 50101, 3495, 3925, 755, 489, 888, 10646, 5755, 489, 812, 11477, 16304, 489, 1420, 92, 1118, 2329, 489, 441, 1232, 2235, 489, 371, 4458, 7747, 489, 427, 3628, 489, 1

In [39]:
word_index["listen"]

422

## 6. Create the Embedding dictionary

In [40]:
embedding_dict = {}
with open("/content/drive/MyDrive/glove6B/glove.6B.300d.txt", "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
f.close()

In [41]:
embedding_dict

Output hidden; open in https://colab.research.google.com to view.

In [42]:
embedding_dim = 300

In [43]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i < num_words:
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i] = emb_vec

In [44]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.73247999, -0.078309  ,  0.018521  , ...,  0.21988   ,
        -0.13121   ,  0.043819  ],
       [-0.30414   , -0.37029999, -0.15881   , ..., -0.37830999,
        -0.42910001,  0.0030023 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.50379997, -0.096152  , -0.13287   , ...,  0.49906999,
        -0.53661001,  0.42375001]])

In [45]:
word_index["reason"]

2433

In [46]:
embedding_dict.get("reason")

array([ 1.3197e-01, -1.2591e-01,  4.3864e-02,  3.6321e-02,  9.6646e-02,
       -1.3829e-01,  3.8637e-01,  7.6962e-02, -1.1306e-01, -1.6083e+00,
        2.0062e-02, -5.2665e-02, -1.6597e-01,  1.2171e-01,  2.8945e-01,
       -1.7289e-01,  5.3035e-02, -2.7842e-01,  8.2376e-02, -1.1980e-02,
        3.7228e-02,  2.1867e-01,  1.5267e-01, -8.4361e-02, -3.1292e-01,
       -3.2093e-02,  2.0281e-01, -3.5910e-01,  1.6873e-02, -2.2996e-01,
        2.6044e-02,  3.5910e-01, -3.2431e-01, -5.4194e-01, -9.7742e-01,
        3.9198e-02, -1.7794e-01,  7.4200e-02, -4.1251e-02, -7.8917e-02,
        2.0646e-01, -6.6538e-02,  6.7401e-02,  1.4965e-01,  5.9107e-02,
       -3.7585e-02, -3.4672e-02,  5.5291e-02, -8.5636e-02,  9.1743e-02,
        4.9125e-01,  7.5606e-03, -3.0860e-01,  5.8902e-04, -8.6975e-02,
        3.9904e-01, -1.2695e-01,  2.2471e-01,  2.3658e-01,  3.0489e-01,
       -6.7363e-02,  3.5839e-01,  4.9703e-01,  4.1895e-01, -3.8494e-01,
       -2.6257e-01,  1.6049e-01, -1.0992e-01,  2.7477e-02,  1.49

In [47]:
(embedding_matrix[2363] == embedding_dict.get("reason")).all()

False

In [48]:
print(train_padded.shape)
print(train_labels.shape)

(58147, 600)
(58147, 9)


In [49]:
print(validation_padded.shape)
print(validation_labels.shape)

(14537, 600)
(14537, 9)


## 7. Build CNN Model

In [50]:
num_permissions = 9 #11
drop = 0.2

batch_size = 32
max_train_epochs = 300
validation_split = 6

early_stopping_patience = 16
early_stopping_delta = 0.02  # 2%

max_description_embeddings = 600
embedding_dim = 300  # +1 for flag

#downloaded_embedding_file = data_folder + "/word_embeddings/glove.6B.300d.txt"


conv_filters_num = 1024
conv_filters_sizes = [1, 2, 3]
dense_layers = [5000, 2500]
dropout = 0.2

heatmap_threshold = 0.49

In [51]:
# Function for class weights

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer


def generate_class_weights(class_series, multi_class=True, one_hot_encoded=False):
  """
  Method to generate class weights given a set of multi-class or multi-label labels, both one-hot-encoded or not.
  Some examples of different formats of class_series and their outputs are:
    - generate_class_weights(['mango', 'lemon', 'banana', 'mango'], multi_class=True, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 1.3333333333333333, 'mango': 0.6666666666666666}
    - generate_class_weights([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]], multi_class=True, one_hot_encoded=True)
    {0: 0.6666666666666666, 1: 1.3333333333333333, 2: 1.3333333333333333}
    - generate_class_weights([['mango', 'lemon'], ['mango'], ['lemon', 'banana'], ['lemon']], multi_class=False, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 0.4444444444444444, 'mango': 0.6666666666666666}
    - generate_class_weights([[0, 1, 1], [0, 0, 1], [1, 1, 0], [0, 1, 0]], multi_class=False, one_hot_encoded=True)
    {0: 1.3333333333333333, 1: 0.4444444444444444, 2: 0.6666666666666666}
  The output is a dictionary in the format { class_label: class_weight }. In case the input is one hot encoded, the class_label would be index
  of appareance of the label when the dataset was processed. 
  In multi_class this is np.unique(class_series) and in multi-label np.unique(np.concatenate(class_series)).
  Author: Angel Igareta (angel@igareta.com)
  """
  if multi_class:
    # If class is one hot encoded, transform to categorical labels to use compute_class_weight   
    if one_hot_encoded:
      class_series = np.argmax(class_series, axis=1)
  
    # Compute class weights with sklearn method
    class_labels = np.unique(class_series)
    class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=class_series)

    print(class_series)
    print(class_labels)

    return dict(zip(class_labels, class_weights))
  else:
    # It is neccessary that the multi-label values are one-hot encoded
    mlb = None
    if not one_hot_encoded:
      mlb = MultiLabelBinarizer()
      class_series = mlb.fit_transform(class_series)

    n_samples = len(class_series)
    n_classes = len(class_series[0])
    print(n_samples)
    print(n_classes)

    # Count each class frequency
    class_count = [0] * n_classes
    for classes in class_series:
        for index in range(n_classes):
            if classes[index] != 0:
                class_count[index] += 1
    
    # Compute class weights using balanced method
    class_weights = [n_samples / (n_classes * freq) if freq > 0 else 1 for freq in class_count]
    class_labels = range(len(class_weights)) if mlb is None else mlb.classes_
    return dict(zip(class_labels, class_weights))
    #return class_weights

In [52]:
class_series = np.array(train_labels)
class_wt = generate_class_weights(class_series, multi_class=False, one_hot_encoded=True)
print(class_wt)
#class_wt = torch.tensor(class_wt)
#print(class_wt)

58147
9
{0: 0.7016483251279081, 1: 0.696354578333453, 2: 1.1984377254271523, 3: 1.0125023942607394, 4: 0.22722015114925012, 5: 0.7171470504803839, 6: 22.669395711500975, 7: 35.89320987654321, 8: 6.266515788339261}


In [53]:
def model_multiconv_1d(num_permissions):
    #embedding_dim = embedding_dim
    sequence_length = max_description_embeddings

    input_layer = Input(shape=(None,))

    conv_layers = []
    for filter_size in conv_filters_sizes:
        conv_layer_i = Embedding(num_words,
                                 output_dim=embedding_dim,
                                 input_length=sequence_length,
                                 weights=[embedding_matrix],
                                 trainable=False)(input_layer)
        conv_layer_i = Conv1D(filters=conv_filters_num,
                              kernel_size=filter_size,
                              padding='same',
                              activation='relu')(conv_layer_i)
        conv_layer_i = GlobalMaxPooling1D()(conv_layer_i)

        conv_layers.append(conv_layer_i)

    if len(conv_layers) == 1:
        previous_layer = conv_layers[0]
    else:
        concatenated_layer = concatenate(conv_layers, axis=-1)
        previous_layer = concatenated_layer

    for n_neurons in dense_layers:
        previous_layer = Dense(n_neurons, activation='relu')(previous_layer)
        previous_layer = Dropout(dropout)(previous_layer)

    output_layer = Dense(num_permissions, activation='sigmoid')(previous_layer)

    return keras.Model(inputs=input_layer, outputs=output_layer)

In [54]:
model = model_multiconv_1d(num_permissions)

In [55]:
model.compile(loss="binary_crossentropy", optimizer=Adam(0.0001), metrics=['accuracy'])
              
#metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])
train_metric = 'val_fb_macro'

In [56]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    42334500    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 300)    42334500    ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 300)    42334500    ['input_1[0][0]']                
                                                                                              

In [57]:
checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')

model.fit(train_padded, train_labels, 
          batch_size=batch_size, epochs=10, class_weight=class_wt,
          verbose=1, callbacks=[checkpoint], validation_data=(validation_padded, validation_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6eafb7e5d0>

## 8. Performance on Test Set

In [58]:
test_labels = df_test[target_list]
test_input = df_test["Clean_Description"]

In [59]:
test_sequences = tokenizer.texts_to_sequences(test_input)
test_padded = pad_sequences(
    test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [60]:
print(df_test.Clean_Description[0])
print(test_sequences[0])

poetry lover find best poetry poets urdu problem solved app app contains time best urdu poetry famous urdu poets offline urdu poetry collection best urdu poetry famous urdu poets like mohsin naqvi mirza ghalib saghir siddiqui others also contains romantic urdu poetry sad urdu poetry urdu ghazals offline urdu shayari urdu shayari app contains best ever poetry famous poets times app contains best variety urdu poetry categories romantic poetry sad poetry attitude poetry ghazals inspirational poetry main features offline poetry collection gallery view poetry collection zooming feature save share poetry images app contains offline poetry urdu ghazals urdu poetry poetry famous poets urdu shayari much more
[2956, 1897, 26, 16, 2956, 10498, 421, 677, 2377, 1, 1, 210, 12, 16, 421, 2956, 1021, 421, 10498, 146, 421, 2956, 133, 16, 421, 2956, 1021, 421, 10498, 17, 34985, 40703, 21588, 19918, 30568, 345, 15, 210, 550, 421, 2956, 1565, 421, 2956, 421, 10757, 146, 421, 693, 421, 693, 1, 210, 16, 505,

In [61]:
print(test_labels)

      Camera  Location  Microphone  Contacts  Storage  Phone  SMS  Call_Log  \
0          0         0           0         0        1      0    0         0   
1          0         0           0         0        1      0    0         0   
2          0         0           0         0        1      0    0         0   
3          0         0           0         0        0      0    0         0   
4          1         0           0         0        0      1    0         0   
...      ...       ...         ...       ...      ...    ...  ...       ...   
4619       0         1           0         0        0      0    0         0   
4620       0         0           0         0        0      0    0         0   
4621       0         0           0         0        1      0    0         0   
4622       0         1           1         0        1      0    0         0   
4623       0         0           0         0        0      0    0         0   

      Calendar  
0            0  
1            0  


In [62]:
print(test_padded.shape)
print(test_labels.shape)

(4624, 600)
(4624, 9)


In [63]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

predictions = model.predict(test_padded)
print("predictions shape:", predictions.shape)

predictions shape: (4624, 9)


In [64]:
predictions[0:10]

array([[5.7202824e-03, 1.8037152e-02, 2.0407684e-05, 5.0457144e-05,
        2.7416474e-01, 9.2067011e-04, 1.3020542e-06, 2.8210216e-09,
        1.1649060e-05],
       [1.6500520e-06, 2.0397125e-02, 2.3450746e-06, 6.6695031e-07,
        3.2727772e-03, 6.9467648e-04, 3.3186120e-07, 1.1034722e-10,
        1.8096065e-05],
       [9.4358402e-01, 8.6665398e-01, 2.7437830e-01, 2.9938197e-01,
        8.3176017e-01, 8.4606487e-01, 1.8744588e-02, 9.6273078e-03,
        3.4061873e-01],
       [2.5189717e-04, 9.2492945e-02, 1.4261232e-06, 3.0694690e-02,
        5.0026917e-01, 3.7909558e-01, 9.9650832e-05, 5.4986263e-06,
        2.0249673e-04],
       [1.2782381e-01, 2.4899852e-02, 5.2688946e-04, 1.7006861e-03,
        7.3480415e-03, 2.4223243e-01, 1.4420967e-03, 4.2021198e-05,
        2.5583715e-03],
       [8.9605087e-01, 6.3421600e-03, 1.7767450e-03, 1.4800742e-01,
        9.9195975e-01, 6.3049041e-02, 1.5946837e-04, 8.8739862e-06,
        1.7677834e-04],
       [5.0271577e-01, 8.7843484e-01, 2.

In [65]:
true_labels = test_labels.to_numpy()
true_labels[0:10]

array([[0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0]])

In [66]:
np.save("predictions.npy", predictions)
loaded_predictions = np.load("predictions.npy")
print(loaded_predictions)

[[5.7202824e-03 1.8037152e-02 2.0407684e-05 ... 1.3020542e-06
  2.8210216e-09 1.1649060e-05]
 [1.6500520e-06 2.0397125e-02 2.3450746e-06 ... 3.3186120e-07
  1.1034722e-10 1.8096065e-05]
 [9.4358402e-01 8.6665398e-01 2.7437830e-01 ... 1.8744588e-02
  9.6273078e-03 3.4061873e-01]
 ...
 [3.4470248e-01 8.5247403e-01 5.0140461e-03 ... 7.0642233e-03
  2.0822354e-03 2.0842005e-02]
 [6.3644242e-05 9.9998331e-01 9.9969113e-01 ... 9.3254718e-12
  8.8345650e-13 2.1768876e-03]
 [5.5180550e-05 1.8961234e-04 1.0235621e-07 ... 4.2422831e-05
  8.5925969e-09 8.6435175e-06]]


In [67]:
np.save("true_labels.npy", true_labels)
loaded_true_labels = np.load("true_labels.npy")
print(loaded_true_labels)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## 9. Threshold Calculation

In [68]:
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for data simulation
from sklearn.datasets import make_classification     # Create a synthetic dataframe
from sklearn.linear_model import LogisticRegression  # Classification model
from sklearn.model_selection import train_test_split # Split the dataframe
from sklearn.metrics import roc_curve                # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve   # Calculate the Precision-Recall curve
from sklearn.metrics import f1_score                 # Calculate the F-score
# Import module for data visualization
from plotnine import *
import plotnine

In [69]:
thresh_f = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
thresh_roc = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [70]:
n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("predictions.npy")
  true_labels = np.load("true_labels.npy")

  # Array for finding the optimal threshold
  thresholds = np.arange(0.0, 1.0, 0.0001)
  fscore = np.zeros(shape=(len(thresholds)))
  print('Length of sequence: {}'.format(len(thresholds)))

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Fit the model
  for index, elem in enumerate(thresholds):
    # Corrected probabilities
    y_pred_prob = (pred > elem).astype('int')
    # Calculate the f-score
    fscore[index] = f1_score(labels, y_pred_prob)

  # Find the optimal threshold
  index = np.argmax(fscore)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  fscoreOpt = round(fscore[index], ndigits = 4)
  thresh_f[i] = thresholdOpt
  print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, fscoreOpt))

print("-------------------------------------")
print("optimal threshold tuning for f-score")
print(thresh_f)

permission  0
Length of sequence: 10000
Best Threshold: 0.6865 with F-Score: 0.6804
permission  1
Length of sequence: 10000
Best Threshold: 0.7485 with F-Score: 0.5867
permission  2
Length of sequence: 10000
Best Threshold: 0.288 with F-Score: 0.5795
permission  3
Length of sequence: 10000
Best Threshold: 0.1202 with F-Score: 0.5213
permission  4
Length of sequence: 10000
Best Threshold: 0.1626 with F-Score: 0.7981
permission  5
Length of sequence: 10000
Best Threshold: 0.4912 with F-Score: 0.5339
permission  6
Length of sequence: 10000
Best Threshold: 0.0646 with F-Score: 0.1026
permission  7
Length of sequence: 10000
Best Threshold: 0.3957 with F-Score: 0.4444
permission  8
Length of sequence: 10000
Best Threshold: 0.2119 with F-Score: 0.3575
-------------------------------------
optimal threshold tuning for f-score
[0.6865, 0.7485, 0.288, 0.1202, 0.1626, 0.4912, 0.0646, 0.3957, 0.2119]


In [71]:
##for roc curve with g-mean

n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("predictions.npy")
  true_labels = np.load("true_labels.npy")

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Create the ROC curve
  fpr, tpr, thresholds = roc_curve(labels, pred)

  df_fpr_tpr = pd.DataFrame({'FPR':fpr, 'TPR':tpr, 'Threshold':thresholds})

  # Calculate the G-mean
  gmean = np.sqrt(tpr * (1 - fpr))

  # Find the optimal threshold
  index = np.argmax(gmean)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  gmeanOpt = round(gmean[index], ndigits = 4)
  fprOpt = round(fpr[index], ndigits = 4)
  tprOpt = round(tpr[index], ndigits = 4)

  thresh_roc[i] = thresholdOpt
  print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
  print('FPR: {}, TPR: {}'.format(fprOpt, tprOpt))

print("-------------------------------------")
print("ROC curve with G-mean threshold tuning")
print(thresh_roc)

permission  0
Best Threshold: 0.11699999868869781 with G-Mean: 0.8418
FPR: 0.1513, TPR: 0.8349
permission  1
Best Threshold: 0.1145000010728836 with G-Mean: 0.7678
FPR: 0.2065, TPR: 0.743
permission  2
Best Threshold: 0.002899999963119626 with G-Mean: 0.8172
FPR: 0.1585, TPR: 0.7936
permission  3
Best Threshold: 0.01269999984651804 with G-Mean: 0.7889
FPR: 0.2316, TPR: 0.81
permission  4
Best Threshold: 0.39660000801086426 with G-Mean: 0.7822
FPR: 0.2192, TPR: 0.7835
permission  5
Best Threshold: 0.18170000612735748 with G-Mean: 0.7637
FPR: 0.1788, TPR: 0.7101
permission  6
Best Threshold: 0.004399999976158142 with G-Mean: 0.9072
FPR: 0.0947, TPR: 0.9091
permission  7
Best Threshold: 0.0005000000237487257 with G-Mean: 0.9442
FPR: 0.1085, TPR: 1.0
permission  8
Best Threshold: 0.004900000058114529 with G-Mean: 0.8314
FPR: 0.2025, TPR: 0.8667
-------------------------------------
ROC curve with G-mean threshold tuning
[0.117, 0.1145, 0.0029, 0.0127, 0.3966, 0.1817, 0.0044, 0.0005, 0.0049

## 10. Performance Score

In [None]:
#Fscore micro for different thresholds-

In [72]:
#predictions = np.load("predictions.npy")
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [73]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import f1_score

def f_at_1(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_f[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_f[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_f[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_f[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_f[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_f[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_f[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_f[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_f[8]

   
    acc[0] = f1_score(labels[:, 0], preds_th[:, 0])
    acc[1] = f1_score(labels[:, 1], preds_th[:, 1])
    acc[2] = f1_score(labels[:, 2], preds_th[:, 2])
    acc[3] = f1_score(labels[:, 3], preds_th[:, 3])
    acc[4] = f1_score(labels[:, 4], preds_th[:, 4])
    acc[5] = f1_score(labels[:, 5], preds_th[:, 5])
    acc[6] = f1_score(labels[:, 6], preds_th[:, 6])
    acc[7] = f1_score(labels[:, 7], preds_th[:, 7])
    acc[8] = f1_score(labels[:, 8], preds_th[:, 8])

   #f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    #print(acc)
    return acc

In [74]:
eval_accuracy = f_at_1(predictions, true_labels)

np.save("F1_CV2_N72k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.
print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")
avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 score: {0:.4f}".format(avg_score))

  Camera    : 0.6804
  Location  : 0.5867
  Microphone: 0.5795
  Contacts  : 0.5213
  Storage   : 0.7981
  Phone     : 0.5339
  SMS       : 0.1026
  Call_Log  : 0.4444
  Calendar  : 0.3575

  Average F1 score: 0.5116


In [None]:
#Fscore micro for different thresholds-

In [75]:
#predictions = np.load("predictions.npy")
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [76]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import f1_score

def f1micro_accuracy(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_f[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_f[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_f[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_f[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_f[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_f[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_f[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_f[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_f[8]

    acc[0] = f1_score(labels[:, 0], preds_th[:, 0], average='micro')
    acc[1] = f1_score(labels[:, 1], preds_th[:, 1], average='micro')
    acc[2] = f1_score(labels[:, 2], preds_th[:, 2], average='micro')
    acc[3] = f1_score(labels[:, 3], preds_th[:, 3], average='micro')
    acc[4] = f1_score(labels[:, 4], preds_th[:, 4], average='micro')
    acc[5] = f1_score(labels[:, 5], preds_th[:, 5], average='micro')
    acc[6] = f1_score(labels[:, 6], preds_th[:, 6], average='micro')
    acc[7] = f1_score(labels[:, 7], preds_th[:, 7], average='micro')
    acc[8] = f1_score(labels[:, 8], preds_th[:, 8], average='micro')
    
   #f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    #print(acc)
    return acc

In [77]:
eval_accuracy = f1micro_accuracy(predictions, true_labels)

np.save("F1Mic_CV2_N72k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 (micro) score: {0:.4f}".format(avg_score))

  Camera    : 0.8994
  Location  : 0.8778
  Microphone: 0.9325
  Contacts  : 0.8685
  Storage   : 0.7757
  Phone     : 0.8573
  SMS       : 0.9849
  Call_Log  : 0.9989
  Calendar  : 0.9712

  Average F1 (micro) score: 0.9074


In [None]:
#roc-auc score for different thresholds-

In [78]:
import numpy as np
predictions = np.load("predictions.npy")
true_labels = np.load("true_labels.npy")

In [79]:
# Function to calculate the accuracy of our predictions vs labels

import numpy as np
from sklearn.metrics import roc_auc_score

def roc_auc(preds, labels):
    #print('my_print_2')
    acc = [0, 0 ,0 ,0 ,0 ,0, 0, 0, 0]
    preds_th = preds
    
    preds_th[:, 0] = np.array(preds[:, 0]) >= thresh_roc[0]
    preds_th[:, 1] = np.array(preds[:, 1]) >= thresh_roc[1]
    preds_th[:, 2] = np.array(preds[:, 2]) >= thresh_roc[2]
    preds_th[:, 3] = np.array(preds[:, 3]) >= thresh_roc[3]
    preds_th[:, 4] = np.array(preds[:, 4]) >= thresh_roc[4]
    preds_th[:, 5] = np.array(preds[:, 5]) >= thresh_roc[5]
    preds_th[:, 6] = np.array(preds[:, 6]) >= thresh_roc[6]
    preds_th[:, 7] = np.array(preds[:, 7]) >= thresh_roc[7]
    preds_th[:, 8] = np.array(preds[:, 8]) >= thresh_roc[8]


    acc[0] = roc_auc_score(labels[:, 0], preds_th[:, 0])
    acc[1] = roc_auc_score(labels[:, 1], preds_th[:, 1])
    acc[2] = roc_auc_score(labels[:, 2], preds_th[:, 2])
    acc[3] = roc_auc_score(labels[:, 3], preds_th[:, 3])
    acc[4] = roc_auc_score(labels[:, 4], preds_th[:, 4])
    acc[5] = roc_auc_score(labels[:, 5], preds_th[:, 5])
    acc[6] = roc_auc_score(labels[:, 6], preds_th[:, 6])
    acc[7] = roc_auc_score(labels[:, 7], preds_th[:, 7])
    acc[8] = roc_auc_score(labels[:, 8], preds_th[:, 8])
   

    #print(acc)
    return acc

In [80]:
#roc-auc score

eval_accuracy = roc_auc(predictions, true_labels)

np.save("ROC_CV2_N72k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average ROC_AUC score: {0:.4f}".format(avg_score))

  Camera    : 0.8418
  Location  : 0.7683
  Microphone: 0.8164
  Contacts  : 0.7882
  Storage   : 0.7820
  Phone     : 0.7657
  SMS       : 0.9072
  Call_Log  : 0.8632
  Calendar  : 0.8318

  Average ROC_AUC score: 0.8183


# ACNET performance

#### a) Data Preparation

In [81]:
## drive path
acnet_path = "/content/drive/MyDrive/MetadataCSV/acnet_dataset_preprocess.csv"

In [82]:
df_acnet = pd.read_csv(acnet_path) 
print(df_acnet.shape)

(1417, 14)


In [83]:
df_acnet = df_acnet.dropna(subset=['Clean_Description'])

In [84]:
df_acnet.head()

Unnamed: 0,app_id,description,Storage,Contacts,Location,Camera,Microphone,SMS,Call_Log,Phone,Calendar,Settings,Tasks,Clean_Description
0,0,ROOT is REQUIRED for automatic synchronization...,1,0,1,0,0,0,0,0,0,0,0,root is required for automatic synchronization...
1,1,This app delivers short scriptures containing ...,0,0,0,0,0,1,0,0,0,0,0,this app delivers short scriptures containing ...
2,2,This game is surprisingly simple and very addi...,0,0,0,0,0,0,0,0,0,0,0,this game is surprisingly simple and very addi...
3,3,It is an online RPG game based on LBS location...,0,0,1,0,0,0,0,0,0,0,0,it is an online rpg game based on lbs location...
4,4,Christmas is in the air. Get yourself in the h...,0,1,0,0,0,0,0,0,0,1,1,christmas is in the air. get yourself in the h...


In [85]:
df_acnet["Clean_Description"] = df_acnet["Clean_Description"].map(remove_stopwords)

In [86]:
df_acnet["Clean_Description"] = df_acnet["Clean_Description"].map(lambda x: remove_punct(x))

In [87]:
acnet_labels = df_acnet[target_list]
acnet_input = df_acnet["Clean_Description"]

In [88]:
acnet_sequences = tokenizer.texts_to_sequences(acnet_input)
acnet_padded = pad_sequences(
    acnet_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

In [89]:
print(acnet_padded.shape)
print(acnet_labels.shape)

(1414, 600)
(1414, 9)


#### b) Get Predictions

In [90]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

predictions = model.predict(acnet_padded)
print("predictions shape:", predictions.shape)

predictions shape: (1414, 9)


In [91]:
acnet_labels = acnet_labels.to_numpy()

In [92]:
np.save("acnet_predictions.npy", predictions)
loaded_predictions = np.load("acnet_predictions.npy")

np.save("acnet_labels.npy", acnet_labels)
loaded_true_labels = np.load("acnet_labels.npy")

#### c) Threshold Calculation

In [93]:
thresh_f = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
thresh_roc = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [94]:
n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("acnet_predictions.npy")
  true_labels = np.load("acnet_labels.npy")

  # Array for finding the optimal threshold
  thresholds = np.arange(0.0, 1.0, 0.0001)
  fscore = np.zeros(shape=(len(thresholds)))
  print('Length of sequence: {}'.format(len(thresholds)))

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Fit the model
  for index, elem in enumerate(thresholds):
    # Corrected probabilities
    y_pred_prob = (pred > elem).astype('int')
    # Calculate the f-score
    fscore[index] = f1_score(labels, y_pred_prob)

  # Find the optimal threshold
  index = np.argmax(fscore)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  fscoreOpt = round(fscore[index], ndigits = 4)
  thresh_f[i] = thresholdOpt
  print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, fscoreOpt))

print("-------------------------------------")
print("optimal threshold tuning for f-score")
print(thresh_f)

permission  0
Length of sequence: 10000
Best Threshold: 0.533 with F-Score: 0.4803
permission  1
Length of sequence: 10000
Best Threshold: 0.7207 with F-Score: 0.5537
permission  2
Length of sequence: 10000
Best Threshold: 0.0607 with F-Score: 0.5215
permission  3
Length of sequence: 10000
Best Threshold: 0.0677 with F-Score: 0.5934
permission  4
Length of sequence: 10000
Best Threshold: 0.008 with F-Score: 0.5911
permission  5
Length of sequence: 10000
Best Threshold: 0.853 with F-Score: 0.4
permission  6
Length of sequence: 10000
Best Threshold: 0.0103 with F-Score: 0.568
permission  7
Length of sequence: 10000
Best Threshold: 0.023 with F-Score: 0.3744
permission  8
Length of sequence: 10000
Best Threshold: 0.2478 with F-Score: 0.5263
-------------------------------------
optimal threshold tuning for f-score
[0.533, 0.7207, 0.0607, 0.0677, 0.008, 0.853, 0.0103, 0.023, 0.2478]


In [95]:
##for roc curve with g-mean

n = 9
for i in range(0, n):
  print('permission ',i)

  predictions = np.load("acnet_predictions.npy")
  true_labels = np.load("acnet_labels.npy")

  labels = true_labels[:, i]
  pred = predictions[:, i]

  # Create the ROC curve
  fpr, tpr, thresholds = roc_curve(labels, pred)

  df_fpr_tpr = pd.DataFrame({'FPR':fpr, 'TPR':tpr, 'Threshold':thresholds})

  # Calculate the G-mean
  gmean = np.sqrt(tpr * (1 - fpr))

  # Find the optimal threshold
  index = np.argmax(gmean)
  thresholdOpt = round(thresholds[index], ndigits = 4)
  gmeanOpt = round(gmean[index], ndigits = 4)
  fprOpt = round(fpr[index], ndigits = 4)
  tprOpt = round(tpr[index], ndigits = 4)

  thresh_roc[i] = thresholdOpt
  print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
  print('FPR: {}, TPR: {}'.format(fprOpt, tprOpt))

print("-------------------------------------")
print("ROC curve with G-mean threshold tuning")
print(thresh_roc)

permission  0
Best Threshold: 0.3248000144958496 with G-Mean: 0.7131
FPR: 0.2236, TPR: 0.655
permission  1
Best Threshold: 0.515500009059906 with G-Mean: 0.7231
FPR: 0.2207, TPR: 0.6711
permission  2
Best Threshold: 0.020400000736117363 with G-Mean: 0.7798
FPR: 0.1509, TPR: 0.7162
permission  3
Best Threshold: 0.06539999693632126 with G-Mean: 0.704
FPR: 0.2789, TPR: 0.6874
permission  4
Best Threshold: 0.4374000132083893 with G-Mean: 0.5674
FPR: 0.4648, TPR: 0.6014
permission  5
Best Threshold: 0.6812000274658203 with G-Mean: 0.7727
FPR: 0.18, TPR: 0.7281
permission  6
Best Threshold: 0.0017999999690800905 with G-Mean: 0.7953
FPR: 0.2712, TPR: 0.8679
permission  7
Best Threshold: 0.0003000000142492354 with G-Mean: 0.738
FPR: 0.2838, TPR: 0.7604
permission  8
Best Threshold: 0.04780000075697899 with G-Mean: 0.828
FPR: 0.1611, TPR: 0.8173
-------------------------------------
ROC curve with G-mean threshold tuning
[0.3248, 0.5155, 0.0204, 0.0654, 0.4374, 0.6812, 0.0018, 0.0003, 0.0478]


#### d) Accuracy Score

In [None]:
#Fscore micro for different thresholds-

In [96]:
#predictions = np.load("predictions.npy")
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [97]:
eval_accuracy = f_at_1(predictions, true_labels)

np.save("ACNET_F1_CV2_N72k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.
print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")
avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 score: {0:.4f}".format(avg_score))

  Camera    : 0.4803
  Location  : 0.5537
  Microphone: 0.5215
  Contacts  : 0.5934
  Storage   : 0.5911
  Phone     : 0.4000
  SMS       : 0.5680
  Call_Log  : 0.3744
  Calendar  : 0.5263

  Average F1 score: 0.5121


In [None]:
#Fscore micro for different thresholds-

In [98]:
#predictions = np.load("predictions.npy")
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [99]:
eval_accuracy = f1micro_accuracy(predictions, true_labels)

np.save("ACNET_F1Mic_CV2_N72k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average F1 (micro) score: {0:.4f}".format(avg_score))

  Camera    : 0.7949
  Location  : 0.7914
  Microphone: 0.8741
  Contacts  : 0.7122
  Storage   : 0.4668
  Phone     : 0.8685
  SMS       : 0.8451
  Call_Log  : 0.9031
  Calendar  : 0.9300

  Average F1 (micro) score: 0.7984


In [None]:
#roc-auc score for different thresholds-

In [100]:
import numpy as np
predictions = np.load("acnet_predictions.npy")
true_labels = np.load("acnet_labels.npy")

In [101]:
#roc-auc score

eval_accuracy = roc_auc(predictions, true_labels)

np.save("ACNET_ROC_CV2_N72k_TCNN.npy", eval_accuracy)

# Report the final accuracy for this validation run.

print("  Camera    : {0:.4f}".format(eval_accuracy[0]))
print("  Location  : {0:.4f}".format(eval_accuracy[1]))
print("  Microphone: {0:.4f}".format(eval_accuracy[2]))
print("  Contacts  : {0:.4f}".format(eval_accuracy[3]))
print("  Storage   : {0:.4f}".format(eval_accuracy[4]))
print("  Phone     : {0:.4f}".format(eval_accuracy[5]))
print("  SMS       : {0:.4f}".format(eval_accuracy[6]))
print("  Call_Log  : {0:.4f}".format(eval_accuracy[7]))
print("  Calendar  : {0:.4f}".format(eval_accuracy[8]))

print("")

avg_score = (np.sum(eval_accuracy, dtype = np.float32)) / 9
print("  Average ROC_AUC score: {0:.4f}".format(avg_score))

  Camera    : 0.7135
  Location  : 0.7235
  Microphone: 0.7827
  Contacts  : 0.7031
  Storage   : 0.5683
  Phone     : 0.7696
  SMS       : 0.7960
  Call_Log  : 0.7357
  Calendar  : 0.8233

  Average ROC_AUC score: 0.7351
