# <font style="color:red;">Visualisation of Hybrid Apps Dataset</font>

## <font style="color:blue;">Basic Initialization</font>

In [None]:
#Installing mandatory libraries

#!pip install geonamescache
#!pip install palettable
#!pip install -U textblob
#!pip install cufflinks
#!pip install seaborn
#!pip install plotly
#!pip install -c plotly plotly-orca

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import pandas as pd
import numpy as np
import time
import os
import sklearn
import seaborn as sns
import warnings
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from random import randrange
# Silence every warning category for the whole notebook.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn/matplotlib.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs
np.random.seed(42)
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Default font sizes for axis labels and tick labels of all subsequent figures
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
sns.set_palette(['green','red'])# Replace the seaborn default palette with green, red (in that order)

## <font style="color:blue;">Loading Dataset</font>

In [None]:
# Verifying pathname of dataset before loading
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Show the top-level entries of the relative input folder once (not once per
# file); skipped when the folder does not exist, e.g. outside Kaggle.
if os.path.isdir("../input"):
    print(os.listdir("../input"))

In [None]:
# Load Datasets
def loadDataset(file_name):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str or file-like
        Anything accepted by ``pd.read_csv`` (path, URL, open buffer).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_name)

# Read the CSV and immediately select the expected columns by name, so the
# frame always has exactly these columns in exactly this order.
df = loadDataset("/kaggle/input/hybrid-apps-security-analysis/HybridAppsDataset.csv")[[
    "app_hash",
    "webview_tab",
    "js_enabled",
    "js_inf_defined",
    "acc_sys_call",
    "obf_js_permit",
    "inf_droid_Code_obf",
    "out_url",
    "gsafe_brow",
    "https",
    "js_input_val",
    "web_redirect",
    "js_inf_len",
    "label",
]]


## <font style="color:blue;">Details of Dataset: Tabular</font>

### <I>The Dataset (Training Dataset comprising 78,767 records) is shown below in tabular form. Please note the 13 Attributes/Features in the dataset. The last attribute is the Class Label, with categorical values 'benign' and 'malicious' for Benign and Malicious Hybrid Apps respectively.</I>

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output
df

## <font style="color:blue;">Analysis of Class Label & its Imbalance</font>

### <I>The Class Label for this dataset is given in the last column. It has two values- 'benign' and 'malicious' corresponding to Benign and Malicious Hybrid Apps respectively. On the Google Play Store, Malicious Hybrid Apps are few compared to Benign Apps. This inequality shows in our dataset as well, since it has been scraped from Google Play store or its mirrors. The Class Label and its inequality is visualised and analysed below in detail.</I>

In [None]:
# Class Distribution of Labels: number of rows for each distinct value of 'label'
df.groupby('label').size()

In [None]:
# Analysis of Positives and Negatives in the Dataset
# value_counts() orders its result by frequency, so unpacking it positionally
# would put whichever class is larger into `pos` (and fail unless there are
# exactly two classes). Look the counts up by class name instead.
# NOTE(review): assumes 'malicious' is the positive class — confirm.
label_counts = df['label'].value_counts()
pos = label_counts.get('malicious', 0)
neg = label_counts.get('benign', 0)
total = neg + pos
print ('Total of Samples: %s'% total)
print('Positive: {} ({:.2f}% of total)'.format(pos, 100 * pos / total))
print('Negative: {} ({:.2f}% of total)'.format(neg, 100 * neg / total))

In [None]:
# Bar Plot and Stacked Percentage Plot of Malicious and Benign Hybrid Apps
import matplotlib.ticker as mtick  # percent formatter for the stacked plot's y axis

# Colour of each class, looked up by class name so a class can never end up
# with the other class's colour (unknown classes fall back to grey).
class_colors = {'benign': 'green', 'malicious': 'red'}

fig = plt.figure(figsize = (12,4))
#title = fig.suptitle("Plot of Malicious and Benign Hybrid Apps", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

#Bar Plot: absolute number of apps per class
ax1 = fig.add_subplot(1,2,1)
ax1.set_xlabel("Class Labels",fontsize=18)
ax1.set_ylabel("Number of Hybrid Apps",fontsize=18)
ticklabels = ax1.get_xticklabels() + ax1.get_yticklabels()
for label in ticklabels:
    label.set_fontsize('x-large')
#ax1.title.set_text('BAR PLOT: MALICIOUS & BENIGN Hybrid Apps', fontsize=20)
labels = df['label'].value_counts()
# w = (class names, class counts); the pie-chart cell further down reuses it
w = (list(labels.index), list(labels.values))
ax1.tick_params(axis='both', which='major')
# Tick labels and colours are derived from the class names that belong to the
# counts, instead of assuming the counts arrive in benign, malicious order.
bar = ax1.bar([str(name).capitalize() for name in w[0]], w[1],
              color=[class_colors.get(name, 'grey') for name in w[0]],
              edgecolor='black', linewidth=1)

#Stacked Plot: share of each class in percent, drawn as one stacked bar
ax2 = fig.add_subplot(1,2,2)
ax2.title.set_text('Stack Plot: Malicious & Benign Hybrid Apps')
# One-row frame with one column per class (alphabetical), values in percent
class_share = df['label'].value_counts(normalize=True).sort_index() * 100
class_share.to_frame().T.plot(
    kind='bar', stacked=True, legend=False, ax=ax2,
    # ordered list (a set has no defined order) aligned with the columns
    color=[class_colors.get(name, 'grey') for name in class_share.index],
    linewidth=0.50, ec='k')
ax2.set_xlabel('Benign/Malicious Hybrid Apps')# otherwise the row label of the one-row frame is shown
ax2.set_xticks([])# disable ticks in the x axis
#Fixing Legend: reverse handles AND labels together so the legend reads
#top-to-bottom like the stack while every handle keeps its own class name
current_handles, _ = ax2.get_legend_handles_labels()
correct_labels = [str(name).capitalize() for name in class_share.index]
ax2.legend(current_handles[::-1], correct_labels[::-1])
ax2.yaxis.set_major_formatter(mtick.PercentFormatter())

#Saving the Figs
figc = fig
fig.tight_layout()
#figc.savefig("imgs/hv1/Fig01&02: Bar Plot & Stack Plot of Malicious & Benign Hybrid Apps.png")
extent = ax1.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig01: Bar Plot of Class Labels.png",bbox_inches=extent.expanded(1.5, 1.4))
extent = ax2.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig02: Stack Plot of Class Labels.png",bbox_inches=extent.expanded(1.5, 1.4))

In [None]:
# Pie Chart of Malicious and Benign Hybrid Apps Distribution
# Uses `w` = (class names, class counts) computed in the bar-plot cell above.
fig = plt.figure(figsize = (14,5))
# Slice colour is looked up by class name rather than by slice position, so a
# different ordering of the counts cannot swap the colours.
pie_colors = {'benign': 'green', 'malicious': 'red'}
# Pull every slice except the first slightly out of the pie; sized from the
# data so it always matches the number of slices ([0, 0.1] for two classes).
Explode = [0 if position == 0 else 0.1 for position in range(len(w[1]))]
plt.pie(w[1],explode=Explode,labels=w[0],shadow=False,startangle=45,
        colors=[pie_colors.get(name, 'grey') for name in w[0]],
        autopct='%.2f%%',textprops={'fontsize': 15})
plt.axis('equal')
plt.legend(title='Class Labels of Hybrid Apps',loc='lower right')
#fig.savefig('imgs/hv1/Fig03:Pie Chart Distribution of Class Labels.png')
plt.show()

### <I>As can be seen from the visualisations above, this dataset has significant class imbalance. Hence, during any machine learning process, adequate measures will have to be undertaken to handle or compensate for this imbalance in order to get accurate results.</I>

## <font style="color:blue;">Analysis of 'webview_tab'  Attribute</font>

In [None]:
# Grouped count plot of 'webview_tab': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="webview_tab",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig04: WebviewTabPlot.png")

## <font style="color:blue;">Analysis of 'js_enabled'  Attribute</font>

In [None]:
# Grouped count plot of 'js_enabled': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="js_enabled",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig05: js_enabledPlot.png")

## <font style="color:blue;">Analysis of 'js_inf_defined'  Attribute</font>

In [None]:
# Grouped count plot of 'js_inf_defined': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="js_inf_defined",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig06: js_inf_definedPlot.png")

## <font style="color:blue;">Analysis of 'acc_sys_call'  Attribute</font>

In [None]:
# Grouped count plot of 'acc_sys_call': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="acc_sys_call",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig07: acc_sys_callPlot.png")

## <font style="color:blue;">Analysis of 'obf_js_permit'  Attribute</font>

In [None]:
# Grouped count plot of 'obf_js_permit': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="obf_js_permit",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig09: obf_js_permitPlot.png")

## <font style="color:blue;">Analysis of 'inf_droid_Code_obf'  Attribute</font>

In [None]:
# Grouped count plot of 'inf_droid_Code_obf': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="inf_droid_Code_obf",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig10: inf_droid_Code_obfPlot.png")

## <font style="color:blue;">Analysis of 'out_url'  Attribute</font>

In [None]:
# Grouped count plot of 'out_url': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="out_url",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig11: out_urlPlot.png")

## <font style="color:blue;">Analysis of 'gsafe_brow'  Attribute</font>

In [None]:
# Grouped count plot of 'gsafe_brow': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="gsafe_brow",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig12: gsafe_browPlot.png")

## <font style="color:blue;">Analysis of 'https'  Attribute</font>

In [None]:
# Grouped count plot of 'https': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="https",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig13: httpsPlot.png")

## <font style="color:blue;">Analysis of 'js_input_val'  Attribute</font>

In [None]:
# Grouped count plot of 'js_input_val': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="js_input_val",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig14: js_input_valPlot.png")

## <font style="color:blue;">Analysis of 'web_redirect'  Attribute</font>

In [None]:
# Grouped count plot of 'web_redirect': for each attribute value, one bar per class label
fig = plt.figure(figsize=(6, 4))
cp = sns.countplot(
    data=df,
    x="web_redirect",
    hue="label",
    palette=dict(benign="green", malicious="red"),
)
#fig.savefig("imgs/hv1/Fig15: web_redirectPlot.png")

## <font style="color:blue;">Analysis of 'js_inf_len'  Attribute</font>

In [None]:
# js_inf_len analysis vis-a-vis malicious and benign Hybrid Apps
# Builds one 3x2 figure: per-class histogram and density plot, a combined
# histogram of both classes, and a violin plot of js_inf_len per class.
df_bad=df.loc[df['label']=='malicious']
df_good=df.loc[df['label']=='benign']
list_subplots = [] #List of subplots for replotting in different combinations

fig = plt.figure(figsize =(10,10))
title = fig.suptitle("JS Interface Android Code Length Distributions: Malicious vs Benign Hybrid Apps")
fig.subplots_adjust(wspace=0.3,hspace=0.3)

# Histogram of js_inf_len: Malicious Hybrid Apps
ax0 = fig.add_subplot(3,2,1)
ax0.set_xlabel("Length:Malicious Apps")
ax0.set_ylabel("Frequency")
freq, bins, patches = ax0.hist(df_bad['js_inf_len'], color='red', bins=15, edgecolor='black', linewidth=1)
# Mean annotation placed in axes-fraction coordinates so it stays inside the
# panel whatever the data range is (fixed data coordinates can fall outside).
ax0.text(0.6, 0.85, r'$\mu$='+str(round(df_bad['js_inf_len'].mean(),2)),
         fontsize=12, transform=ax0.transAxes)

# Density Plot of js_inf_len: Malicious Hybrid Apps
ax01 = fig.add_subplot(3,2,2)
ax01.set_xlabel("Length:Malicious Apps")
ax01.set_ylabel("Density")  # a KDE curve shows density, not a frequency count
# NOTE(review): newer seaborn releases rename `shade` to `fill`; kept as-is so
# the cell still runs on the seaborn version this notebook was written for.
sns.kdeplot(df_bad['js_inf_len'], ax=ax01, shade=True, color='red')

# Histogram of js_inf_len: Benign Hybrid Apps
ax02 = fig.add_subplot(3,2,3)
ax02.set_xlabel("Length:Benign Apps")
ax02.set_ylabel("Frequency")
freq, bins, patches = ax02.hist(df_good['js_inf_len'], color='green', bins=15, edgecolor='black', linewidth=1)
ax02.text(0.6, 0.85, r'$\mu$='+str(round(df_good['js_inf_len'].mean(),2)),
          fontsize=12, transform=ax02.transAxes)

# Density Plot of js_inf_len: Benign Hybrid Apps
ax03 = fig.add_subplot(3,2,4)
ax03.set_xlabel("Length:Benign Apps")
ax03.set_ylabel("Density")
sns.kdeplot(df_good['js_inf_len'], ax=ax03, shade=True, color='green')

#Combined Plot of Malicious & Benign Apps using Histogram
# Drawn directly on ax04 with matplotlib. Going through FacetGrid.map creates
# an extra, separate figure and does not reliably honour an `ax=` override, so
# this panel could be left empty.
ax04 = fig.add_subplot(3,2,5)
ax04.set_xlabel("js_inf_len")
ax04.set_ylabel("Frequency")
for class_frame, class_color, class_name in ((df_good, 'g', 'benign'),
                                             (df_bad, 'r', 'malicious')):
    ax04.hist(class_frame['js_inf_len'], bins=15, color=class_color,
              alpha=0.4, label=class_name)
ax04.legend(prop={'size':10})

# Violin Plots of 'js_inf_len'
ax05 = fig.add_subplot(3,2,6)
sns.violinplot(x="label", y="js_inf_len", data=df, ax=ax05)
ax05.set_xlabel("Violin Plot:Length vs Labels",size = 12,alpha=0.8)
ax05.set_ylabel("Length of JS Interface Android Code",size = 12,alpha=0.8)

#Saving the relevant sublplots for subsequent plotting
list_subplots.extend([ax0,ax02,ax04,ax05])

# Lay out THIS figure once all six panels exist; the rect leaves room at the
# top for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])

#Saving the Figs
figc = fig
#figc.savefig("imgs/hv1/Fig16-21: All Plots- Length Univariate Analysis.png")
extent = ax0.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig16:Length Histogram Malicious.png",bbox_inches=extent.expanded(1.6, 1.5))
extent = ax01.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig17:Length Density Plot Malicious.png",bbox_inches=extent.expanded(1.6, 1.5))
extent = ax02.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig18:Length Histogram Benign.png",bbox_inches=extent.expanded(1.6, 1.5))
extent = ax03.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig19:Length Density Plot Benign.png",bbox_inches=extent.expanded(1.6, 1.5))
extent = ax04.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig20:Length Histogram-Benign & Malicious.png",bbox_inches=extent.expanded(1.6, 1.5))
extent = ax05.get_window_extent().transformed(figc.dpi_scale_trans.inverted())
#figc.savefig("imgs/hv1/Fig21:Length Violin Plot-Benign & Malicious.png",bbox_inches=extent.expanded(1.6, 1.5))
# No stray helper figure is created any more, so show the figure instead of
# closing the current one (which would now be this figure itself).
plt.show()

### <I>As can be seen from the above plots of 'js_inf_len', the average JS interface Android code length of malicious apps is greater than that of benign apps. Hence, it is a strong predictor of malicious behavior.</I>

### <I>The statistical values of js_inf_len</I>

In [None]:
# Statistical values of the numerical columns (describe() summarises the numeric columns by default)
df.describe()

In [None]:
# Descriptive statistics of js_inf_len, computed separately for each class label
# and shown side by side (values rounded to two decimals).
df_good = df[df['label'] == 'benign']
df_bad = df[df['label'] == 'malicious']
subset_attributes = ['js_inf_len']
g = df_good[subset_attributes].describe().round(2)
b = df_bad[subset_attributes].describe().round(2)
pd.concat(
    [g, b],
    axis=1,
    keys=['Benign Apps Statistics', 'Malicious Apps Statistics'],
)

### <I>Please see the distinction that emerges, in the table above for the two class labels.</I>

In [None]:
# 3-D scatter plot: one attribute on each of the x, y and z axes
fig = plt.figure(figsize=(8, 6))
title = fig.suptitle("3D Trivariate Analysis: 'js_inf_len','js_inf_defined' & 'acc_sys_call'")
ax = fig.add_subplot(111, projection='3d')
# Whole columns are plotted, so they are taken straight from the frame
xs = df['acc_sys_call']
ys = df['js_inf_defined']
zs = df['js_inf_len']
ax.scatter(xs, ys, zs, color='purple', edgecolors='w', s=50, alpha=0.6)
ax.set_xlabel('acc_sys_call')
ax.set_ylabel('js_inf_defined')
ax.set_zlabel('js_inf_len')
#fig.savefig("imgs/hv1/Fig22: 3D Scatter Trivariate Analysis.png")

In [None]:
# Pair plot of the selected attribute(s), coloured by class label.
# NOTE(review): `cols` holds only 'js_inf_len' besides the hue column, so this
# presumably yields a single distribution panel rather than a scatter matrix;
# add more numeric columns to `cols` for real pairwise plots — confirm intent.
cols = ['js_inf_len','label']
# `height` is the current name of the per-panel size argument; the older
# `size` spelling is deprecated in seaborn.
pp = sns.pairplot(df[cols], hue='label', height=1.8, aspect=1.8,
                  palette={"benign": "green", "malicious": "red"},
                  plot_kws=dict(edgecolor="black", linewidth=0.5))
fig = pp.fig
# Leave room above the panels for the figure title
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Numerical Attributes : Pairwise Plot for both Malicious & Benign Apps', fontsize=14)
#fig.savefig("imgs/hv1/Fig23: Scatter Plot-Trivariate Analysis.png")