In [1]:
import pandas as pd
import numpy as np
import altair as alt
import json
from IPython.display import HTML
from  altair.vega import v5
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.offline as py
import plotly.tools as tls
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go
import palettable
init_notebook_mode(connected=True)  
plt.style.use('ggplot')

import plotly.figure_factory as ff

# Driver functions:

In [2]:
# ---------------------------------------------------------------------------
# RequireJS configuration used to load the Vega JavaScript libraries from the
# jsDelivr CDN so that chart specs can be embedded in notebook output.
# ---------------------------------------------------------------------------
cdn_root = 'https://cdn.jsdelivr.net/npm/'

# Library URLs; Vega and Vega-Lite are pinned to the schema versions that the
# installed altair package reports.
vega_url = cdn_root + 'vega@' + v5.SCHEMA_VERSION
vega_lib_url = cdn_root + 'vega-lib'
vega_lite_url = cdn_root + 'vega-lite@' + alt.SCHEMA_VERSION
vega_embed_url = cdn_root + 'vega-embed@3'

# Suffix appended to every module URL.
# NOTE(review): presumably stops RequireJS from appending ".js" -- confirm.
noext = "?noext"

# Module name -> URL mapping that is serialized into the RequireJS config.
paths = {
    module: url + noext
    for module, url in [
        ('vega', vega_url),
        ('vega-lib', vega_lib_url),
        ('vega-lite', vega_lite_url),
        ('vega-embed', vega_embed_url),
    ]
}

# JavaScript template. The doubled braces are literal braces because the
# template is filled in with str.format (the single "{}" receives the paths).
workaround = """
requirejs.config({{
    baseUrl: 'https://cdn.jsdelivr.net/npm/',
    paths: {}
}});
"""

#------------------------------------------------ Defs for future rendering
def add_autoincrement(render_func):
    """Decorate ``render_func`` so that repeated calls get unique element ids.

    The returned wrapper accepts ``(chart, id="vega-chart", autoincrement=True)``
    and calls ``render_func(chart, id=actual_id)``.

    With ``autoincrement=True`` the first call for a given ``id`` uses it
    unchanged; every later call appends ``-1``, ``-2``, ... so the ids never
    collide. With ``autoincrement=False`` the ``id`` is passed through as is
    (and is registered, so a later auto-incremented call starts at ``-1``).

    Args:
        render_func: Callable taking ``(chart, id=...)``.

    Returns:
        The wrapping function; it keeps ``render_func``'s name and docstring.
    """
    import functools  # local import so the cell does not depend on another cell

    # Last suffix handed out per base id. Lives in the closure, so it persists
    # across calls for as long as the decorated function exists.
    cache = {}

    @functools.wraps(render_func)
    def wrapped(chart, id="vega-chart", autoincrement=True):
        if autoincrement:
            # First use of an id gets suffix 0 (meaning "no suffix").
            counter = cache[id] + 1 if id in cache else 0
            cache[id] = counter
            actual_id = id if counter == 0 else id + '-' + str(counter)
        else:
            cache.setdefault(id, 0)
            actual_id = id
        return render_func(chart, id=actual_id)

    return wrapped
            
@add_autoincrement
def render(chart, id="vega-chart"):
    """Return an ``HTML`` object that embeds ``chart`` with vega-embed.

    Args:
        chart: Either a ``dict`` holding a chart specification, or an object
            exposing ``to_json(indent=None)`` that returns the specification
            as a JSON string.
        id: ``id`` attribute of the ``<div>`` the chart is rendered into.

    Returns:
        ``IPython.display.HTML`` containing the target ``<div>`` and a script
        that loads ``vega-embed`` through ``require`` and renders the spec.
    """
    # Doubled braces are literal braces; {id} and {chart} are filled below.
    chart_str = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vg_embed) {{
        const spec = {chart};
        vg_embed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
    }});
    </script>
    """
    if isinstance(chart, dict):
        spec_json = json.dumps(chart)
    else:
        spec_json = chart.to_json(indent=None)
    # A "</" inside a string value (e.g. "</script>") would terminate the
    # surrounding <script> element early; "<\/" is the same string in JSON/JS.
    spec_json = spec_json.replace("</", "<\\/")
    return HTML(chart_str.format(id=id, chart=spec_json))



# Fill the RequireJS template with the module paths and emit it as a <script>
# so that later require(["vega-embed"], ...) calls can resolve the modules.
requirejs_config = workaround.format(json.dumps(paths))
HTML("<script>" + requirejs_config + "</script>")

In [3]:

def word_cloud(df, pixwidth=720, pixheight=450, column="index", counts="count"):
    """Build a Vega word-cloud specification from a table of word counts.

    Args:
        df: DataFrame with one row per word; its rows are embedded in the spec
            as the ``dataset`` data source.
        pixwidth: Width of the chart and of the word-cloud layout.
        pixheight: Height of the chart and of the word-cloud layout.
        column: Name of the column holding the word; used for the text and
            for the colour scale.
        counts: Name of the column holding the count; drives the font size
            and the tooltip.

    Returns:
        dict: The specification, ready to be serialized to JSON.
    """
    # Expression referring to the count of the current datum. It is shared by
    # the font size and the tooltip so both follow the ``counts`` argument.
    count_ref = "datum.{}".format(counts)

    scale = dict(
        name="color",
        type="ordinal",
        range=["cadetblue", "royalblue", "steelblue", "navy", "teal"]
    )
    mark = {
        "type": "text",
        "from": dict(data="dataset"),
        "encode": dict(
            enter=dict(
                text=dict(field=column),
                align=dict(value="center"),
                baseline=dict(value="alphabetic"),
                fill=dict(scale="color", field=column),
                tooltip=dict(signal=count_ref + " + ' occurrances'")
            )
        ),
        "transform": [{
            "type": "wordcloud",
            "text": dict(field=column),
            "size": [pixwidth, pixheight],
            "font": "Helvetica Neue, Arial",
            "fontSize": dict(field=count_ref),
            "fontSizeRange": [10, 60],
            "padding": 2
        }]
    }
    return {
        "$schema": "https://vega.github.io/schema/vega/v5.json",
        "width": pixwidth,
        "height": pixheight,
        "padding": 0,
        "title": "Hover to see number of occureances from all the sequences",
        "data": [dict(name="dataset", values=df.to_dict(orient="records"))],
        "scales": [scale],
        "marks": [mark]
    }



In [4]:
def pie_plot(labels, values, colors, title, name=None):
    """Build a Plotly figure dict for a donut chart.

    Args:
        labels: Slice labels.
        values: Slice sizes, in the same order as ``labels``.
        colors: Slice colours, in the same order as ``labels``.
        title: Chart title.
        name: Trace name. Defaults to ``title`` so the trace is labelled after
            the data being plotted rather than a fixed, unrelated string.

    Returns:
        dict: Figure with a single ``pie`` trace and a layout.
    """
    trace = {
        "values": values,
        "labels": labels,
        "domain": {"x": [0, .48]},
        "name": title if name is None else name,
        "sort": False,  # keep the slices in the order they were given
        "marker": {'colors': colors},
        "textinfo": "percent+label+value",
        "textfont": {'color': '#FFFFFF', 'size': 10},
        "hole": .6,  # a non-zero hole turns the pie into a donut
        "type": "pie"
    }
    layout = {
        "title": title,
        "annotations": [
            {
                "font": {"size": 25},
                "showarrow": False,
                "text": ""
            }
        ]
    }
    return {"data": [trace], "layout": layout}


def distplot(df, column='length', bin_size=10):
    """Show a Plotly distribution plot of one column of ``df``.

    Args:
        df: DataFrame containing ``column``.
        column: Column to plot; also used as the group label.
        bin_size: Bin width passed to ``create_distplot``.

    Returns:
        Whatever the figure's ``show()`` returns.
    """
    fig = ff.create_distplot([df[column]], [column], bin_size=bin_size)
    return fig.show()

# Basic data exploration:

In [5]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper.
df = pd.read_csv('CDMC2019 Task2 Df.csv')

In [6]:
# Name the three columns positionally, then discard the first one.
# NOTE(review): the first column looks like a saved row index -- confirm.
df.columns = ['todrop', 'Commands', 'malware_value']
df = df.drop(columns='todrop')


In [7]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Commands,malware_value
0,execve ioctl ioctl prctl gettimeofday getpid g...,2
1,execve ioctl ioctl time getpid time getpid soc...,2
2,execve ioctl ioctl prctl time getpid time getp...,2
3,execve ioctl ioctl time getpid time getpid soc...,2
4,execve ioctl ioctl prctl time getpid time getp...,2


Adding the number of words in each command sequence to the dataframe as a new `length` column:

In [8]:
def _count_words(commands):
    """Return the number of whitespace-separated words in ``commands``."""
    return len(str(commands).split())


# One word count per row of the Commands column.
df['length'] = df['Commands'].apply(_count_words)

In [9]:
# Distinct class labels present in the data.
set(df.malware_value)

{1, 2, 3, 4, 5}

In [10]:
# Summary statistics of the numeric columns (class label and sequence length).
df.describe()

Unnamed: 0,malware_value,length
count,4167.0,4167.0
mean,1.864171,9301.065515
std,1.008976,17924.08349
min,1.0,51.0
25%,1.0,435.0
50%,2.0,543.0
75%,2.0,11651.0
max,5.0,189123.0


Finding the number of occurrences of each `malware_value` class:

In [11]:
from palettable.colorbrewer.qualitative import Dark2_5

In [12]:
# Hex codes of the five-colour palette.
Dark2_5.hex_colors

['#1B9E77', '#D95F02', '#7570B3', '#E7298A', '#66A61E']

In [13]:
# Number of samples per class; value_counts() orders them by frequency.
value_counts = df['malware_value'].value_counts()
labels = value_counts.index.tolist()

# One palette colour per class, in the same order as the labels.
malware_pie = pie_plot(labels, value_counts, Dark2_5.hex_colors, "Malware Types")
py.iplot(malware_pie)

The counts stand as follows:
- 1: 1541 instances out of 4167
- 2: 2265 instances out of 4167
- 3: 34 instances out of 4167
- 4: 40 instances out of 4167
- 5: 287 instances out of 4167

From the above pie plot, we can see that the data is heavily imbalanced. Addressing the class imbalance could be a necessary step for getting accurate classifications. 
We could:
- Resample the data to reduce the imbalance, either upsampling the minority classes or downsampling the majority ones
- Use SMOTE (available, for example, in the imbalanced-learn package) to synthesize additional instances of the minority classes

Since the class labels range from 1 to 5, let's build a word cloud for each class to look for patterns in the word occurrences.

# Creating wordclouds for Classification values 1-5:

In [14]:
# Driver that turns a frame of command sequences into a rendered word cloud.
from collections import Counter, defaultdict


def wordcloud_create(df):
    """Render a word cloud of the words found in ``df.Commands``.

    Every entry of the ``Commands`` column is split on whitespace and the
    occurrences of each word are counted across all rows. Missing entries are
    skipped and non-string entries are converted with ``str`` first, so a
    stray NaN does not raise.

    Args:
        df: DataFrame with a ``Commands`` column of space-separated words.

    Returns:
        The object returned by ``render`` for the word-cloud specification.
    """
    word_counts = Counter(
        word
        for sequence in df.Commands.dropna()
        for word in str(sequence).split()
    )
    # Series -> two-column frame: "index" holds the word, "count" its total.
    frequencies = pd.Series(word_counts).to_frame(name="count").reset_index()
    return render(word_cloud(frequencies, pixheight=600, pixwidth=900))

## Wordcloud for Malware_value 1

In [15]:
# Restrict the data to malware class 1 and draw its word cloud.
df_1 = df.loc[df['malware_value'] == 1]

wordcloud_create(df_1)

From the word cloud above, we can observe the following command counts:
- recvfrom: 9091549 instances
- sendto: 9083794 instances

Thus, we can form a rough idea that the first malware class is centred on socket tasks, with the malware focused on sending and receiving data.

## Wordcloud for Malware_value 2

In [16]:
# Restrict the data to malware class 2 and draw its word cloud.
df_2 = df.loc[df['malware_value'] == 2]

wordcloud_create(df_2)

From the word cloud above, we can observe the following command counts:
- getsockopt: 1241292 instances
- newselect: 1253044 instances

Thus, we can form a rough idea that the second malware class is centred on socket tasks involving the creation of a socket and its underlying connections. 

## Wordcloud for Malware_value 3

In [17]:
# Restrict the data to malware class 3 and draw its word cloud.
df_3 = df.loc[df['malware_value'] == 3]

wordcloud_create(df_3)

From the word cloud above, we can observe the following command counts:
- clock_gettime: 170988 instances
- gettimeofday: 85807 instances

Thus, we can form a rough idea that the third malware class is centred on tasks involving time-based data. Also, since there are far fewer samples of the third malware than of the first and second (34 out of 4167), these call counts are high relative to the very small number of samples.

## Wordcloud for Malware_value 4

In [18]:
# Restrict the data to malware class 4 and draw its word cloud.
df_4 = df.loc[df['malware_value'] == 4]

wordcloud_create(df_4)

From the word cloud above, we can observe the following command counts:
- rt_sigaction: 6649 instances
- rt_sigprocmask: 5757 instances

Thus, we can form a rough idea that the fourth malware class is generally focused on masking the calling threads or on actions on such threads. The number of instances is in line with the overall number of occurrences of the fourth malware in the data.

## Wordcloud for Malware_value 5

In [19]:
# Restrict the data to malware class 5 and draw its word cloud.
df_5 = df.loc[df['malware_value'] == 5]

wordcloud_create(df_5)

From the word cloud above, we can observe the following command counts:
- close: 164358 instances
- open: 108164 instances

Thus, we can form a rough idea that the fifth malware class is centred on socket tasks in which the malware focuses on opening and closing sockets/files, with secondary activity around time of day and masks for the threads.

### To get a more detailed picture beyond the word clouds, let's plot the distribution of sequence lengths (number of words) for each class:

### Malware: 1

In [20]:
# Length distribution of the class-1 sequences.
df_1 = df.loc[df['malware_value'] == 1]
distplot(df_1)

For the first malware, it can be observed that most of the values lie in the range of 0-24k words in length, 
with numerous outliers, the maximum going up to 188.7449k in length.

The sharpest peaks of the distplot lie around the range of 0-2.5k.

### Malware: 2

In [21]:
# Length distribution of the class-2 sequences.
df_2 = df.loc[df['malware_value'] == 2]
distplot(df_2)

Although the second malware has the highest number of occurrences of all the malware classes, its sequence lengths generally lie in the range of 0-600 words, with some stretching out to 5k and the largest outlier at 166k. 

### Malware 3

In [22]:
# Length distribution of the class-3 sequences.
df_3 = df.loc[df['malware_value'] == 3]
distplot(df_3)

The third malware, which has the fewest instances of all the classes, generally ranges from 5k to 27k words, with an outlier at 45k.

### Malware 4:

In [23]:
# Length distribution of the class-4 sequences.
df_4 = df.loc[df['malware_value'] == 4]
distplot(df_4)

Malware 4 is perhaps the one with the fewest words: it peaks around 52 words, with a much higher density around 1,000 words and outliers in the 8k range.

### Malware 5:

In [24]:
# Length distribution of the class-5 sequences.
df_5 = df.loc[df['malware_value'] == 5]
distplot(df_5)

The most frequently occurring length for the fifth malware is around 400 words, with a wider cluster between 2k and 4k words. The outlier stands at 16k words. 
