In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Path of the CSV file that is loaded into `dataset` with pd.read_csv further down.
FILENAME = 'data/honeypot_dataset.csv'

In [None]:
# Every input column considered by this notebook, in a fixed order.
ALL_FEATURES = [
    'ruri', 'ruri_user', 'ruri_domain',
    'from_user', 'from_domain', 'from_tag',
    'to_user',
    'contact_user',
    'callid',
    'content_type',
    'user_agent',
    'source_ip', 'source_port',
    'destination_port',
    'contact_ip', 'contact_port',
]
# The categorical columns are all features except the three "*_port" columns,
# kept in the same relative order as ALL_FEATURES.
CATEGORICAL = [name for name in ALL_FEATURES if not name.endswith('_port')]

In [None]:
# Numeric (port) columns.
CONTINUOUS = ['source_port', 'destination_port', 'contact_port']
# Columns removed from the data before any modelling.
DROPPED_FEATURES = ['destination_port', 'ruri', 'ruri_domain', 'from_domain', 'callid', 'from_tag', 'content_type']
# Remaining features, kept in ALL_FEATURES order. Building this through
# list(set(...) - set(...)) gave an arbitrary order that could change between
# kernel restarts, which makes the notebook output non-reproducible.
FEATURES = [name for name in ALL_FEATURES if name not in DROPPED_FEATURES]
# Name of the target column.
LABEL = 'toll_fraud'

In [None]:
# Display the feature names left after removing DROPPED_FEATURES from ALL_FEATURES.
FEATURES

In [None]:
# Load the CSV at FILENAME into a DataFrame used by the exploratory cells below.
dataset = pd.read_csv(FILENAME)

In [None]:
# DataFrame.fillna returns a new frame, so the result has to be assigned back;
# previously the filled frame was only displayed and `dataset` kept its NaNs.
# The unassigned `dataset.drop(...)` that used to open this cell had no effect
# and is gone: a later cell performs the real drop of DROPPED_FEATURES.
na_vars = {"contact_user": "unknown"}
dataset = dataset.fillna(na_vars)
dataset.head()

In [None]:
# User-agent strings treated as SIP scanners.
SIP_SCANNERS = ('sipcli/v1.8', 'pplsip')
# 1 where the user agent is one of SIP_SCANNERS, 0 everywhere else.
# Computed in a single assignment instead of `dataset['is_scanner'].loc[mask] = 1`:
# that chained form may write to a temporary copy (SettingWithCopyWarning) and
# has no effect on `dataset` when pandas Copy-on-Write is active.
dataset['is_scanner'] = dataset['user_agent'].isin(SIP_SCANNERS).astype('int64')

In [None]:
# Preview the first 100 rows, including the is_scanner column added above.
dataset.head(100)


In [None]:
# Remove the columns listed in DROPPED_FEATURES.
# Note: re-running this cell raises KeyError because the columns are already gone.
dataset = dataset.drop(columns=DROPPED_FEATURES)


In [None]:
# Inspect the column dtypes before ruri_user is converted to a categorical.
dataset.dtypes

In [None]:
# Convert ruri_user to the pandas category dtype so integer codes can be read from it.
dataset['ruri_user'] = dataset['ruri_user'].astype('category')


In [None]:
# Inspect the dtypes again after the category conversion of ruri_user.
dataset.dtypes

In [None]:
# Store the integer category codes of ruri_user in a new ruri_user_cat column.
dataset['ruri_user_cat'] = dataset['ruri_user'].cat.codes

In [None]:
# Preview the first rows, now including ruri_user_cat.
dataset.head()

In [None]:
def encode_one_hot(df, column, axis=1):
    """Replace one column of a frame with its one-hot indicator columns.

    The indicator columns are named ``<column>_<value>`` and are appended to
    the right of the frame; the input frame itself is not modified.

    :param df: (Pandas.dataframe) Frame containing ``column``.
    :param column: (str) Name of the column to encode; also used as the prefix
        of the generated indicator columns.
    :param axis: (int) Axis handed to ``DataFrame.drop`` when removing the
        original column; the default 1 drops it as a column.
    :return: (Pandas.dataframe) New frame with the indicators joined on the
        index and the original column dropped.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    widened = df.join(indicators)
    return widened.drop(column, axis=axis)

In [None]:
class HoneypotData(object):
    """Honeypot Data class.

    This class contains the entire data pipeline from raw CSV files to the
    prepared train/validation arrays. It is inherited by the model class, but
    is kept separate for readability and logical organization.
    """

    filepath = 'data/'
    train_fn = 'honeypot_dataset.csv'
    test_fn = 'honeypot_test.csv'

    # Columns that get one-hot encoded, in encoding order. A column that is no
    # longer in the frame (for example because it is listed in
    # DROPPED_FEATURES) is skipped instead of raising KeyError.
    one_hot_columns = ('ruri_user', 'from_user', 'from_domain', 'to_user',
                       'contact_user', 'user_agent', 'source_ip', 'contact_ip')

    # Feature columns seen during preproc(); None until preproc() has run.
    # Declared on the class so subclasses that skip this __init__ still have it.
    feature_columns = None

    def __init__(self):
        """Initialize the instance and run the whole training pipeline."""
        self.all_dataset = None
        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()

    def preproc(self):
        """Run the training pipeline.

        Loads the training CSV, fills missing values, one-hot encodes the
        categorical columns and makes a stratified 75/25 train/validation
        split with a fixed random state.

        :return: tuple ``(X_train, y_train, X_valid, y_valid)``; the X parts
            are float32 DataFrames, the y parts are NumPy arrays.
        """
        # Import data & drop irrelevant features.
        dataset = self.import_data(self.train_fn)
        # Fix NA values.
        dataset = self.fix_na(dataset)
        # Create dummies.
        dataset = self._encode_categoricals(dataset)

        # Select all columns except the target.
        X = dataset[dataset.columns.difference([LABEL])]
        y = dataset[LABEL]
        # Remember the training layout so preproc_test() can align to it.
        self.feature_columns = X.columns
        # stratify=y keeps the class proportions of y equal in both parts.
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=606, stratify=y)
        return X_train.astype('float32'), y_train.values, X_valid.astype('float32'), y_valid.values

    def import_data(self, filename):
        """Read a CSV file from ``filepath`` and drop the irrelevant features.

        The unmodified frame is also stored on ``self.all_dataset``, replacing
        whatever an earlier call stored there.

        :param filename: (str) File name relative to ``self.filepath``.
        :return: (Pandas.dataframe) The data without DROPPED_FEATURES.
        """
        dataset = pd.read_csv('%s%s' % (self.filepath, filename))
        self.all_dataset = dataset
        # Drop irrelevant features.
        return dataset.drop(DROPPED_FEATURES, axis=1)

    def fix_na(self, data):
        """Fill NAs with "test" for contact_user and "application/sdp" for content_type.

        :param data: (Pandas.dataframe) Frame to fill; it is not modified.
        :return: (Pandas.dataframe) New frame with the NAs filled.
        """
        na_vars = {"contact_user": "test", "content_type": "application/sdp"}
        return data.fillna(na_vars)

    def _encode_categoricals(self, data):
        """One-hot encode every column of ``one_hot_columns`` present in ``data``."""
        for column in self.one_hot_columns:
            if column in data.columns:
                data = encode_one_hot(data, column)
        return data

    def _align_features(self, data):
        """Give ``data`` exactly the feature columns recorded by preproc().

        Columns unknown to the training data are removed and training columns
        missing from ``data`` are added filled with 0, so the result matches
        the training layout. When preproc() has not run yet the frame is
        returned unchanged.
        """
        if self.feature_columns is None:
            return data
        return data.reindex(columns=self.feature_columns, fill_value=0).astype('float32')

    def preproc_test(self):
        """Preprocess the testing data.

        :return: tuple ``(labels, test)`` where ``labels`` holds the raw
            user_agent values and ``test`` is the encoded frame, aligned to
            the training feature columns when those are known.
        """
        test = self.import_data(self.test_fn)
        # Extract labels.
        labels = test.user_agent.values
        # Fix NA values.
        test = self.fix_na(test)
        # Create dummy variables.
        test = self._encode_categoricals(test)
        # Separately encoded test data has its own set of dummy columns;
        # bring it into the layout used for training.
        return labels, self._align_features(test)


In [None]:
class HoneypotKeras(HoneypotData):
    """Main classifier model based on Keras.

    Reuses the HoneypotData pipeline for the train/validation split and wraps
    a fully connected network with a two-unit output layer.
    """

    def __init__(self):
        """Run the data pipeline and prepare the targets for the network."""
        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()
        # Expand the label vectors into one column per class.
        self.y_train, self.y_valid = to_categorical(self.y_train), to_categorical(self.y_valid)
        self.feature_count = self.X_train.shape[1]
        # One entry is appended per call to fit().
        self.history = []

    def build_model(self):
        """Create and compile the network and store it on ``self.model``.

        Four ReLU Dense layers (2056, 1028, 1028, 512 units), each followed by
        Dropout with a rate growing from 0.1 to 0.4, then a two-unit sigmoid
        output. Compiled with Adam, binary cross-entropy and accuracy.
        """
        model = Sequential()
        model.add(Dense(2056, input_shape=(self.feature_count,), activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(1028, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1028, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.4))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        self.model = model

    def fit(self, lr=0.001, epochs=1):
        """Train the compiled model; build_model() must have been called first.

        :param lr: (float) Learning rate assigned to the optimizer.
        :param epochs: (int) Number of epochs to train for.
        """
        # NOTE(review): assigns to optimizer.lr directly; whether this takes
        # effect depends on the installed Keras version - confirm.
        self.model.optimizer.lr = lr
        hist = self.model.fit(self.X_train, self.y_train,
                              batch_size=32, epochs=epochs,
                              verbose=1, validation_data=(self.X_valid, self.y_valid),
                              )
        self.history.append(hist)

    def prepare_submission(self, name):
        """Predict on the test file and write the result to ``<name>.csv``.

        :param name: (str) Output file name without the ".csv" extension.
        :return: (Pandas.dataframe) Frame with the columns ``user_agent`` and
            ``toll_fraud`` (rounded output of the second unit, as int32).
        """
        labels, test_data = self.preproc_test()
        predictions = self.model.predict(test_data)
        # Build the two columns separately. Stacking the user_agent values and
        # the predictions into one array and casting everything to int32 fails
        # as soon as a user agent is not a number.
        subm = pd.DataFrame({
            'user_agent': labels,
            'toll_fraud': np.around(predictions[:, 1]).astype('int32'),
        })
        subm.to_csv("{}.csv".format(name), index=False)
        return subm

In [None]:
# Create the model object; HoneypotKeras.__init__ runs preproc(), which reads and splits the training CSV.
model = HoneypotKeras()

In [None]:
# Create and compile the Keras network (stored on model.model).
model.build_model()

In [None]:
# Train for one epoch with lr=0.01; validation uses the held-out split from preproc().
model.fit(lr=0.01, epochs=1)

In [None]:
# Load and encode the test CSV: `labels` are the raw user_agent values, `test_data` the encoded frame.
# Side effect: import_data() replaces model.all_dataset with the raw test file.
labels, test_data = model.preproc_test()

In [None]:
# The first line is a bare expression that is not the cell's last statement,
# so it displays nothing; it only fails if the column is missing.
model.all_dataset['user_agent']
# Add an is_scanner column with every row set to 1.
model.all_dataset['is_scanner'] = 1 #initialize to yes/1 is scanner

In [None]:
# Set is_scanner to 0 on rows whose user agent is in SIP_SCANNERS, using one
# .loc call on the frame itself. The chained `frame[col].loc[mask] = value`
# form may write to a temporary copy (SettingWithCopyWarning) and has no
# effect on the frame when pandas Copy-on-Write is active.
# NOTE(review): 0 for SIP_SCANNERS is the reverse of the is_scanner flag set
# on `dataset` near the top of the notebook - confirm the intended polarity.
model.all_dataset.loc[model.all_dataset['user_agent'].isin(SIP_SCANNERS), 'is_scanner'] = 0

In [None]:
# Summary statistics from describe(), transposed so every summarized column becomes a row.
model.all_dataset.describe().transpose()

In [None]:
# Show the is_scanner column as a NumPy array.
model.all_dataset['is_scanner'].values

In [None]:
# List the column names of the encoded test frame.
list(test_data.columns.values)

In [None]:
# User-agent strings looked up by engineer_features.
SIP_SCANNERS = ('sipcli/v1.8', 'pplsip')
def engineer_features(dataset):
    """Add an ``is_scanner`` column to ``dataset`` and return the same frame.

    Rows whose ``user_agent`` is listed in SIP_SCANNERS get 0, all other rows
    (including rows without a user agent) get 1. The frame is modified in
    place.

    NOTE(review): 0 for SIP_SCANNERS is the reverse of the is_scanner flag set
    on `dataset` near the top of the notebook - confirm the intended polarity.

    :param dataset: (Pandas.dataframe) Frame with a ``user_agent`` column.
    :return: (Pandas.dataframe) The same frame object, with ``is_scanner`` added.
    """
    listed = dataset['user_agent'].isin(SIP_SCANNERS)
    dataset['is_scanner'] = 1  # default for every row
    # One .loc call on the frame itself; `dataset['is_scanner'].loc[mask] = 0`
    # is chained assignment and is lost when pandas Copy-on-Write is active.
    dataset.loc[listed, 'is_scanner'] = 0
    return dataset

In [None]:
# preproc() one-hot encodes user_agent and drops the raw column, so
# model.X_train['user_agent'] raises KeyError. List the encoded
# user_agent_* columns that are present in the training features instead.
[col for col in model.X_train.columns if col.startswith('user_agent_')]