In [1]:
import pandas as pd

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

Using TensorFlow backend.


In [3]:
FILENAME = 'data/honeypot_dataset.csv'

In [4]:
EPOCHS = 30
BATCH_SIZE = 32
RANDOM_STATE = 1337
NUM_FOLDS = 10
LABEL = 'toll_fraud'

In [5]:
dataset = pd.read_csv(FILENAME)

In [6]:
COLUMNS = ['ruri', 'ruri_user', 'ruri_domain', 'from_user', 'from_domain', 'from_tag', 'to_user', 'contact_user',
           'callid', 'content_type', 'user_agent',
           'source_ip', 'source_port', 'destination_port', 'contact_ip', 'contact_port']
CATEGORICAL = ['ruri', 'ruri_user', 'ruri_domain', 'from_user', 'from_domain', 'from_tag', 'to_user', 'contact_user',
               'callid', 'content_type', 'user_agent',
               'source_ip', 'contact_ip']
CONTINUOUS = ['source_port', 'destination_port', 'contact_port']
DROPPED_FEATURES = ['ruri','ruri_domain', 'callid', 'from_tag', 'content_type']

In [7]:
dataset.columns

Index([u'ruri', u'ruri_user', u'ruri_domain', u'from_user', u'from_domain',
       u'from_tag', u'to_user', u'contact_user', u'callid', u'content_type',
       u'user_agent', u'source_ip', u'source_port', u'destination_port',
       u'contact_ip', u'contact_port', u'toll_fraud'],
      dtype='object')

In [8]:
dataset.dtypes

ruri                object
ruri_user           object
ruri_domain         object
from_user           object
from_domain         object
from_tag            object
to_user             object
contact_user        object
callid              object
content_type        object
user_agent          object
source_ip           object
source_port          int64
destination_port     int64
contact_ip          object
contact_port         int64
toll_fraud           int64
dtype: object

In [None]:
# dataset.from_user.value_counts()

In [None]:
# dataset.isnull().sum()

In [9]:
dataset[LABEL].value_counts()

1    2206
0      14
Name: toll_fraud, dtype: int64

In [10]:
dataset = dataset.drop(DROPPED_FEATURES, axis=1)


In [11]:
dataset.dtypes

ruri_user           object
from_user           object
from_domain         object
to_user             object
contact_user        object
user_agent          object
source_ip           object
source_port          int64
destination_port     int64
contact_ip          object
contact_port         int64
toll_fraud           int64
dtype: object

In [12]:
dataset[LABEL].value_counts()
# dataset.columns.to_series().groupby(dataset.dtypes).groups

1    2206
0      14
Name: toll_fraud, dtype: int64

In [13]:
def encode_one_hot(df, column, axis=1):
    return df.join(pd.get_dummies(df[column], column)).drop(column, axis=axis)

In [14]:
df_train = dataset.sample(frac=0.8, random_state=50)

In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1776 entries, 2189 to 797
Data columns (total 12 columns):
ruri_user           1776 non-null object
from_user           1776 non-null object
from_domain         1776 non-null object
to_user             1776 non-null object
contact_user        1775 non-null object
user_agent          1776 non-null object
source_ip           1776 non-null object
source_port         1776 non-null int64
destination_port    1776 non-null int64
contact_ip          1776 non-null object
contact_port        1776 non-null int64
toll_fraud          1776 non-null int64
dtypes: int64(4), object(8)
memory usage: 180.4+ KB


In [16]:
df_test = dataset.drop(df_train.index)

In [17]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444 entries, 0 to 2216
Data columns (total 12 columns):
ruri_user           444 non-null object
from_user           444 non-null object
from_domain         444 non-null object
to_user             444 non-null object
contact_user        444 non-null object
user_agent          444 non-null object
source_ip           444 non-null object
source_port         444 non-null int64
destination_port    444 non-null int64
contact_ip          444 non-null object
contact_port        444 non-null int64
toll_fraud          444 non-null int64
dtypes: int64(4), object(8)
memory usage: 45.1+ KB


In [21]:
dataset = pd.concat([df_train.drop(LABEL, axis=1), df_test], ignore_index=True)

In [22]:
dataset.describe()

Unnamed: 0,contact_port,destination_port,source_port,toll_fraud
count,2220.0,2220.0,2220.0,444.0
mean,29135.528829,5061.801802,32942.348198,0.993243
std,26230.651564,84.895272,24859.318449,0.082014
min,0.0,5060.0,5060.0,0.0
25%,5070.0,5060.0,5072.0,1.0
50%,5076.0,5060.0,42118.5,1.0
75%,56560.0,5060.0,56560.0,1.0
max,65515.0,9060.0,65515.0,1.0


In [26]:
def fix_na(data):
    """Fill na's with test (in the case of contact_user), and with application/sdp in the case of content_type."""
    na_vars = {"contact_user": "test", "content_type": "application/sdp"}
    return data.fillna(na_vars)

In [28]:
dataset = fix_na(dataset)

In [29]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2220 entries, 0 to 2219
Data columns (total 12 columns):
contact_ip          2220 non-null object
contact_port        2220 non-null int64
contact_user        2220 non-null object
destination_port    2220 non-null int64
from_domain         2220 non-null object
from_user           2220 non-null object
ruri_user           2220 non-null object
source_ip           2220 non-null object
source_port         2220 non-null int64
to_user             2220 non-null object
toll_fraud          444 non-null float64
user_agent          2220 non-null object
dtypes: float64(1), int64(3), object(8)
memory usage: 208.2+ KB


In [30]:
dataset = encode_one_hot(dataset, 'ruri_user')
dataset = encode_one_hot(dataset, 'from_user')
dataset = encode_one_hot(dataset, 'from_domain')
dataset = encode_one_hot(dataset, 'to_user')
dataset = encode_one_hot(dataset, 'contact_user')
dataset = encode_one_hot(dataset, 'user_agent')
dataset = encode_one_hot(dataset, 'source_ip')
dataset = encode_one_hot(dataset, 'contact_ip')

In [31]:
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
contact_port,2220.0,29135.528829,26230.651564,0.0,5070.0,5076.0,56560.0,65515.0
destination_port,2220.0,5061.801802,84.895272,5060.0,5060.0,5060.0,5060.0,9060.0
source_port,2220.0,32942.348198,24859.318449,5060.0,5072.0,42118.5,56560.0,65515.0
toll_fraud,444.0,0.993243,0.082014,0.0,1.0,1.0,1.0,1.0
ruri_user_#972592277524,2220.0,0.000901,0.030008,0.0,0.0,0.0,0.0,1.0
ruri_user_*+31203697460,2220.0,0.002252,0.047415,0.0,0.0,0.0,0.0,1.0
ruri_user_*0031203697460,2220.0,0.002252,0.047415,0.0,0.0,0.0,0.0,1.0
ruri_user_*0048322132924,2220.0,0.000450,0.021224,0.0,0.0,0.0,0.0,1.0
ruri_user_*011972592277524,2220.0,0.000901,0.030008,0.0,0.0,0.0,0.0,1.0
ruri_user_*048322132924,2220.0,0.000450,0.021224,0.0,0.0,0.0,0.0,1.0


In [32]:
train_count = len(df_train)

In [33]:
feature_count = dataset.shape[1]

In [34]:
print('Number of features:', feature_count)

('Number of features:', 2134)


In [None]:
train_count

In [None]:
x_submit = dataset[train_count:]
x_train = dataset[:train_count]
y_train = df_train[LABEL]

In [None]:
x_submit.info()

In [None]:
def build_model():
    m = Sequential([
        Dense(30, activation='relu', input_dim=feature_count, kernel_initializer='random_uniform'),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid'),  # TODO 1x sigmoid vs 2x softmax?
    ])
    m.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return m

In [None]:
model = KerasClassifier(build_fn=build_model, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=False)

In [None]:
cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_STATE)

In [None]:
results = cross_val_score(model, x_train, y_train, cv=cv, n_jobs=-1)

In [None]:
print('Mean accuracy in %i-fold CV:' % NUM_FOLDS, results.mean())

In [None]:
model = build_model()

In [None]:
model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=False)

In [None]:
y_pred = model.predict_classes(x_train, verbose=True).flatten()

In [None]:
print('Final accuracy on training data:', accuracy_score(y_train, y_pred))

In [None]:
print(pd.crosstab(y_train, y_pred, rownames=['Real'], colnames=['Predicted'], margins=True))

In [None]:
row_filter = [y1 == y2 for (y1, y2) in zip(y_pred, y_train)]

In [None]:
call_fraud = df_train.copy()

In [None]:
call_fraud['valid_caller'] = y_pred

In [None]:
call_fraud = call_fraud[row_filter]

In [None]:
call_fraud.to_csv('call_fraud.csv', index=False)

In [None]:
print('Wrote', len(call_fraud), 'rows to call_fraud.csv')

In [None]:
y_submit = model.predict_classes(x_submit, verbose=False).flatten()

In [None]:
df_submit = pd.DataFrame(y_submit, index=df_test['user_agent'], columns=[LABEL])

In [None]:
df_submit.to_csv('submission.csv')

In [None]:
print('Wrote', len(df_submit), 'rows to submission.csv')