In [None]:
import pandas as pd

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [None]:
# Relative path of the CSV file that is loaded into `dataset`.
FILENAME = 'data/honeypot_dataset.csv'

In [None]:
# Training and evaluation settings shared by the cross-validation run and the final fit.
EPOCHS = 30          # passes over the training data per fit
BATCH_SIZE = 32      # samples per gradient update
RANDOM_STATE = 1337  # seed for the StratifiedKFold shuffle
NUM_FOLDS = 10       # number of cross-validation folds

In [None]:
# Load the raw CSV at FILENAME into a DataFrame.
dataset = pd.read_csv(FILENAME)

In [None]:
# Show the column labels of the raw data.
dataset.columns

In [None]:
# Show the dtype pandas inferred for each column.
dataset.dtypes

In [None]:
# Summary statistics of the raw data.
dataset.describe()

In [None]:
# Count how many rows carry each distinct source_ip value.
dataset['source_ip'].value_counts()

In [None]:
# Group the column names by their dtype to see which columns share a type.
dataset.columns.to_series().groupby(dataset.dtypes).groups

In [None]:
# Feature columns of the data set, in their listed order.
COLUMNS = [
    'ruri', 'ruri_user', 'ruri_domain',
    'from_user', 'from_domain', 'from_tag',
    'to_user', 'contact_user',
    'callid', 'content_type', 'user_agent',
    'source_ip', 'source_port', 'destination_port',
    'contact_ip', 'contact_port',
]

# Subset of COLUMNS treated as categorical.
CATEGORICAL = [
    'ruri', 'ruri_user', 'ruri_domain',
    'from_user', 'from_domain', 'from_tag',
    'to_user', 'contact_user',
    'callid', 'content_type', 'user_agent',
    'source_ip', 'contact_ip',
]

# Subset of COLUMNS treated as continuous (the port columns).
CONTINUOUS = [
    'source_port',
    'destination_port',
    'contact_port',
]

# Name of the target column.
LABEL = 'valid'

In [None]:
# Display the feature column list.
COLUMNS

In [None]:
def encode_one_hot(df, column, axis=1):
    """Return a copy of `df` with `column` replaced by one-hot indicator columns.

    One indicator column is created per distinct value in `df[column]`,
    named `<column>_<value>`. The indicators are joined onto `df` and the
    original column is then dropped. `df` itself is not modified.

    Parameters:
        df: DataFrame containing `column`.
        column: label of the column to encode.
        axis: axis passed to `DataFrame.drop` when removing the original
            column (default 1, i.e. drop a column).

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    widened = df.join(indicators)
    return widened.drop(column, axis=axis)

In [None]:
# Randomly sample 80% of the rows as the training set.
# The sampling seed is the literal 50, not RANDOM_STATE.
df_train = dataset.sample(frac=0.8, random_state=50)

In [None]:
# The hold-out set is every row whose index label was not sampled into df_train.
df_test = dataset.drop(df_train.index)

In [None]:
# Stack the training and hold-out features into one frame (training rows first)
# so that both halves are one-hot encoded with an identical set of columns.
# LABEL is dropped from BOTH halves: df_test is cut from the labelled data and
# still carries the target, so leaving it in would keep `valid` as a feature
# column (NaN for every training row, the true answer for every hold-out row).
dataset = pd.concat(
    [df_train.drop(LABEL, axis=1), df_test.drop(LABEL, axis=1)],
    ignore_index=True,
)

In [None]:
# Remove these five columns from the feature frame; they are not encoded or used below.
dataset = dataset.drop(['ruri','ruri_domain', 'callid', 'from_tag', 'content_type'], axis=1)

In [None]:
# Inspect the remaining columns, their dtypes and non-null counts before encoding.
dataset.info()

In [None]:
# One-hot encode, one after another, every categorical column that is still
# present in the feature frame. Each pass replaces the named column with its
# indicator columns.
for categorical_column in ['ruri_user', 'from_user', 'from_domain', 'to_user',
                           'contact_user', 'user_agent', 'source_ip', 'contact_ip']:
    dataset = encode_one_hot(dataset, categorical_column)

In [None]:
# Summary statistics of the encoded frame, transposed so each feature is a row.
dataset.describe().transpose()

In [None]:
# Number of training rows; used below to split the stacked frame back apart.
train_count = len(df_train)

In [None]:
# Number of columns in the encoded feature frame; build_model reads this as input_dim.
feature_count = dataset.shape[1]

In [None]:
# Report the width of the encoded feature frame.
print('Number of features:', feature_count)

In [None]:
# Display the number of training rows.
train_count

In [None]:
# Undo the stacking: the training rows were concatenated first, so the leading
# `train_count` rows are the training features and the rest is the hold-out set.
y_train = df_train[LABEL]
x_train = dataset.iloc[:train_count]
x_submit = dataset.iloc[train_count:]

In [None]:
# Inspect the hold-out feature frame.
x_submit.info()

In [None]:
def build_model():
    """Create and compile a fresh feed-forward binary classifier.

    The network has two hidden ReLU layers of 30 units each and a single
    sigmoid output unit. The input width is read from the module-level
    `feature_count`, which must be defined before this is called.

    Returns:
        A compiled `Sequential` model (RMSprop optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    net.add(Dense(30, activation='relu', input_dim=feature_count, kernel_initializer='random_uniform'))
    net.add(Dense(30, activation='relu'))
    net.add(Dense(1, activation='sigmoid'))  # TODO 1x sigmoid vs 2x softmax?
    net.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return net

In [None]:
# Wrap build_model as a scikit-learn style estimator so it can be handed to cross_val_score.
model = KerasClassifier(build_fn=build_model, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=False)

In [None]:
# Shuffled, stratified NUM_FOLDS-way splitter, seeded with RANDOM_STATE.
cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Score the wrapped model on every fold of `cv`; `results` holds one score per fold.
# NOTE(review): n_jobs=-1 runs the folds in parallel worker processes; Keras models
# do not always work under that kind of parallelism — confirm this runs in your setup.
results = cross_val_score(model, x_train, y_train, cv=cv, n_jobs=-1)

In [None]:
# Report the mean of the per-fold scores.
print('Mean accuracy in %i-fold CV:' % NUM_FOLDS, results.mean())

In [None]:
# Build a fresh, untrained network for the final fit.
# This rebinds `model`, replacing the KerasClassifier wrapper used for cross-validation.
model = build_model()

In [None]:
# Train the final network on the complete training set.
model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=False)

In [None]:
# Predict a 0/1 class for every training row.
# `Sequential.predict_classes` is deprecated and absent from newer Keras releases.
# For a single sigmoid output it is exactly "probability > 0.5 cast to int32",
# which is spelled out here so the cell runs on both old and new versions.
y_pred = (model.predict(x_train, verbose=True) > 0.5).astype('int32').flatten()

In [None]:
# Accuracy of the final model on the same rows it was trained on (not a held-out estimate).
print('Final accuracy on training data:', accuracy_score(y_train, y_pred))

In [None]:
# Confusion table of true vs. predicted training labels, with row and column totals.
print(pd.crosstab(y_train, y_pred, rownames=['Real'], colnames=['Predicted'], margins=True))

In [None]:
# Boolean mask over the training rows: True where the prediction equals the true label.
row_filter = [
    predicted == actual
    for predicted, actual in zip(y_pred, y_train)
]

In [None]:
# Work on a copy so df_train itself is left untouched.
call_fraud = df_train.copy()

In [None]:
# Attach the model's prediction for each training row as a new column.
call_fraud['valid_caller'] = y_pred

In [None]:
# Keep only the rows selected by row_filter, i.e. where prediction and true label agree.
# NOTE(review): this exports the correctly classified rows — confirm that is the intent.
call_fraud = call_fraud[row_filter]

In [None]:
# Write the filtered rows to call_fraud.csv, without the index column.
call_fraud.to_csv('call_fraud.csv', index=False)

In [None]:
# Report how many rows were exported.
print('Wrote', len(call_fraud), 'rows to call_fraud.csv')

In [None]:
# Predict a 0/1 class for every hold-out row.
# `Sequential.predict_classes` is deprecated and absent from newer Keras releases.
# For a single sigmoid output it is exactly "probability > 0.5 cast to int32",
# which is spelled out here so the cell runs on both old and new versions.
y_submit = (model.predict(x_submit, verbose=False) > 0.5).astype('int32').flatten()

In [None]:
# One-column frame of hold-out predictions, indexed by each hold-out row's user_agent.
# NOTE(review): user_agent is presumably not unique per row, which would make it a
# weak row identifier for a submission — confirm this index is intended.
df_submit = pd.DataFrame(y_submit, index=df_test['user_agent'], columns=[LABEL])

In [None]:
# Write the predictions, including the user_agent index, to submission.csv.
df_submit.to_csv('submission.csv')

In [None]:
# Report how many predictions were written.
print('Wrote', len(df_submit), 'rows to submission.csv')