In [None]:
import pickle

import pandas as pd

import sklearn.model_selection as sk_ms
import sklearn.metrics as sk_mt
import sklearn.preprocessing as sk_p
import sklearn.utils as sk_ut
import numpy as np
import keras.models as k_mod
import keras.layers as k_lay
import keras.optimizers as k_opt
import keras.utils as k_ut

import utils

In [None]:
# Load the pre-built tweet dataset; later cells read its 'username' and 'tweets' entries.
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('cnn_input_ds.pkl', 'rb') as tweets_file:
    tweets_ds = pickle.load(tweets_file)

In [None]:
# Per-user table. Later cells read column 0 as the group label, column 1 as the
# user key, and every remaining column as a numeric statistic.
users_csv_path = '../../dataset/final_dataset/twitter_patterns_with_graph.csv'
users_ds = pd.read_csv(users_csv_path)

In [None]:
# Index every per-user record by the user key.
# Row layout of users_ds as consumed here: [group, user key, stats...].
user_rows = users_ds.values
user_2_stats = {row[1]: row[2:] for row in user_rows}
user_2_group = {row[1]: row[0] for row in user_rows}

# Map each username to its tweets entry; a repeated username keeps the last entry.
user_2_tweets = dict(zip(tweets_ds['username'], tweets_ds['tweets']))

In [None]:
# Fit the label binarizer on the distinct group labels found in the user table.
distinct_groups = list(set(user_2_group.values()))
lb = sk_p.LabelBinarizer()
lb.fit(distinct_groups)

In [None]:
# Build three index-aligned lists (tweet input, extra statistics, group label),
# one entry per user, following the iteration order of user_2_group.
# A user missing from user_2_tweets raises KeyError.
users = list(user_2_group)
cnn_x = [user_2_tweets[name] for name in users]
add_x = [user_2_stats[name] for name in users]
groups = [user_2_group[name] for name in users]

In [None]:
# Turn both feature lists into float64 NumPy arrays (same names are rebound).
cnn_x, add_x = (
    np.array(features).astype(np.float64) for features in (cnn_x, add_x)
)

In [None]:
# Settings derived from the prepared data.
test_split = 0.1  # fraction passed to train_test_split and to Keras validation_split
groups_num = len(set(user_2_group.values()))  # number of distinct group labels
# Shape of a single sample for each of the two model inputs.
cnn_input_shape, add_input_shape = cnn_x[0].shape, add_x[0].shape

In [None]:
# Shuffle the three collections together, then encode the labels with the fitted binarizer.
# Note: this cell rebinds cnn_x/add_x/groups, so it is meant to be run once per kernel session.
shuffled = sk_ut.shuffle(cnn_x, add_x, groups)
cnn_x, add_x, shuffled_groups = shuffled
groups = lb.transform(np.array(shuffled_groups))

In [None]:
# Split all three arrays with the same partition; test_split is the held-out fraction.
split_parts = sk_ms.train_test_split(
    cnn_x,
    add_x,
    groups,
    test_size=test_split,
    shuffle=True,
)
cnn_x_train, cnn_x_test, add_x_train, add_x_test, y_train, y_test = split_parts

In [None]:
# Two-input classifier: a small 1-D convolutional branch over the tweet input,
# whose flattened features are concatenated with the extra per-user statistics
# and fed to a dense head ending in a softmax over groups_num classes.
cnn_input = k_lay.Input(shape=cnn_input_shape, name='cnn_input')
net_x = k_lay.Conv1D(32, kernel_size=3, activation='sigmoid')(cnn_input)
net_x = k_lay.Conv1D(32, kernel_size=3, activation='sigmoid')(net_x)
net_x = k_lay.Flatten()(net_x)

additional_input = k_lay.Input(shape=add_input_shape, name='additional_input')
# Use the public keras.layers.Concatenate name rather than the
# keras.layers.merge submodule, which is not available in every Keras release.
net_common = k_lay.Concatenate()([net_x, additional_input])
net_common = k_lay.Dense(64, activation='sigmoid')(net_common)
# The Dropout output must be assigned back; previously it was combined with '-'
# and discarded, so the layer never became part of the model graph.
net_common = k_lay.Dropout(0.5)(net_common)
final_output = k_lay.Dense(groups_num, activation='softmax', name='final_output')(net_common)

model = k_mod.Model(inputs=[cnn_input, additional_input], outputs=[final_output])
model.compile(optimizer=k_opt.Adam(0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train on both inputs; a test_split fraction of the training data is held out
# by Keras for per-epoch validation.
train_inputs = [cnn_x_train, add_x_train]
model.fit(
    train_inputs,
    y_train,
    epochs=500,
    batch_size=128,
    validation_split=test_split,
)

In [None]:
# Predict class probabilities for the held-out set and map them back to group labels.
pred = model.predict([cnn_x_test, add_x_test])
# Index of the most probable class for every test sample.
pred_wrap = np.argmax(pred, axis=1)
# Fix the one-hot width to the number of fitted classes. Without num_classes,
# to_categorical sizes the matrix from the largest predicted index, which yields
# too few columns for inverse_transform when the last class is never predicted.
y_pred = lb.inverse_transform(
    k_ut.to_categorical(pred_wrap, num_classes=len(lb.classes_)))
y_test_wrap = lb.inverse_transform(y_test)

In [None]:
# Per-class precision / recall / F1 on the held-out set.
report = sk_mt.classification_report(y_test_wrap, y_pred)
print(report)

In [None]:
# Compute the project's score tuple and hand it to the project's printer.
scores = utils.score(y_test_wrap, y_pred)
utils.print_scores(*scores)

In [None]:
# Confusion matrix over the labels that actually occur in the test set, in sorted order.
class_labels = sorted(set(y_test_wrap))
utils.plot_confusion_matrix(y_test_wrap, y_pred, class_labels)

In [None]:
# Persist the exact train/test partition used above alongside the trained model.
dump_data = dict(
    train_cnn_x=cnn_x_train,
    test_cnn_x=cnn_x_test,
    train_add_x=add_x_train,
    test_add_x=add_x_test,
    train_groups=y_train,
    test_groups=y_test,
)

with open('cnn_add_state_dataset.pkl', 'wb') as dataset_file:
    pickle.dump(dump_data, dataset_file)

model.save('cnn_add_state_model.h5')