In [27]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

import warnings
warnings.simplefilter('ignore')

In [28]:
# Load the raw account tables from CSV files in the working directory:
# three "social spambot" files, two "traditional spambot" files, one
# "fake followers" file and one "genuine accounts" file.
social1, social2, social3 = [
    pd.read_csv(csv_path)
    for csv_path in ('social_spambots_1.csv', 'social_spambots_2.csv', 'social_spambots_3.csv')
]
traditional1, traditional2 = [
    pd.read_csv(csv_path)
    for csv_path in ('traditional_spambots_1.csv', 'traditional_spambots_2.csv')
]
fake = pd.read_csv('fake_followers.csv')
genuine = pd.read_csv('genuine_accounts.csv')

In [29]:
# Raw columns that the feature-engineering step (clean_df) reads.
requiredColumns = [
    'created_at', 'updated',
    'statuses_count', 'friends_count', 'followers_count', 'favourites_count',
    'default_profile', 'geo_enabled',
]

# Narrow every dataset to exactly those columns; a dataset that lacks one of
# them raises KeyError here.
social1, social2, social3, traditional1, traditional2, fake, genuine = [
    frame[requiredColumns]
    for frame in (social1, social2, social3, traditional1, traditional2, fake, genuine)
]

In [30]:
def _to_naive_utc(column):
    """Parse a column of timestamps into timezone-naive UTC datetimes (NaT where unparseable)."""
    parsed = pd.to_datetime(column, errors='coerce', utc=True)
    # Drop the timezone so that columns written with and without an offset
    # can be subtracted from each other.
    return parsed.dt.tz_localize(None)


def clean_df(df):
    """Turn a raw account table into the numeric feature table used for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'created_at', 'updated', 'statuses_count',
        'friends_count', 'followers_count', 'favourites_count',
        'default_profile' and 'geo_enabled'. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose index labels are converted to ``str`` and whose
        columns are, in order:

        * ``age`` - whole days between 'created_at' and 'updated'; 0 when
          either timestamp cannot be parsed.
        * ``total_tweets``, ``total_following``, ``total_followers``,
          ``total_likes`` - the raw 'statuses_count', 'friends_count',
          'followers_count' and 'favourites_count' values, renamed.
        * ``geo_located``, ``default_profile`` - 1 when the raw
          'geo_enabled' / 'default_profile' cell holds a value, 0 when it is
          missing.
        * ``less_than_30_followers`` - 1 when followers_count < 30.
        * ``less_than_50_tweets`` - 1 when statuses_count < 50.
        * ``more_than_1000_followers`` - 1 when followers_count > 1000.
        * ``following_to_followers`` - 1 when friends_count / followers_count
          is at least 100 (a follower count of 0 is treated as 1).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    created = _to_naive_utc(df['created_at'])
    updated = _to_naive_utc(df['updated'])
    # .dt.days yields NaN wherever either timestamp is NaT; those rows get age 0.
    age = (updated - created).dt.days.fillna(0).astype(np.int64)

    followers = df['followers_count']
    # Treat zero followers as one so the ratio below never divides by zero.
    safe_followers = followers.where(followers != 0, 1)

    # assign() builds a new frame, so the caller's frame is left untouched.
    # Comparisons against a missing count evaluate to False, i.e. flag 0.
    enriched = df.assign(
        age=age,
        geo_located=df['geo_enabled'].notna().astype(np.int64),
        default_profile=df['default_profile'].notna().astype(np.int64),
        less_than_30_followers=(followers < 30).astype(np.int64),
        less_than_50_tweets=(df['statuses_count'] < 50).astype(np.int64),
        more_than_1000_followers=(followers > 1000).astype(np.int64),
        following_to_followers=(df['friends_count'] / safe_followers >= 100).astype(np.int64),
    )

    enriched = enriched.rename(index=str, columns={"statuses_count": "total_tweets", "friends_count": "total_following",
                                                   "followers_count": "total_followers", "favourites_count": "total_likes"})

    return enriched[['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'geo_located',
                     'default_profile', 'less_than_30_followers', 'less_than_50_tweets', 'more_than_1000_followers',
                     'following_to_followers']]


In [31]:
# Replace every raw dataset with its engineered feature table.
social1, social2, social3, traditional1, traditional2, fake, genuine = [
    clean_df(frame)
    for frame in (social1, social2, social3, traditional1, traditional2, fake, genuine)
]

In [32]:
# Pool every bot dataset into one table, then shuffle the rows.
# The shuffle is seeded (same seed as the train/test splits below) so that a
# fresh "Restart & Run All" reproduces the same row order.
combine = pd.concat([social1, social2, social3, traditional1, traditional2, fake])
bot = combine.sample(frac=1, random_state=21).reset_index(drop=True)

In [33]:
# Tag each table with its class: 'B' for bot accounts, 'H' for genuine (human) ones.
bot = bot.assign(Label='B')
genuine = genuine.assign(Label='H')

In [34]:
# Keep a random subset of at most 640 bot accounts.
# Seeded so the same accounts are selected on every run.
bot = bot.sample(frac=1, random_state=21)
bot = bot.head(640)

In [35]:
# Merge bots and genuine accounts, shuffle them together and give the result a
# clean 0..n-1 index. The shuffle is seeded so that row positions (which later
# cells use to look labels up) are reproducible across runs.
combined_df = pd.concat([bot, genuine])
new_df = combined_df.sample(frac=1, random_state=21).reset_index(drop=True)
new_df.head()

Unnamed: 0,age,total_tweets,total_following,total_followers,total_likes,geo_located,default_profile,less_than_30_followers,less_than_50_tweets,more_than_1000_followers,following_to_followers,Label
0,992,10924,185,578,13651,0,0,0,0,0,0,H
1,1762,7863,70,106,3771,1,0,0,0,0,0,H
2,1448,49,0,0,0,0,0,1,1,0,0,B
3,1797,146442,1787,2282,10527,1,0,0,0,1,0,H
4,749,2083,109,110,5621,1,1,0,0,0,0,H


In [36]:
# Class balance of the combined table.
new_df['Label'].value_counts()

H    3474
B     640
Name: Label, dtype: int64

In [37]:
# Split the table into the feature matrix (`data`) and a one-column frame of
# class labels (`labels`) that shares the same 0..n-1 row positions.
data = new_df.drop(columns=['Label'])
labels = new_df[['Label']].reset_index(drop=True)

In [38]:
def scale_df(df, names=None):
    """Return a copy of ``df`` with the selected columns min-max scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is not modified; a scaled copy is returned.
    names : sequence of str, optional
        Columns to scale. Defaults to the continuous features
        'age', 'total_tweets', 'total_following', 'total_followers' and
        'total_likes'. All other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column in ``names`` has been transformed
        by a freshly fitted ``MinMaxScaler``.

    Notes
    -----
    The scaler is fitted on whatever frame is passed in. NOTE(review): the
    notebook calls this on the full table before the train/test split, so the
    held-out rows contribute to the fitted min/max - confirm that is intended.
    """
    if names is None:
        names = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']
    names = list(names)

    # Work on a copy so the caller's frame keeps its unscaled values.
    scaled = df.copy()
    scaler = preprocessing.MinMaxScaler()
    scaled[names] = scaler.fit_transform(scaled[names])
    return scaled

# Min-max scale the continuous feature columns and preview the result.
data = data.pipe(scale_df)
data.head()

Unnamed: 0,age,total_tweets,total_following,total_followers,total_likes,geo_located,default_profile,less_than_30_followers,less_than_50_tweets,more_than_1000_followers,following_to_followers
0,0.297006,0.02734,0.003995,0.000586,0.043481,0,0,0,0,0,0
1,0.527545,0.019679,0.001512,0.000107,0.012011,1,0,0,0,0,0
2,0.433533,0.000123,0.0,0.0,0.0,0,0,1,1,0,0
3,0.538024,0.366513,0.038588,0.002312,0.03353,1,0,0,0,1,0
4,0.224251,0.005213,0.002354,0.000111,0.017904,1,1,0,0,0,0


In [39]:
# Hold out 20% of the rows for the final evaluation (fixed seed).
df_train, df_test = train_test_split(data, random_state=21, test_size=0.2)

# Remember which original rows ended up in the test set, then keep only the
# raw feature array for prediction.
test_index = df_test.index.tolist()
df_test = df_test.to_numpy()

df_train.shape, df_test.shape

((3291, 11), (823, 11))

In [40]:
# Boolean class masks over the whole labelled table.
b = labels['Label'] == 'B'
h = labels['Label'] == 'H'

# Restrict each mask to df_train's own row labels before indexing, so the
# mask and the frame are exactly aligned instead of relying on pandas to
# silently reindex a longer boolean Series.
df_b = df_train[b.loc[df_train.index]]
df_h = df_train[h.loc[df_train.index]]

# Plain feature arrays for the two autoencoders.
x_b = df_b.values
x_h = df_h.values

In [41]:
# (rows, features) of the bot and human training arrays.
tuple(part.shape for part in (x_b, x_h))

((507, 11), (2784, 11))

In [42]:
def Model():
    """Build and compile a fresh dense autoencoder sized to the feature table.

    The input and output widths both equal ``data.shape[1]``; the hidden
    layers have 128, 32 and 128 ReLU units. The output layer has no
    activation. The network is compiled with mean-squared-error loss and the
    'adam' optimizer.
    """
    n_features = data.shape[1]
    layer_stack = (
        Dense(128, input_dim=n_features, activation='relu'),
        Dense(32, activation='relu'),
        Dense(128, activation='relu'),
        Dense(n_features),
    )

    autoencoder = Sequential()
    for layer in layer_stack:
        autoencoder.add(layer)
    autoencoder.compile(loss='mean_squared_error', optimizer='adam')
    return autoencoder

In [43]:
# Stop a fit once validation loss has gone 5 epochs without improving, and
# restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [44]:
def evaluate(model):
    """Return the model's reconstruction RMSE on the bot and human training arrays.

    The result is a 2-tuple ``(rmse_on_x_b, rmse_on_x_h)``, where each entry is
    the square root of the mean squared error between ``model.predict(x)`` and
    ``x`` itself.
    """
    def _reconstruction_rmse(samples):
        rebuilt = model.predict(samples)
        return np.sqrt(metrics.mean_squared_error(rebuilt, samples))

    bot_rmse = _reconstruction_rmse(x_b)
    human_rmse = _reconstruction_rmse(x_h)
    return (bot_rmse, human_rmse)

In [45]:
# Autoencoder trained to reconstruct bot accounts only. 20% of the bot rows
# are held back as the validation set that drives early stopping.
x_b_train, x_b_test = train_test_split(x_b, random_state=21, test_size=0.2)

b_model = Model()
b_model.fit(
    x_b_train, x_b_train,
    validation_data=(x_b_test, x_b_test),
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)

# (RMSE on bot rows, RMSE on human rows)
evaluate(b_model)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Restoring model weights from the end of the best epoch.
Epoch 00062: early stopping


(0.009324896682563241, 0.038528377143327824)

In [46]:
# Autoencoder trained to reconstruct human accounts only. 20% of the human
# rows are held back as the validation set that drives early stopping.
x_h_train, x_h_test = train_test_split(x_h, random_state=21, test_size=0.2)

h_model = Model()
h_model.fit(
    x_h_train, x_h_train,
    validation_data=(x_h_test, x_h_test),
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)

# (RMSE on bot rows, RMSE on human rows)
evaluate(h_model)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping


(0.05606625261668509, 0.003225138093944767)

In [47]:
# Order matters: get_pred returns a position in this list, and get_label
# encodes 'B' as 0 and 'H' as 1, so the bot model must come first.
models = [
    b_model,
    h_model,
]

In [48]:
def get_pred(df_test, models):
    """Classify each row by the model that reconstructs it best.

    Parameters
    ----------
    df_test : array-like of shape (n_rows, n_features)
        Feature rows to classify. Any number of features is accepted.
    models : sequence
        Objects with a ``predict(array)`` method that returns a reconstruction
        of the same shape as its input.

    Returns
    -------
    list of int
        For every row, the position in ``models`` of the model with the
        smallest reconstruction RMSE (the first one wins a tie). An empty
        input yields an empty list.
    """
    rows = np.asarray(df_test)
    if len(rows) == 0:
        return []
    # One flat feature vector per row, whatever the feature count is.
    rows = rows.reshape(len(rows), -1)

    # Each model reconstructs the whole batch in a single predict() call
    # rather than one call per row; the error is then reduced per row.
    per_model_rmse = np.stack([
        np.sqrt(np.mean((np.asarray(model.predict(rows)).reshape(rows.shape) - rows) ** 2, axis=1))
        for model in models
    ])  # shape: (n_models, n_rows)

    return np.argmin(per_model_rmse, axis=0).tolist()

In [49]:
def get_label(test_index, label_frame=None):
    """Translate the stored class labels of the given rows into integer codes.

    Parameters
    ----------
    test_index : sequence of int
        Row positions to look up.
    label_frame : pandas.DataFrame, optional
        Frame whose first column holds the class labels. Defaults to the
        notebook-level ``labels`` frame.

    Returns
    -------
    list of int
        One code per entry of ``test_index``, in the same order:
        0 when the label starts with 'B', 1 when it starts with 'H'.

    Raises
    ------
    ValueError
        If a label starts with neither 'B' nor 'H'. Skipping such a row would
        make the result shorter than ``test_index`` and misalign it with the
        predictions it is compared against.
    """
    if label_frame is None:
        label_frame = labels
    label_column = label_frame.iloc[:, 0]
    code_for = {'B': 0, 'H': 1}

    num_label = []
    for position in test_index:
        raw = label_column.iloc[position]
        # Only the first character of the label is significant.
        initial = str(raw)[:1]
        if initial not in code_for:
            raise ValueError(
                "Unknown label %r at position %r; expected a label starting with 'B' or 'H'"
                % (raw, position)
            )
        num_label.append(code_for[initial])
    return num_label

In [50]:
# True class codes and predicted class codes for the held-out rows.
num_label = get_label(test_index)
pred_class = get_pred(df_test, models)

# Number of held-out rows whose predicted class matches the true class.
pred_correct = sum(
    1
    for row in range(len(num_label))
    if num_label[row] == pred_class[row]
)

In [51]:
# Share of held-out rows classified correctly, as a percentage.
print(
    'Accuracy:',
    pred_correct / len(num_label) * 100,
)

Accuracy: 96.47630619684082
