In [79]:
import importlib
import re
import numpy as np
import multiview as mv
import singleview as sv
import scipy
import pandas as pd
import ast #convert string list to list
import copy
from scipy import linalg, optimize
from numpy.linalg import multi_dot
from sklearn.svm import SVC
from sklearn.preprocessing import normalize



### Reading data

In [80]:
tweet_df = pd.read_csv('tweets_sample_preprocessed.csv', sep = '|')
tweet_df = tweet_df[tweet_df.UserID != 84165878]

### Feature extraction

In [81]:
"""Secondary functions"""
def count_phrase_freq(phrase, text):
    phrase = phrase.lower()
    text = text.lower()
    regex_obj = re.findall('\\b'+phrase+'\\b', text)
    if regex_obj:
        return len(regex_obj)
    else:
        return 0
spam_list = [line.rstrip('\n') for line in open('resources/spam_phrases.txt', 'r')]

def count_spam_phrases_per_tweet(spam_list, tweet):    
    count = 0
    for phrase in spam_list:
        count += count_phrase_freq(phrase, tweet)
    return count



#### Content-based features extraction

In [82]:
#add feature: num of mentions in tweet
tweet_df['NumOfMentions'] = tweet_df['Mention'].map(lambda x: len(ast.literal_eval(x)))


def retweet_rate(tweet_df):
    tweet_df['hasRetweet'] = tweet_df.Tweet.str.contains("^RE ")
    num_tweets_with_RT     = tweet_df.groupby('UserID')['hasRetweet'].sum()
    total_num_tweets       = tweet_df.groupby('UserID')['Tweet'].count()
    feature                = num_tweets_with_RT/total_num_tweets
    tweet_df.drop(columns='hasRetweet')
    return feature


def avg_length_of_tweet(tweet_df):
    tweet_df['Tweet_Length'] = tweet_df['Tweet'].str.len()
    tweet_length             = tweet_df.groupby('UserID')['Tweet_Length'].sum()
    num_of_tweets            = tweet_df.groupby('UserID')['Tweet_Length'].count()
    feature                  = tweet_length/num_of_tweets
    tweet_df.drop(columns='Tweet_Length', inplace=True)
    return feature

def avg_num_mentions_per_tweet(tweet_df):
    
    num_mentions_per_user = tweet_df.groupby('UserID')['NumOfMentions'].sum()
    num_tweets_per_user   = tweet_df.groupby('UserID')['Tweet'].count()
    feature               = num_mentions_per_user/num_tweets_per_user
    return feature

#count spam phrases in tweets, source: (https://blog.hubspot.com/blog/tabid/6307/bid/30684/the-ultimate-list-of-email-spam-trigger-words.aspx)
      
def avg_num_spam_phrases_per_tweet(tweet_df):
    tweet_df['NumSpamWords']  = list(map(lambda x: count_spam_phrases_per_tweet(spam_list, x), tweet_df.Tweet))
    sum_spam_phrases_per_user = tweet_df.groupby('UserID')['NumSpamWords'].sum()
    num_tweets_per_user       = tweet_df.groupby('UserID')['Tweet'].count()
    feature                   = sum_spam_phrases_per_user/num_tweets_per_user
    return feature
    
#tweet_df.drop(columns='NumOfMentions', inplace=True)   

#### Hashtag features extraction

In [83]:
#add feature: num of hashtags in tweet
tweet_df['NumOfHashtags'] = tweet_df.Hashtag.map(lambda x: len(ast.literal_eval(x)))

#average number of Hashtags per tweet
def avg_num_hashtags(tweet_df):
    count_URL_per_user    = tweet_df.groupby('UserID')['NumOfHashtags'].sum()
    count_Tweets_per_user = tweet_df.groupby('UserID')['Tweet'].count()
    return count_URL_per_user/count_Tweets_per_user

#
def avg_same_hashtag_count(tweet_df):
    tweet_df['isHashtagUnique']    = np.where(tweet_df['NumOfHashtags'] == 1, 1, 0)
    tweet_df['isHashtagDuplicate'] = np.where(tweet_df['NumOfHashtags'] > 1, 1, 0)
    num_unique_hashtags            = tweet_df.groupby('UserID')['isHashtagUnique'].sum()
    num_duplicate_hashtags         = tweet_df.groupby('UserID')['isHashtagDuplicate'].sum()
    total_tweet_count              = num_duplicate_hashtags = tweet_df.groupby('UserID')['Tweet'].count()
    feature = num_duplicate_hashtags/(num_unique_hashtags*total_tweet_count)
    feature = feature.replace(np.inf, 0)
    return feature


def num_hashtags_per_tweet(tweet_df):
    tweet_df['hasHashtag']     = tweet_df[tweet_df['NumOfHashtags'] > 0]
    total_tweet_count          = tweet_df.groupby('UserID')['Tweet'].count()
    num_tweets_with_hashtag    = tweet_df.groupby('UserID')['hasHashtag'].sum()
    feature = num_tweets_with_hashtag/total_tweet_count
    return feature
    


#tweet_df.drop(columns='NumOf#', inplace=True)

#### URL features extraction

In [84]:
#add feature: num of mentions in tweet
tweet_df['NumOfURLs'] = tweet_df['URL'].map(lambda x: len(ast.literal_eval(x)))


#average number of URLs per tweet
def avg_num_URLs(tweet_df):
    count_URL_per_user    = tweet_df.groupby('UserID')['NumOfURLs'].sum()
    count_Tweets_per_user = tweet_df.groupby('UserID')['Tweet'].count()
    return count_URL_per_user/count_Tweets_per_user

def avg_same_URL_count(tweet_df):
    tweet_df['isURLUnique']    = np.where(tweet_df['NumOfURLs'] == 1, 1, 0)
    tweet_df['isURLDuplicate'] = np.where(tweet_df['NumOfURLs'] > 1, 1, 0)
    num_unique_URLs            = tweet_df.groupby('UserID')['isURLUnique'].sum()
    num_duplicate_URLs         = tweet_df.groupby('UserID')['isURLDuplicate'].sum()
    total_tweet_count          = num_duplicate_URLs = tweet_df.groupby('UserID').Tweet.count()
    feature = num_duplicate_URLs/(num_unique_URLs*total_tweet_count)
    feature = feature.replace(np.inf, 0)
    return feature



#tweet_df.drop(columns='NumOfURLs#', inplace=True)

### Combining features into a single-view matrices

In [85]:
try:
    content_view_df = pd.read_csv(r'data/views_df_preprocessed/content_view_df.csv', sep = '|', index_col=0)
    URL_view_df     = pd.read_csv(r'data/views_df_preprocessed/URL_view_df.csv', sep = '|', index_col=0)
    hashtag_view_df = pd.read_csv(r'data/views_df_preprocessed/hashtag_view_df.csv', sep = '|', index_col=0)
except:
    #Content-based view
    content_view_df = pd.DataFrame(dict(AvgLengthOfTweets = avg_length_of_tweet(tweet_df), 
                                        RetweetRate       = retweet_rate(tweet_df),
                                        AvgNumMentions    = avg_num_mentions_per_tweet(tweet_df),
                                        AvgNumSpamPhrases = avg_num_spam_phrases_per_tweet(tweet_df)                                    
                                       ))

    #URL-based view
    URL_view_df = pd.DataFrame(dict(AvgNumURLs            = avg_num_URLs(tweet_df),
                                    AvgSameURLCount       = avg_same_URL_count(tweet_df)))

    #Hashtag-based view
    hashtag_view_df = pd.DataFrame(dict(AvgNumHashtags = avg_num_hashtags(tweet_df),
                                        AvgSamHashtagCount   = avg_same_hashtag_count(tweet_df)
                                       ))
    
    content_view_df.to_csv(r"data\views_df_preprocessed\content_view_df.csv", index= True, sep = '|')
    URL_view_df.to_csv(r"data\views_df_preprocessed\URL_view_df.csv", index= True, sep = '|')
    hashtag_view_df.to_csv(r"data\views_df_preprocessed\hashtag_view_df.csv", index= True, sep = '|')
    

#### Creating label matrix

In [86]:
users_legitimate_df = pd.read_csv('data\social_honeypot\legitimate_users.txt', 
                                  sep = '\t',
                                  names = ['UserID',
                                           'CreatedAt',
                                           'CollectedAt',
                                           'NumberOfFollowings',
                                           'NumberOfFollowers',
                                           'NumberOfTweets',
                                           'LengthOfScreenName',
                                           'LengthOfDescriptionInUserPro'])
users_polluters_df = pd.read_csv('data/social_honeypot/content_polluters.txt', 
                                  sep = '\t',
                                  names = ['UserID',
                                           'CreatedAt',
                                           'CollectedAt',
                                           'NumberOfFollowings',
                                           'NumberOfFollowers',
                                           'NumberOfTweets',
                                           'LengthOfScreenName',
                                           'LengthOfDescriptionInUserPro'])
tweet_df['isSpammer']    = np.where(tweet_df['UserID'].isin(list(users_polluters_df['UserID'])), -1, 0)
tweet_df['isLegitimate'] = np.where(tweet_df['UserID'].isin(list(users_legitimate_df['UserID'])), 1, 0)



class_label_df = tweet_df[['UserID','isLegitimate', 'isSpammer']].drop_duplicates(['UserID']).sort_values('UserID').set_index('UserID')
class_label_df = class_label_df[['isSpammer','isLegitimate']]


## Multiview Spam Detection Algorithm (MVSD)

In [213]:
importlib.reload(mv)
#content_view_df.AvgLengthOfTweets = content_view_df.AvgLengthOfTweets/content_view_df.AvgLengthOfTweets.max()
X_nv = [content_view_df, URL_view_df, hashtag_view_df]

#shuffle data points
X_nv = [df.sample(frac = 1, random_state = 2) for df in X_nv]

# normalize X
X_nv = [normalize(X, axis = 0,  norm = 'l1') for X in X_nv]

#transpose to correspond to the notations of dimensions used in the paper
X_nv = [np.transpose(X_nv[v]) for v in range(len(X_nv))]

Y = np.array(class_label_df.sample(frac = 1, random_state = 2))
obj = mv.multiview(X = X_nv, Y = Y, num_components = 50 )
obj.solve(training_size=0.60, learning_rate= 0.01, alpha=0.1)


Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 8207373.174590135; Current Value: 3779508.0243218937
Iter:  2;   Old Value 3779508.0243218937; Current Value: 3780524.023227998
Iter:  3;   Old Value 3780524.023227998; Current Value: 3781028.2573956936
Iter:  4;   Old Value 3781028.2573956936; Current Value: 3781299.364199938
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3781299.364199938; Current Value: 1583870.7184965704
Iter:  2;   Old Value 1583870.7184965704; Current Value: 1584433.1135721114
Iter:  3;   Old Value 1584433.1135721114; Current Value: 1584738.4122891957
Iter:  4;   Old Value 1584738.4122891957; Current Value: 1584915.8451587893
Iter:  5;   Old Value 1584915.8451587893; Current Value: 1585026.7740653127
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;  

Iter:  75;   Old Value 13334.65759752981; Current Value: 13331.950016126004
Iter:  76;   Old Value 13331.950016126004; Current Value: 13329.248038846283
Iter:  77;   Old Value 13329.248038846283; Current Value: 13326.551637408642
Iter:  78;   Old Value 13326.551637408642; Current Value: 13323.86078490023
Iter:  79;   Old Value 13323.86078490023; Current Value: 13321.175455880692
Iter:  80;   Old Value 13321.175455880692; Current Value: 13318.495626303
Iter:  81;   Old Value 13318.495626303; Current Value: 13315.821273497762
Iter:  82;   Old Value 13315.821273497762; Current Value: 13313.152375656142
Iter:  83;   Old Value 13313.152375656142; Current Value: 13310.48891232404
Iter:  84;   Old Value 13310.48891232404; Current Value: 13307.830863668461
Iter:  85;   Old Value 13307.830863668461; Current Value: 13305.178211276172
Iter:  86;   Old Value 13305.178211276172; Current Value: 13302.530937135514
Iter:  87;   Old Value 13302.530937135514; Current Value: 13299.889024139546
Iter:  88;

Iter:  181;   Old Value 13075.96475315784; Current Value: 13073.776099531027
Iter:  182;   Old Value 13073.776099531027; Current Value: 13071.591816133996
Iter:  183;   Old Value 13071.591816133996; Current Value: 13069.411894258721
Iter:  184;   Old Value 13069.411894258721; Current Value: 13067.236325212978
Iter:  185;   Old Value 13067.236325212978; Current Value: 13065.065100324831
Iter:  186;   Old Value 13065.065100324831; Current Value: 13062.898210940755
Iter:  187;   Old Value 13062.898210940755; Current Value: 13060.735648423102
Iter:  188;   Old Value 13060.735648423102; Current Value: 13058.577404150887
Iter:  189;   Old Value 13058.577404150887; Current Value: 13056.42346952152
Iter:  190;   Old Value 13056.42346952152; Current Value: 13054.273835950846
Iter:  191;   Old Value 13054.273835950846; Current Value: 13052.12849487252
Iter:  192;   Old Value 13052.12849487252; Current Value: 13049.987437735006
Iter:  193;   Old Value 13049.987437735006; Current Value: 13047.8506

Iter:  287;   Old Value 12866.691172163442; Current Value: 12864.920499594107
Iter:  288;   Old Value 12864.920499594107; Current Value: 12863.153364640748
Iter:  289;   Old Value 12863.153364640748; Current Value: 12861.389760254475
Iter:  290;   Old Value 12861.389760254475; Current Value: 12859.629679400388
Iter:  291;   Old Value 12859.629679400388; Current Value: 12857.87311505756
Iter:  292;   Old Value 12857.87311505756; Current Value: 12856.120060219006
Iter:  293;   Old Value 12856.120060219006; Current Value: 12854.370507891641
Iter:  294;   Old Value 12854.370507891641; Current Value: 12852.624451096297
Iter:  295;   Old Value 12852.624451096297; Current Value: 12850.881882867638
Iter:  296;   Old Value 12850.881882867638; Current Value: 12849.142796254177
Iter:  297;   Old Value 12849.142796254177; Current Value: 12847.40718431822
Iter:  298;   Old Value 12847.40718431822; Current Value: 12845.675040135857
Iter:  299;   Old Value 12845.675040135857; Current Value: 12843.946

Iter:  393;   Old Value 12697.393662001034; Current Value: 12695.961317943058
Iter:  394;   Old Value 12695.961317943058; Current Value: 12694.531837137618
Iter:  395;   Old Value 12694.531837137618; Current Value: 12693.105213875593
Iter:  396;   Old Value 12693.105213875593; Current Value: 12691.681442459214
Iter:  397;   Old Value 12691.681442459214; Current Value: 12690.260517202043
Iter:  398;   Old Value 12690.260517202043; Current Value: 12688.842432428932
Iter:  399;   Old Value 12688.842432428932; Current Value: 12687.427182476029
Iter:  400;   Old Value 12687.427182476029; Current Value: 12686.014761690723
Iter:  401;   Old Value 12686.014761690723; Current Value: 12684.60516443166
Iter:  402;   Old Value 12684.60516443166; Current Value: 12683.19838506868
Iter:  403;   Old Value 12683.19838506868; Current Value: 12681.794417982828
Iter:  404;   Old Value 12681.794417982828; Current Value: 12680.393257566337
Iter:  405;   Old Value 12680.393257566337; Current Value: 12678.994

Iter:  2;   Old Value 12610.004990343254; Current Value: 8974.69973372308
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 8974.69973372308; Current Value: 8959.060689252357
Iter:  2;   Old Value 8959.060689252357; Current Value: 8958.362088630012
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 8958.362088630012; Current Value: 8947.986027883187
Iter:  2;   Old Value 8947.986027883187; Current Value: 8948.534107399795
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 8948.534107399795; Current Value: 8938.2130010665
Iter:  2;   Old Value 8938.2130010665; Current Value: 8939.156630301077
Iter:  3;   Old Value 8939.156630301077; Current Value: 8939.806807390662
-----------------------------------------------

Iter:  97;   Old Value 8738.525857471004; Current Value: 8737.181973272161
Iter:  98;   Old Value 8737.181973272161; Current Value: 8735.846341959063
Iter:  99;   Old Value 8735.846341959063; Current Value: 8734.51882571256
Iter:  100;   Old Value 8734.51882571256; Current Value: 8733.19929015546
Iter:  101;   Old Value 8733.19929015546; Current Value: 8731.887603732499
Iter:  102;   Old Value 8731.887603732499; Current Value: 8730.58363761255
Iter:  103;   Old Value 8730.58363761255; Current Value: 8729.287266223557
Iter:  104;   Old Value 8729.287266223557; Current Value: 8727.998366789334
Iter:  105;   Old Value 8727.998366789334; Current Value: 8726.716819227227
Iter:  106;   Old Value 8726.716819227227; Current Value: 8725.442506016905
Iter:  107;   Old Value 8725.442506016905; Current Value: 8724.175312094716
Iter:  108;   Old Value 8724.175312094716; Current Value: 8722.915125206357
Iter:  109;   Old Value 8722.915125206357; Current Value: 8721.66183555913
Iter:  110;   Old Valu

Iter:  206;   Old Value 8622.88195158139; Current Value: 8622.008562898596
Iter:  207;   Old Value 8622.008562898596; Current Value: 8621.137507879079
Iter:  208;   Old Value 8621.137507879079; Current Value: 8620.268770104914
Iter:  209;   Old Value 8620.268770104914; Current Value: 8619.402333599171
Iter:  210;   Old Value 8619.402333599171; Current Value: 8618.53818228284
Iter:  211;   Old Value 8618.53818228284; Current Value: 8617.676300645957
Iter:  212;   Old Value 8617.676300645957; Current Value: 8616.816673321664
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  3;   Old Value 8974.69973372308; Current Value: 8616.816673321664
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 8616.816673321664; Current Value: 9256.322992549056
Iter:  2;   Old Value 9256.322992549056; Current Value: 8543.078687169733
Iter:  3; 

LinAlgError: Singular matrix

In [215]:
for i in range(obj.Y_train.shape[0]):
    """PREDICTING USER'S COHORT"""
    w1_w2 = obj.W.dot(np.transpose(obj.V_star[i,:]))
    #print(np.sum(w1_w2))
    if (np.sum(w1_w2) < 0):
       # 
        obj.Y_predict_train[i,:] = np.array([-1., 0.])
    else:
        obj.Y_predict_train[i,:] = np.array([0., 1.])

"""CONFUSION MATRIX TP|FP
                    TN|FN
""" 
confusion_matrix = np.zeros((2,2))
TP = 0
FP = 0
FN = 0
TN = 0
for i in range(obj.Y_predict_train.shape[0]):
    if (np.array_equal(obj.Y_train[i,:], obj.Y_predict_train[i,:]))and(np.array_equal(obj.Y_predict_train[i,:], np.array([-1., 0.]))):
        TP += 1
    if (np.array_equal(obj.Y_train[i,:], obj.Y_predict_train[i,:]))and(np.array_equal(obj.Y_predict_train[i,:], np.array([0., 1.]))):
        TN += 1
    if (np.array_equal(obj.Y_train[i,:], np.array([-1., 0.])))and(np.array_equal(obj.Y_predict_train[i,:], np.array([0., 1.]))):
        FP += 1
    if (np.array_equal(obj.Y_train[i,:], np.array([0., 1.])))and(np.array_equal(obj.Y_predict_train[i,:], np.array([-1., 0.]))):
        FN += 1

confusion_matrix = pd.DataFrame(data = {'Actual_Spammer': [TP, FN], 'Actual_Legitimate': [FP, TN]}, index = ['Predicted_Spammer ','Predicted_Legitimate'])
precision        = TP/(TP+FP)
recall           = TP/(TP+FN)
F1_score         = 2*TP/(2*TP + FP + FN)


print(confusion_matrix, precision, recall, F1_score)



                      Actual_Spammer  Actual_Legitimate
Predicted_Spammer               2809               1247
Predicted_Legitimate            1587               1867 0.6925542406311637 0.6389899909008189 0.6646947468054898


In [214]:
for i in range(obj.Y_train.shape[0], obj.ground_truth.shape[0]):
    """PREDICTING USER'S COHORT"""
    w1_w2 = obj.W.dot(np.transpose(obj.V_star[i,:]))
    #print(np.sum(w1_w2))
    if (np.sum(w1_w2) < 0):
        obj.Y_predict_test[obj._training_size - i,:] = np.array([-1., 0.])
    else:
        obj.Y_predict_test[obj._training_size - i,:] = np.array([0., 1.])

"""CONFUSION MATRIX TP|FP
                    TN|FN
""" 
confusion_matrix = np.zeros((2,2))
TP = 0
FP = 0
FN = 0
TN = 0
for i in range(obj.Y_predict_test.shape[0]):
    if (np.array_equal(obj.Y_test[i,:], obj.Y_predict_test[i,:]))and(np.array_equal(obj.Y_predict_test[i,:], np.array([-1., 0.]))):
        TP += 1
    if (np.array_equal(obj.Y_test[i,:], obj.Y_predict_test[i,:]))and(np.array_equal(obj.Y_predict_test[i,:], np.array([0., 1.]))):
        TN += 1
    if (np.array_equal(obj.Y_test[i,:], np.array([-1., 0.])))and(np.array_equal(obj.Y_predict_test[i,:], np.array([0., 1.]))):
        FP += 1
    if (np.array_equal(obj.Y_test[i,:], np.array([0., 1.])))and(np.array_equal(obj.Y_predict_test[i,:], np.array([-1., 0.]))):
        FN += 1

confusion_matrix = pd.DataFrame(data = {'Actual_Spammer': [TP, FN], 'Actual_Legitimate': [FP, TN]}, index = ['Predicted_Spammer ','Predicted_Legitimate'])
precision        = TP/(TP+FP)
recall           = TP/(TP+FN)
F1_score         = 2*TP/(2*TP + FP + FN)


print(confusion_matrix, precision, recall, F1_score)

                      Actual_Spammer  Actual_Legitimate
Predicted_Spammer               2647                  0
Predicted_Legitimate            2361                  0 1.0 0.5285543130990416 0.69157413455258


### Comparison with single-view approaches

#### Content view features

In [76]:
importlib.reload(sv)
X_nv = [content_view_df, URL_view_df, hashtag_view_df]
X_nv = [df.sample(frac = 1, random_state = 2) for df in X_nv]
#X_nv = [normalize(X, axis = 0,  norm = 'l1') for X in X_nv]
X_nv = [np.transpose(X_nv[v]) for v in range(len(X_nv))]
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))

content_view_svm = sv.singleview(data = X_nv[0],  class_ = Y)
model_svm  = SVC(gamma = "auto")
precision, recall, F1_score, confusion_matrix_CV = content_view_svm.evaluate(model = model_svm, training_size=0.8)

Precision: 0.7645659928656362

Recall: 0.7737665463297232

F1-score: 0.7691387559808612

Confusion Matrix: [[1286  396]
 [ 376 1072]]

