In [77]:
import ast  # convert string list to list
import importlib
import os
import re

import numpy as np
import pandas as pd
import scipy
from scipy import linalg
from sklearn.preprocessing import normalize
from sklearn.svm import SVC

import multiview as mv
import singleview as sv

### Reading data

In [78]:
# Load the pre-processed tweet sample: a pipe-separated CSV stored inside a zip archive.
tweet_df = pd.read_csv('data/tweets_sample_preprocessed.zip', sep='|', compression='zip')
# Remove every row belonging to user 84165878.
# NOTE(review): the reason for excluding this account is not recorded here - confirm.
tweet_df = tweet_df.loc[tweet_df['UserID'] != 84165878]

### Feature extraction

In [79]:
"""Secondary functions"""
def count_phrase_freq(phrase, text):
    """Count case-insensitive, whole-phrase occurrences of ``phrase`` in ``text``.

    The phrase is matched literally (regex metacharacters such as ``$``, ``+``
    or ``%`` are escaped) and only where it is not glued to a neighbouring
    word character, so ``"free"`` is found in ``"FREE!"`` but not in
    ``"freedom"``.

    Parameters
    ----------
    phrase : str
        Phrase to look for. An empty phrase never matches.
    text : str
        Text to search.

    Returns
    -------
    int
        Number of non-overlapping occurrences (0 when there are none).
    """
    phrase = phrase.lower()
    if not phrase:
        # An empty pattern would match at every position and inflate the count.
        return 0
    # Lookarounds instead of \b: \b requires a word character on one side, so it
    # fails for phrases that start or end with punctuation (e.g. "$$$", "100%").
    pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
    return len(re.findall(pattern, text.lower()))
# Spam trigger phrases, one per line of 'spam_phrases.txt'.
# The file is opened in a context manager so the handle is closed deterministically,
# and blank lines are skipped so no empty phrase ends up in the list.
with open('spam_phrases.txt', 'r') as spam_file:
    spam_list = [line.rstrip('\n') for line in spam_file if line.rstrip('\n')]

def count_spam_phrases_per_tweet(spam_list, tweet):
    """Return the total number of occurrences of all phrases in ``spam_list`` within ``tweet``.

    Each phrase is counted with ``count_phrase_freq`` and the per-phrase
    counts are added up; an empty ``spam_list`` yields 0.
    """
    return sum(count_phrase_freq(spam_phrase, tweet) for spam_phrase in spam_list)



#### Content-based features extraction

In [80]:
# Per-tweet feature: number of mentions.
# 'Mention' holds a Python literal as text; parse it, then take its length.
tweet_df['NumOfMentions'] = tweet_df['Mention'].map(ast.literal_eval).map(len)


def retweet_rate(tweet_df):
    """Fraction of each user's tweets whose text starts with ``"RE "``.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain the columns ``UserID`` and ``Tweet``. The frame is not
        modified (no helper column is left behind).

    Returns
    -------
    pandas.Series
        Indexed by ``UserID``: (# tweets matching ``^RE ``) / (# non-null tweets).
    """
    # NOTE(review): the marker is "RE " as in the original code; if retweets in
    # this data set are prefixed with "RT " the pattern needs changing - confirm.
    # na=False: a missing tweet text is simply "not a retweet" instead of NaN.
    is_retweet = tweet_df['Tweet'].str.contains("^RE ", regex=True, na=False)
    # Work on a small copy so the caller's DataFrame keeps its original columns.
    flagged = tweet_df[['UserID', 'Tweet']].assign(hasRetweet=is_retweet.astype(int))
    per_user = flagged.groupby('UserID')
    return per_user['hasRetweet'].sum() / per_user['Tweet'].count()


def avg_length_of_tweet(tweet_df):
    """Average tweet length (in characters) per user.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain the columns ``UserID`` and ``Tweet``. The frame is not
        modified; in particular an existing ``Tweet_Length`` column is left
        untouched.

    Returns
    -------
    pandas.Series
        Indexed by ``UserID`` and named ``Tweet_Length``: sum of tweet lengths
        divided by the number of tweets with a defined length.
    """
    # Build the helper column on a one-column copy instead of adding it to, and
    # then dropping it from, the caller's DataFrame.
    lengths = tweet_df[['UserID']].assign(Tweet_Length=tweet_df['Tweet'].str.len())
    per_user = lengths.groupby('UserID')['Tweet_Length']
    return per_user.sum() / per_user.count()

def avg_num_mentions_per_tweet(tweet_df):
    """Average number of mentions per tweet, for every user.

    Expects the columns ``UserID``, ``Tweet`` and ``NumOfMentions``.
    Returns a Series indexed by ``UserID``: the user's summed
    ``NumOfMentions`` divided by the user's count of non-null tweets.
    """
    per_user = tweet_df.groupby('UserID')
    mentions_total = per_user['NumOfMentions'].sum()
    tweets_total = per_user['Tweet'].count()
    return mentions_total / tweets_total

#count spam phrases in tweets, source: (https://blog.hubspot.com/blog/tabid/6307/bid/30684/the-ultimate-list-of-email-spam-trigger-words.aspx)

def avg_num_spam_phrases_per_tweet(tweet_df):
    """Average number of spam-phrase hits per tweet, for every user.

    Uses the module-level ``spam_list`` and ``count_spam_phrases_per_tweet``.
    Side effect: stores the per-tweet hit count in ``tweet_df['NumSpamWords']``.

    Returns a Series indexed by ``UserID``: the user's summed hit count
    divided by the user's count of non-null tweets.
    """
    tweet_df['NumSpamWords'] = [
        count_spam_phrases_per_tweet(spam_list, tweet_text)
        for tweet_text in tweet_df.Tweet
    ]
    per_user = tweet_df.groupby('UserID')
    spam_hits_total = per_user['NumSpamWords'].sum()
    tweets_total = per_user['Tweet'].count()
    return spam_hits_total / tweets_total
    
#tweet_df.drop(columns='NumOfMentions', inplace=True)   

#### Hashtag features extraction

In [81]:
# Per-tweet feature: number of hashtags.
# 'Hashtag' holds a Python literal as text; parse it, then take its length.
tweet_df['NumOfHashtags'] = tweet_df['Hashtag'].map(ast.literal_eval).map(len)

def avg_num_hashtags(tweet_df):
    """Average number of hashtags per tweet, for every user.

    Expects the columns ``UserID``, ``Tweet`` and ``NumOfHashtags``.
    Returns a Series indexed by ``UserID``: the user's summed
    ``NumOfHashtags`` divided by the user's count of non-null tweets.
    """
    per_user = tweet_df.groupby('UserID')
    hashtags_total = per_user['NumOfHashtags'].sum()
    tweets_total = per_user['Tweet'].count()
    return hashtags_total / tweets_total

def avg_same_hashtag_count(tweet_df):
    """Per-user ratio of multi-hashtag tweets to (single-hashtag tweets * all tweets).

    A tweet is flagged "unique" when ``NumOfHashtags == 1`` and "duplicate"
    when ``NumOfHashtags > 1``. For every user the feature is

        n_duplicate / (n_unique * n_tweets)

    and is set to 0 wherever the ratio is undefined (division by zero, i.e.
    the user has no single-hashtag tweet).

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain ``UserID``, ``Tweet`` and ``NumOfHashtags``.
        Side effect: the flag columns ``isHashtagUnique`` and
        ``isHashtagDuplicate`` are written to this frame.

    Returns
    -------
    pandas.Series
        Indexed by ``UserID``; always finite.
    """
    tweet_df['isHashtagUnique']    = np.where(tweet_df['NumOfHashtags'] == 1, 1, 0)
    tweet_df['isHashtagDuplicate'] = np.where(tweet_df['NumOfHashtags'] > 1, 1, 0)
    per_user = tweet_df.groupby('UserID')
    num_unique_hashtags    = per_user['isHashtagUnique'].sum()
    num_duplicate_hashtags = per_user['isHashtagDuplicate'].sum()
    # Kept as a separate variable: a chained assignment here previously
    # overwrote the duplicate count with the total tweet count.
    total_tweet_count      = per_user['Tweet'].count()
    feature = num_duplicate_hashtags / (num_unique_hashtags * total_tweet_count)
    # x/0 gives inf and 0/0 gives NaN; both mean "no single-hashtag tweets" -> 0.
    return feature.replace(np.inf, 0).fillna(0)


def num_hashtags_per_tweet(tweet_df):
    """Fraction of each user's tweets that contain at least one hashtag.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain ``UserID``, ``Tweet`` and ``NumOfHashtags``.
        The frame is not modified.

    Returns
    -------
    pandas.Series
        Indexed by ``UserID``: (# tweets with ``NumOfHashtags > 0``) /
        (# non-null tweets).
    """
    # Boolean flag per tweet (cast to int so it can be summed). The previous
    # code assigned a whole filtered DataFrame to one column, which fails.
    has_hashtag = (tweet_df['NumOfHashtags'] > 0).astype(int)
    flagged = tweet_df[['UserID', 'Tweet']].assign(hasHashtag=has_hashtag)
    per_user = flagged.groupby('UserID')
    return per_user['hasHashtag'].sum() / per_user['Tweet'].count()
    


#tweet_df.drop(columns='NumOf#', inplace=True)

#### URL features extraction

In [82]:
# Per-tweet feature: number of URLs.
# 'URL' holds a Python literal as text; parse it, then take its length.
tweet_df['NumOfURLs'] = tweet_df['URL'].map(ast.literal_eval).map(len)


def avg_num_URLs(tweet_df):
    """Average number of URLs per tweet, for every user.

    Expects the columns ``UserID``, ``Tweet`` and ``NumOfURLs``.
    Returns a Series indexed by ``UserID``: the user's summed
    ``NumOfURLs`` divided by the user's count of non-null tweets.
    """
    per_user = tweet_df.groupby('UserID')
    urls_total = per_user['NumOfURLs'].sum()
    tweets_total = per_user['Tweet'].count()
    return urls_total / tweets_total

def avg_same_URL_count(tweet_df):
    """Per-user ratio of multi-URL tweets to (single-URL tweets * all tweets).

    A tweet is flagged "unique" when ``NumOfURLs == 1`` and "duplicate" when
    ``NumOfURLs > 1``. For every user the feature is

        n_duplicate / (n_unique * n_tweets)

    and is set to 0 wherever the ratio is undefined (division by zero, i.e.
    the user has no single-URL tweet).

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain ``UserID``, ``Tweet`` and ``NumOfURLs``.
        Side effect: the flag columns ``isURLUnique`` and ``isURLDuplicate``
        are written to this frame.

    Returns
    -------
    pandas.Series
        Indexed by ``UserID``; always finite.
    """
    tweet_df['isURLUnique']    = np.where(tweet_df['NumOfURLs'] == 1, 1, 0)
    tweet_df['isURLDuplicate'] = np.where(tweet_df['NumOfURLs'] > 1, 1, 0)
    per_user = tweet_df.groupby('UserID')
    num_unique_URLs    = per_user['isURLUnique'].sum()
    num_duplicate_URLs = per_user['isURLDuplicate'].sum()
    # Kept as a separate variable: a chained assignment here previously
    # overwrote the duplicate count with the total tweet count.
    total_tweet_count  = per_user['Tweet'].count()
    feature = num_duplicate_URLs / (num_unique_URLs * total_tweet_count)
    # x/0 gives inf and 0/0 gives NaN; both mean "no single-URL tweets" -> 0.
    return feature.replace(np.inf, 0).fillna(0)



#tweet_df.drop(columns='NumOfURLs#', inplace=True)

### Combining features into single-view matrices

In [None]:
# The per-view feature matrices are cached as pipe-separated CSV files.
# The same forward-slash paths are used for reading and writing, so a cache
# written by this cell is found again on the next run on every platform.
VIEWS_DIR         = 'data/views_df_preprocessed'
CONTENT_VIEW_PATH = VIEWS_DIR + '/content_view_df.csv'
URL_VIEW_PATH     = VIEWS_DIR + '/URL_view_df.csv'
HASHTAG_VIEW_PATH = VIEWS_DIR + '/hashtag_view_df.csv'

try:
    content_view_df = pd.read_csv(CONTENT_VIEW_PATH, sep = '|', index_col=0)
    URL_view_df     = pd.read_csv(URL_VIEW_PATH, sep = '|', index_col=0)
    hashtag_view_df = pd.read_csv(HASHTAG_VIEW_PATH, sep = '|', index_col=0)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Cache missing or unreadable: recompute the features from tweet_df.
    # Only cache-related errors are caught, so programming errors and
    # KeyboardInterrupt are no longer silently swallowed.

    #Content-based view
    content_view_df = pd.DataFrame(dict(AvgLengthOfTweets = avg_length_of_tweet(tweet_df),
                                        RetweetRate       = retweet_rate(tweet_df),
                                        AvgNumMentions    = avg_num_mentions_per_tweet(tweet_df),
                                        AvgNumSpamPhrases = avg_num_spam_phrases_per_tweet(tweet_df)
                                       ))

    #URL-based view
    URL_view_df = pd.DataFrame(dict(AvgNumURLs            = avg_num_URLs(tweet_df),
                                    AvgSameURLCount       = avg_same_URL_count(tweet_df)))

    #Hashtag-based view
    hashtag_view_df = pd.DataFrame(dict(AvgNumHashtags = avg_num_hashtags(tweet_df),
                                        AvgSamHashtagCount   = avg_same_hashtag_count(tweet_df)
                                       ))

    # Make sure the cache directory exists before writing into it.
    os.makedirs(VIEWS_DIR, exist_ok=True)
    content_view_df.to_csv(CONTENT_VIEW_PATH, index= True, sep = '|')
    URL_view_df.to_csv(URL_VIEW_PATH, index= True, sep = '|')
    hashtag_view_df.to_csv(HASHTAG_VIEW_PATH, index= True, sep = '|')
    

#### Creating label matrix

In [None]:
# Column names shared by both tab-separated user files.
HONEYPOT_USER_COLUMNS = ['UserID',
                         'CreatedAt',
                         'CollectedAt',
                         'NumberOfFollowings',
                         'NumberOfFollowers',
                         'NumberOfTweets',
                         'LengthOfScreenName',
                         'LengthOfDescriptionInUserPro']

# Forward slashes in both paths: the previous backslash path contained the
# invalid escape sequences "\s" and "\l" and only resolved on Windows.
users_legitimate_df = pd.read_csv('data/social_honeypot/legitimate_users.txt',
                                  sep = '\t',
                                  names = HONEYPOT_USER_COLUMNS)
users_polluters_df = pd.read_csv('data/social_honeypot/content_polluters.txt',
                                 sep = '\t',
                                 names = HONEYPOT_USER_COLUMNS)

# Per-tweet label columns: -1 when the author is listed as a content polluter,
# 1 when the author is listed as a legitimate user, 0 otherwise.
tweet_df['isSpammer']    = np.where(tweet_df['UserID'].isin(list(users_polluters_df['UserID'])), -1, 0)
tweet_df['isLegitimate'] = np.where(tweet_df['UserID'].isin(list(users_legitimate_df['UserID'])), 1, 0)

# One label row per user, sorted and indexed by UserID, columns in the order
# [isSpammer, isLegitimate].
class_label_df = (tweet_df[['UserID', 'isSpammer', 'isLegitimate']]
                  .drop_duplicates(['UserID'])
                  .sort_values('UserID')
                  .set_index('UserID'))
class_label_df = class_label_df[['isSpammer', 'isLegitimate']]


## Multiview Spam Detection Algorithm (MVSD)

In [None]:
importlib.reload(mv)
#content_view_df.AvgLengthOfTweets = content_view_df.AvgLengthOfTweets/content_view_df.AvgLengthOfTweets.max()

# For every view: shuffle the rows with a fixed seed, L1-normalise each feature
# column (axis=0), then transpose to correspond to the notations of dimensions
# used in the paper.
X_nv = [
    np.transpose(normalize(view_df.sample(frac = 1, random_state = 2), axis = 0, norm = 'l1'))
    for view_df in (content_view_df, URL_view_df, hashtag_view_df)
]

# Labels are shuffled with the same seed as the views.
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))

mvsd = mv.multiview(X = X_nv, Y = Y, num_components = 10)
mvsd.solve(training_size = 0.70, learning_rate = 0.001, alpha = 0.01)



In [None]:
# Report the metrics returned by mvsd.evaluate_train(): confusion matrix first,
# then precision, recall and F1-score.
confusion_matrix, precision, recall, F1_score = mvsd.evaluate_train()
confusion_matrix_ = pd.DataFrame(
    data = {'Actual_Spammer': confusion_matrix[:,0],
            'Actual_Legitimate': confusion_matrix[:,1]},
    index = ['Predicted_Spammer ','Predicted_Legitimate'],
)
print(confusion_matrix_)
print("\n")
for report_line in ("Precision: {}\n".format(precision),
                    "Recall: {}\n".format(recall),
                    "F1-score: {}\n".format(F1_score)):
    print(report_line)

In [None]:
# Report the metrics returned by mvsd.evaluate_test(): confusion matrix first,
# then precision, recall and F1-score.
confusion_matrix, precision, recall, F1_score = mvsd.evaluate_test()
confusion_matrix_ = pd.DataFrame(
    data = {'Actual_Spammer': confusion_matrix[:,0],
            'Actual_Legitimate': confusion_matrix[:,1]},
    index = ['Predicted_Spammer ','Predicted_Legitimate'],
)
print(confusion_matrix_)
print("\n")
for report_line in ("Precision: {}\n".format(precision),
                    "Recall: {}\n".format(recall),
                    "F1-score: {}\n".format(F1_score)):
    print(report_line)

### Comparison with single-view approaches

#### Content view features

In [None]:
importlib.reload(sv)

# Shuffle every view with the same fixed seed, then transpose it.
X_nv = [
    np.transpose(view_df.sample(frac = 1, random_state = 2))
    for view_df in [content_view_df, URL_view_df, hashtag_view_df]
]
# Labels are shuffled with the same seed as the views.
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))

# Single-view baseline on the content view (X_nv[0]), evaluated with an SVC
# at several training-set fractions.
training_sizes = [0.30, 0.50, 0.80]
model_svm  = SVC(gamma = "auto")
content_view_svm = sv.singleview(data = X_nv[0],  class_ = Y)
for s in training_sizes:
    print("---------------------------------------------------------------------")
    print("Training size: {}\n".format(s))
    precision, recall, F1_score, confusion_matrix_CV = content_view_svm.evaluate(model = model_svm, training_size=s)

#### URL view

In [None]:
importlib.reload(sv)

# Shuffle every view with the same fixed seed, then transpose it.
X_nv = [
    np.transpose(view_df.sample(frac = 1, random_state = 2))
    for view_df in [content_view_df, URL_view_df, hashtag_view_df]
]
# Labels are shuffled with the same seed as the views.
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))

# Single-view baseline on the URL view (X_nv[1]), evaluated with an SVC
# at several training-set fractions.
training_sizes = [0.30, 0.50, 0.80]
model_svm  = SVC(gamma = "auto")
content_view_svm = sv.singleview(data = X_nv[1],  class_ = Y)
for s in training_sizes:
    print("---------------------------------------------------------------------")
    print("Training size: {}\n".format(s))
    precision, recall, F1_score, confusion_matrix_CV = content_view_svm.evaluate(model = model_svm, training_size=s)

#### Hashtag View

In [None]:
importlib.reload(sv)

# Shuffle every view with the same fixed seed, then transpose it.
X_nv = [
    np.transpose(view_df.sample(frac = 1, random_state = 2))
    for view_df in [content_view_df, URL_view_df, hashtag_view_df]
]
# Labels are shuffled with the same seed as the views.
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))

# Single-view baseline on the hashtag view (X_nv[2]), evaluated with an SVC
# at several training-set fractions.
training_sizes = [0.30, 0.50, 0.80]
model_svm  = SVC(gamma = "auto")
content_view_svm = sv.singleview(data = X_nv[2],  class_ = Y)
for s in training_sizes:
    print("---------------------------------------------------------------------")
    print("Training size: {}\n".format(s))
    precision, recall, F1_score, confusion_matrix_CV = content_view_svm.evaluate(model = model_svm, training_size=s)

#### Concatenated features

In [None]:
importlib.reload(sv)

# Rebuild the shuffled, transposed views in this cell rather than reusing
# whatever X_nv an earlier cell left behind (it may hold plain numpy arrays,
# which pd.concat cannot stack).
X_nv = [content_view_df, URL_view_df, hashtag_view_df]
X_nv = [df.sample(frac = 1, random_state = 2) for df in X_nv]
X_nv = [np.transpose(X_nv[v]) for v in range(len(X_nv))]

# Labels are shuffled with the same seed as the views.
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))
# Stack the transposed views on top of each other (axis=0) into one matrix.
X = np.array(pd.concat(X_nv, axis=0))
content_view_svm = sv.singleview(data = X,  class_ = Y)
model_svm  = SVC(gamma = "auto")
training_sizes = [0.30, 0.50, 0.80]
for s in training_sizes:
    print("---------------------------------------------------------------------")
    print("Training size: {}\n".format(s))
    precision, recall, F1_score, confusion_matrix_CV = content_view_svm.evaluate(model = model_svm, training_size=s)
 