In [1]:
import importlib
import re
import numpy as np
import multiview as mv
import singleview as sv
import scipy
import pandas as pd
import ast #convert string list to list
import copy
from scipy import linalg, optimize
from numpy.linalg import multi_dot
from sklearn.svm import SVC
from sklearn.preprocessing import normalize



### Reading data

In [2]:
tweet_df = pd.read_csv('tweets_sample_preprocessed.csv', sep = '|')
tweet_df = tweet_df[tweet_df.UserID != 84165878]

### Feature extraction

In [3]:
"""Secondary functions"""
def count_phrase_freq(phrase, text):
    phrase = phrase.lower()
    text = text.lower()
    regex_obj = re.findall('\\b'+phrase+'\\b', text)
    if regex_obj:
        return len(regex_obj)
    else:
        return 0
spam_list = [line.rstrip('\n') for line in open('resources/spam_phrases.txt', 'r')]

def count_spam_phrases_per_tweet(spam_list, tweet):    
    count = 0
    for phrase in spam_list:
        count += count_phrase_freq(phrase, tweet)
    return count



#### Content-based features extraction

In [4]:
#add feature: num of mentions in tweet
tweet_df['NumOfMentions'] = tweet_df['Mention'].map(lambda x: len(ast.literal_eval(x)))


def retweet_rate(tweet_df):
    tweet_df['hasRetweet'] = tweet_df.Tweet.str.contains("^RE ")
    num_tweets_with_RT     = tweet_df.groupby('UserID')['hasRetweet'].sum()
    total_num_tweets       = tweet_df.groupby('UserID')['Tweet'].count()
    feature                = num_tweets_with_RT/total_num_tweets
    tweet_df.drop(columns='hasRetweet')
    return feature


def avg_length_of_tweet(tweet_df):
    tweet_df['Tweet_Length'] = tweet_df['Tweet'].str.len()
    tweet_length             = tweet_df.groupby('UserID')['Tweet_Length'].sum()
    num_of_tweets            = tweet_df.groupby('UserID')['Tweet_Length'].count()
    feature                  = tweet_length/num_of_tweets
    tweet_df.drop(columns='Tweet_Length', inplace=True)
    return feature

def avg_num_mentions_per_tweet(tweet_df):
    
    num_mentions_per_user = tweet_df.groupby('UserID')['NumOfMentions'].sum()
    num_tweets_per_user   = tweet_df.groupby('UserID')['Tweet'].count()
    feature               = num_mentions_per_user/num_tweets_per_user
    return feature

#count spam phrases in tweets, source: (https://blog.hubspot.com/blog/tabid/6307/bid/30684/the-ultimate-list-of-email-spam-trigger-words.aspx)
      
def avg_num_spam_phrases_per_tweet(tweet_df):
    tweet_df['NumSpamWords']  = list(map(lambda x: count_spam_phrases_per_tweet(spam_list, x), tweet_df.Tweet))
    sum_spam_phrases_per_user = tweet_df.groupby('UserID')['NumSpamWords'].sum()
    num_tweets_per_user       = tweet_df.groupby('UserID')['Tweet'].count()
    feature                   = sum_spam_phrases_per_user/num_tweets_per_user
    return feature
    
#tweet_df.drop(columns='NumOfMentions', inplace=True)   

#### Hashtag features extraction

In [5]:
#add feature: num of hashtags in tweet
tweet_df['NumOfHashtags'] = tweet_df.Hashtag.map(lambda x: len(ast.literal_eval(x)))

#average number of Hashtags per tweet
def avg_num_hashtags(tweet_df):
    count_URL_per_user    = tweet_df.groupby('UserID')['NumOfHashtags'].sum()
    count_Tweets_per_user = tweet_df.groupby('UserID')['Tweet'].count()
    return count_URL_per_user/count_Tweets_per_user

#
def avg_same_hashtag_count(tweet_df):
    tweet_df['isHashtagUnique']    = np.where(tweet_df['NumOfHashtags'] == 1, 1, 0)
    tweet_df['isHashtagDuplicate'] = np.where(tweet_df['NumOfHashtags'] > 1, 1, 0)
    num_unique_hashtags            = tweet_df.groupby('UserID')['isHashtagUnique'].sum()
    num_duplicate_hashtags         = tweet_df.groupby('UserID')['isHashtagDuplicate'].sum()
    total_tweet_count              = num_duplicate_hashtags = tweet_df.groupby('UserID')['Tweet'].count()
    feature = num_duplicate_hashtags/(num_unique_hashtags*total_tweet_count)
    feature = feature.replace(np.inf, 0)
    return feature


def num_hashtags_per_tweet(tweet_df):
    tweet_df['hasHashtag']     = tweet_df[tweet_df['NumOfHashtags'] > 0]
    total_tweet_count          = tweet_df.groupby('UserID')['Tweet'].count()
    num_tweets_with_hashtag    = tweet_df.groupby('UserID')['hasHashtag'].sum()
    feature = num_tweets_with_hashtag/total_tweet_count
    return feature
    


#tweet_df.drop(columns='NumOf#', inplace=True)

#### URL features extraction

In [6]:
#add feature: num of mentions in tweet
tweet_df['NumOfURLs'] = tweet_df['URL'].map(lambda x: len(ast.literal_eval(x)))


#average number of URLs per tweet
def avg_num_URLs(tweet_df):
    count_URL_per_user    = tweet_df.groupby('UserID')['NumOfURLs'].sum()
    count_Tweets_per_user = tweet_df.groupby('UserID')['Tweet'].count()
    return count_URL_per_user/count_Tweets_per_user

def avg_same_URL_count(tweet_df):
    tweet_df['isURLUnique']    = np.where(tweet_df['NumOfURLs'] == 1, 1, 0)
    tweet_df['isURLDuplicate'] = np.where(tweet_df['NumOfURLs'] > 1, 1, 0)
    num_unique_URLs            = tweet_df.groupby('UserID')['isURLUnique'].sum()
    num_duplicate_URLs         = tweet_df.groupby('UserID')['isURLDuplicate'].sum()
    total_tweet_count          = num_duplicate_URLs = tweet_df.groupby('UserID').Tweet.count()
    feature = num_duplicate_URLs/(num_unique_URLs*total_tweet_count)
    feature = feature.replace(np.inf, 0)
    return feature



#tweet_df.drop(columns='NumOfURLs#', inplace=True)

### Combining features into a single-view matrices

In [7]:
try:
    content_view_df = pd.read_csv(r'data/views_df_preprocessed/content_view_df.csv', sep = '|', index_col=0)
    URL_view_df     = pd.read_csv(r'data/views_df_preprocessed/URL_view_df.csv', sep = '|', index_col=0)
    hashtag_view_df = pd.read_csv(r'data/views_df_preprocessed/hashtag_view_df.csv', sep = '|', index_col=0)
except:
    #Content-based view
    content_view_df = pd.DataFrame(dict(AvgLengthOfTweets = avg_length_of_tweet(tweet_df), 
                                        RetweetRate       = retweet_rate(tweet_df),
                                        AvgNumMentions    = avg_num_mentions_per_tweet(tweet_df),
                                        AvgNumSpamPhrases = avg_num_spam_phrases_per_tweet(tweet_df)                                    
                                       ))

    #URL-based view
    URL_view_df = pd.DataFrame(dict(AvgNumURLs            = avg_num_URLs(tweet_df),
                                    AvgSameURLCount       = avg_same_URL_count(tweet_df)))

    #Hashtag-based view
    hashtag_view_df = pd.DataFrame(dict(AvgNumHashtags = avg_num_hashtags(tweet_df),
                                        AvgSamHashtagCount   = avg_same_hashtag_count(tweet_df)
                                       ))
    
    content_view_df.to_csv(r"data\views_df_preprocessed\content_view_df.csv", index= True, sep = '|')
    URL_view_df.to_csv(r"data\views_df_preprocessed\URL_view_df.csv", index= True, sep = '|')
    hashtag_view_df.to_csv(r"data\views_df_preprocessed\hashtag_view_df.csv", index= True, sep = '|')
    

#### Creating label matrix

In [8]:
users_legitimate_df = pd.read_csv('data\social_honeypot\legitimate_users.txt', 
                                  sep = '\t',
                                  names = ['UserID',
                                           'CreatedAt',
                                           'CollectedAt',
                                           'NumberOfFollowings',
                                           'NumberOfFollowers',
                                           'NumberOfTweets',
                                           'LengthOfScreenName',
                                           'LengthOfDescriptionInUserPro'])
users_polluters_df = pd.read_csv('data/social_honeypot/content_polluters.txt', 
                                  sep = '\t',
                                  names = ['UserID',
                                           'CreatedAt',
                                           'CollectedAt',
                                           'NumberOfFollowings',
                                           'NumberOfFollowers',
                                           'NumberOfTweets',
                                           'LengthOfScreenName',
                                           'LengthOfDescriptionInUserPro'])
tweet_df['isSpammer']    = np.where(tweet_df['UserID'].isin(list(users_polluters_df['UserID'])), -1, 0)
tweet_df['isLegitimate'] = np.where(tweet_df['UserID'].isin(list(users_legitimate_df['UserID'])), 1, 0)



class_label_df = tweet_df[['UserID','isLegitimate', 'isSpammer']].drop_duplicates(['UserID']).sort_values('UserID').set_index('UserID')
class_label_df = class_label_df[['isSpammer','isLegitimate']]


## Multiview Spam Detection Algorithm (MVSD)

In [9]:
importlib.reload(mv)
#content_view_df.AvgLengthOfTweets = content_view_df.AvgLengthOfTweets/content_view_df.AvgLengthOfTweets.max()
X_nv = [content_view_df, URL_view_df, hashtag_view_df]

#shuffle data points
X_nv = [df.sample(frac = 1, random_state = 2) for df in X_nv]

# normalize X
X_nv = [normalize(X, axis = 0,  norm = 'l1') for X in X_nv]

#transpose to correspond to the notations of dimensions used in the paper
X_nv = [np.transpose(X_nv[v]) for v in range(len(X_nv))]

Y = np.array(class_label_df.sample(frac = 1, random_state = 2))
obj = mv.multiview(X = X_nv, Y = Y, num_components = 25 )
obj.solve(training_size=0.60, learning_rate= 0.01, alpha=0.1)


Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 2410880.25231217; Current Value: 1176712.0274736742
Iter:  2;   Old Value 1176712.0274736742; Current Value: 1176749.6709789275
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 1176749.6709789275; Current Value: 566946.4178695615
Iter:  2;   Old Value 566946.4178695615; Current Value: 566969.633773945
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 566969.633773945; Current Value: 9521.265343145165
Iter:  2;   Old Value 9521.265343145165; Current Value: 9544.328045162063
Iter:  3;   Old Value 9544.328045162063; Current Value: 9544.665230120776
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 9544.6652301

Iter:  4;   Old Value 4866.352557664005; Current Value: 4866.287315433803
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 4866.287315433803; Current Value: 4866.364881331209
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  3;   Old Value 4887.397759323787; Current Value: 4866.364881331209
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 4866.364881331209; Current Value: 4866.546570552808
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 4866.546570552808; Current Value: 4866.098488006227
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:

Iter:  80;   Old Value 4718.7786573495705; Current Value: 4717.465578069076
Iter:  81;   Old Value 4717.465578069076; Current Value: 4716.157977043284
Iter:  82;   Old Value 4716.157977043284; Current Value: 4714.855742480935
Iter:  83;   Old Value 4714.855742480935; Current Value: 4713.5587671049925
Iter:  84;   Old Value 4713.5587671049925; Current Value: 4712.266947924708
Iter:  85;   Old Value 4712.266947924708; Current Value: 4710.980186021272
Iter:  86;   Old Value 4710.980186021272; Current Value: 4709.698386346079
Iter:  87;   Old Value 4709.698386346079; Current Value: 4708.421457530759
Iter:  88;   Old Value 4708.421457530759; Current Value: 4707.149311708181
Iter:  89;   Old Value 4707.149311708181; Current Value: 4705.881864343728
Iter:  90;   Old Value 4705.881864343728; Current Value: 4704.619034076108
Iter:  91;   Old Value 4704.619034076108; Current Value: 4703.360742567139
Iter:  92;   Old Value 4703.360742567139; Current Value: 4702.106914359888
Iter:  93;   Old Value

Iter:  189;   Old Value 4596.515305893964; Current Value: 4595.53551973026
Iter:  190;   Old Value 4595.53551973026; Current Value: 4594.557813080067
Iter:  191;   Old Value 4594.557813080067; Current Value: 4593.58217840876
Iter:  192;   Old Value 4593.58217840876; Current Value: 4592.6086082764405
Iter:  193;   Old Value 4592.6086082764405; Current Value: 4591.637095335632
Iter:  194;   Old Value 4591.637095335632; Current Value: 4590.667632329036
Iter:  195;   Old Value 4590.667632329036; Current Value: 4589.700212087347
Iter:  196;   Old Value 4589.700212087347; Current Value: 4588.734827527148
Iter:  197;   Old Value 4588.734827527148; Current Value: 4587.771471648857
Iter:  198;   Old Value 4587.771471648857; Current Value: 4586.81013753473
Iter:  199;   Old Value 4586.81013753473; Current Value: 4585.85081834694
Iter:  200;   Old Value 4585.85081834694; Current Value: 4584.893507325689
Iter:  201;   Old Value 4584.893507325689; Current Value: 4583.938197787391
Iter:  202;   Old 

Iter:  298;   Old Value 4500.726347171118; Current Value: 4499.940926608269
Iter:  299;   Old Value 4499.940926608269; Current Value: 4499.157056561876
Iter:  300;   Old Value 4499.157056561876; Current Value: 4498.374733579026
Iter:  301;   Old Value 4498.374733579026; Current Value: 4497.593954219844
Iter:  302;   Old Value 4497.593954219844; Current Value: 4496.814715057327
Iter:  303;   Old Value 4496.814715057327; Current Value: 4496.037012677182
Iter:  304;   Old Value 4496.037012677182; Current Value: 4495.260843677681
Iter:  305;   Old Value 4495.260843677681; Current Value: 4494.4862046695
Iter:  306;   Old Value 4494.4862046695; Current Value: 4493.713092275576
Iter:  307;   Old Value 4493.713092275576; Current Value: 4492.94150313096
Iter:  308;   Old Value 4492.94150313096; Current Value: 4492.171433882675
Iter:  309;   Old Value 4492.171433882675; Current Value: 4491.402881189572
Iter:  310;   Old Value 4491.402881189572; Current Value: 4490.6358417222
Iter:  311;   Old Va

Iter:  406;   Old Value 4424.2112360953315; Current Value: 4423.575365892195
Iter:  407;   Old Value 4423.575365892195; Current Value: 4422.940726874423
Iter:  408;   Old Value 4422.940726874423; Current Value: 4422.307316452603
Iter:  409;   Old Value 4422.307316452603; Current Value: 4421.6751320423455
Iter:  410;   Old Value 4421.6751320423455; Current Value: 4421.044171064245
Iter:  411;   Old Value 4421.044171064245; Current Value: 4420.414430943847
Iter:  412;   Old Value 4420.414430943847; Current Value: 4419.785909111621
Iter:  413;   Old Value 4419.785909111621; Current Value: 4419.158603002925
Iter:  414;   Old Value 4419.158603002925; Current Value: 4418.53251005798
Iter:  415;   Old Value 4418.53251005798; Current Value: 4417.907627721832
Iter:  416;   Old Value 4417.907627721832; Current Value: 4417.283953444325
Iter:  417;   Old Value 4417.283953444325; Current Value: 4416.661484680073
Iter:  418;   Old Value 4416.661484680073; Current Value: 4416.040218888429
Iter:  419;

Iter:  514;   Old Value 4362.146174309284; Current Value: 4361.629189009366
Iter:  515;   Old Value 4361.629189009366; Current Value: 4361.113178880296
Iter:  516;   Old Value 4361.113178880296; Current Value: 4360.5981417321045
Iter:  517;   Old Value 4360.5981417321045; Current Value: 4360.0840753774755
Iter:  518;   Old Value 4360.0840753774755; Current Value: 4359.570977631727
Iter:  519;   Old Value 4359.570977631727; Current Value: 4359.058846309879
Iter:  520;   Old Value 4359.058846309879; Current Value: 4358.547679293939
Iter:  521;   Old Value 4358.547679293939; Current Value: 4358.037474432792
Iter:  522;   Old Value 4358.037474432792; Current Value: 4357.528229551847
Iter:  523;   Old Value 4357.528229551847; Current Value: 4357.019942475909
Iter:  524;   Old Value 4357.019942475909; Current Value: 4356.512610849003
Iter:  525;   Old Value 4356.512610849003; Current Value: 4356.006232466344
Iter:  526;   Old Value 4356.006232466344; Current Value: 4355.500805300149
Iter:  5

Iter:  1;   Old Value 4178.041878246954; Current Value: 4039.0813197154166
Iter:  2;   Old Value 4039.0813197154166; Current Value: 4048.023734263398
Iter:  3;   Old Value 4048.023734263398; Current Value: 4038.162300543779
Iter:  4;   Old Value 4038.162300543779; Current Value: 4038.1644459056556
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 4038.1644459056556; Current Value: 3898.587586456684
Iter:  2;   Old Value 3898.587586456684; Current Value: 3898.3126084554597
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 3898.3126084554597; Current Value: 3898.0330811397394
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  6;   Old Value 4315.360982892106; Current Value: 3898.0330811397394
---------------

Iter:  1;   Old Value 3859.6097626026426; Current Value: 3853.0912301816443
Iter:  2;   Old Value 3853.0912301816443; Current Value: 3853.086818085749
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 3853.086818085749; Current Value: 3852.7094288914723
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  9;   Old Value 3872.543200335463; Current Value: 3852.7094288914723
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 3852.7094288914723; Current Value: 3852.886984366479
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3852.886984366479; Current Value: 3852.774773743896
---------------------------------------------------

Iter:  1;   Old Value 3847.6402742949554; Current Value: 3847.2593547025563
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  17;   Old Value 3847.93501394001; Current Value: 3847.2593547025563
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 3847.2593547025563; Current Value: 3847.165653942324
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3847.165653942324; Current Value: 3847.0654346101737
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 3847.0654346101737; Current Value: 3846.9660448823406
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 3841.8901141288743; Current Value: 3841.795342206326
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3841.795342206326; Current Value: 3841.7350645570477
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 3841.7350645570477; Current Value: 3841.636972850839
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 3841.636972850839; Current Value: 3841.261618969804
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  26;   Old Value 3841.8901141288743; Current Value: 3841.261618969804
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View


Iter:  1;   Old Value 3836.397655333831; Current Value: 3836.3009940052434
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 3836.3009940052434; Current Value: 3835.930517179122
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  34;   Old Value 3836.5883252939566; Current Value: 3835.930517179122
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 3835.930517179122; Current Value: 3835.836603148773
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3835.836603148773; Current Value: 3835.7400174169225
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View


Iter:  1;   Old Value 3831.072932447435; Current Value: 3830.707204497234
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  42;   Old Value 3831.356706168355; Current Value: 3830.707204497234
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content View
Iter:  1;   Old Value 3830.707204497234; Current Value: 3830.614625583032
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3830.614625583032; Current Value: 3830.5196074172886
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 3830.5196074172886; Current Value: 3830.42459243012
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Ite

Iter:  1;   Old Value 3825.555363031236; Current Value: 3825.4638384096866
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. URL View
Iter:  1;   Old Value 3825.4638384096866; Current Value: 3825.3703810522493
------------------------------------------------------------------------------------
Partial Optimisation w.r.t. Hashtag View
Iter:  1;   Old Value 3825.3703810522493; Current Value: 3825.2768392049957
------------------------------------------------------------------------------------
DONE updating U and V...
Updating V_star and W...


Iter:  1;   Old Value 3825.2768392049957; Current Value: 3824.9163142396264
DONE updating V_star and W...
Updating betas...


Done updating betas...
Calculating Global objective function value...


Iter:  51;   Old Value 3825.555363031236; Current Value: 3824.9163142396264
-------------------------------------------------------
Updating U and V...

Partial Optimisation w.r.t. Content V

In [11]:
for i in range(obj.Y_train.shape[0]):
    """PREDICTING USER'S COHORT"""
    w1_w2 = obj.W.dot(np.transpose(obj.V_star[i,:]))
    #print(np.sum(w1_w2))
    if (np.sum(w1_w2) < 0):
       # 
        obj.Y_predict_train[i,:] = np.array([-1., 0.])
    else:
        obj.Y_predict_train[i,:] = np.array([0., 1.])

"""CONFUSION MATRIX TP|FP
                    TN|FN
""" 
confusion_matrix = np.zeros((2,2))
TP = 0
FP = 0
FN = 0
TN = 0
for i in range(obj.Y_predict_train.shape[0]):
    if (np.array_equal(obj.Y_train[i,:], obj.Y_predict_train[i,:]))and(np.array_equal(obj.Y_predict_train[i,:], np.array([-1., 0.]))):
        TP += 1
    if (np.array_equal(obj.Y_train[i,:], obj.Y_predict_train[i,:]))and(np.array_equal(obj.Y_predict_train[i,:], np.array([0., 1.]))):
        TN += 1
    if (np.array_equal(obj.Y_train[i,:], np.array([-1., 0.])))and(np.array_equal(obj.Y_predict_train[i,:], np.array([0., 1.]))):
        FP += 1
    if (np.array_equal(obj.Y_train[i,:], np.array([0., 1.])))and(np.array_equal(obj.Y_predict_train[i,:], np.array([-1., 0.]))):
        FN += 1

confusion_matrix = pd.DataFrame(data = {'Actual_Spammer': [TP, FN], 'Actual_Legitimate': [FP, TN]}, index = ['Predicted_Spammer ','Predicted_Legitimate'])
precision        = TP/(TP+FP)
recall           = TP/(TP+FN)
F1_score         = 2*TP/(2*TP + FP + FN)


print(confusion_matrix, precision, recall, F1_score)



ZeroDivisionError: division by zero

In [16]:
for i in range(obj.Y_train.shape[0], obj.ground_truth.shape[0]):
    """PREDICTING USER'S COHORT"""
    w1_w2 = obj.W.dot(np.transpose(obj.V_star[i,:]))
    #print(np.sum(w1_w2))
    if (np.sum(w1_w2) < 0):
        obj.Y_predict_test[obj._training_size - i,:] = np.array([-1., 0.])
    else:
        obj.Y_predict_test[obj._training_size - i,:] = np.array([0., 1.])

"""CONFUSION MATRIX TP|FP
                    TN|FN
""" 
confusion_matrix = np.zeros((2,2))
TP = 0
FP = 0
FN = 0
TN = 0
for i in range(obj.Y_predict_test.shape[0]):
    if (np.array_equal(obj.Y_test[i,:], obj.Y_predict_test[i,:]))and(np.array_equal(obj.Y_predict_test[i,:], np.array([-1., 0.]))):
        TP += 1
    if (np.array_equal(obj.Y_test[i,:], obj.Y_predict_test[i,:]))and(np.array_equal(obj.Y_predict_test[i,:], np.array([0., 1.]))):
        TN += 1
    if (np.array_equal(obj.Y_test[i,:], np.array([-1., 0.])))and(np.array_equal(obj.Y_predict_test[i,:], np.array([0., 1.]))):
        FP += 1
    if (np.array_equal(obj.Y_test[i,:], np.array([0., 1.])))and(np.array_equal(obj.Y_predict_test[i,:], np.array([-1., 0.]))):
        FN += 1

confusion_matrix = pd.DataFrame(data = {'Actual_Spammer': [TP, FN], 'Actual_Legitimate': [FP, TN]}, index = ['Predicted_Spammer ','Predicted_Legitimate'])
precision        = TP/(TP+FP)
recall           = TP/(TP+FN)
F1_score         = 2*TP/(2*TP + FP + FN)


print(confusion_matrix, precision, recall, F1_score)

IndexError: index 12518 is out of bounds for axis 0 with size 12518

### Comparison with single-view approaches

#### Content view features

In [76]:
importlib.reload(sv)
X_nv = [content_view_df, URL_view_df, hashtag_view_df]
X_nv = [df.sample(frac = 1, random_state = 2) for df in X_nv]
#X_nv = [normalize(X, axis = 0,  norm = 'l1') for X in X_nv]
X_nv = [np.transpose(X_nv[v]) for v in range(len(X_nv))]
Y = np.array(class_label_df.sample(frac = 1, random_state = 2))

content_view_svm = sv.singleview(data = X_nv[0],  class_ = Y)
model_svm  = SVC(gamma = "auto")
precision, recall, F1_score, confusion_matrix_CV = content_view_svm.evaluate(model = model_svm, training_size=0.8)

Precision: 0.7645659928656362

Recall: 0.7737665463297232

F1-score: 0.7691387559808612

Confusion Matrix: [[1286  396]
 [ 376 1072]]

