In [143]:
import numpy as np
import scipy
from scipy import linalg, optimize
from numpy.linalg import multi_dot
import pandas as pd
import ast #convert string list to list

### Reading data

In [2]:
tweet_df = pd.read_csv('tweets_sample_preprocessed.csv', sep = '|')

### Feature extraction

#### Content-based features extraction

In [3]:
#add feature: num of mentions in tweet
tweet_df['Mention'].astype('object')
tweet_df['NumOfMentions'] = tweet_df['Mention'].map(lambda x: len(ast.literal_eval(x)))


def retweet_rate(tweet_df):
    tweet_df['hasRetweet'] = tweet_df.Tweet.str.contains("^RE ")
    num_tweets_with_RT     = tweet_df.groupby('UserID')['hasRetweet'].sum()
    total_num_tweets       = tweet_df.groupby('UserID')['hasRetweet'].count()
    feature                = num_tweets_with_RT/total_num_tweets
    tweet_df.drop(columns='hasRetweet')
    return feature


def avg_length_of_tweet(tweet_df):
    tweet_df['Tweet_Length'] = tweet_df['Tweet'].str.len()
    tweet_length             = tweet_df.groupby('UserID')['Tweet_Length'].sum()
    num_of_tweets            = tweet_df.groupby('UserID')['Tweet_Length'].count()
    feature                  = tweet_length/num_of_tweets
    tweet_df.drop(columns='Tweet_Length', inplace=True)
    return feature

def avg_num_mentions_per_tweet(tweet_df):
    
    num_mentions_per_user = tweet_df.groupby('UserID')['NumOfMentions'].count()
    num_tweets_per_user   = tweet_df.groupby('UserID')['Tweet'].count()
    feature               = num_mentions_per_user/num_tweets_per_user
    return feature

#tweet_df.drop(columns='NumOfMentions', inplace=True)   

#### Hashtag features extraction

In [4]:
#add feature: num of hashtags in tweet
tweet_df['NumOfHashtags'] = tweet_df.Hashtag.map(lambda x: len(ast.literal_eval(x)))

#average number of Hashtags per tweet
def avg_num_hashtags(tweet_df):
    count_URL_per_user    = tweet_df.groupby('UserID')['NumOfHashtags'].sum()
    count_Tweets_per_user = tweet_df.groupby('UserID')['Tweet'].count()
    return count_URL_per_user/count_Tweets_per_user

#
def avg_same_hashtag_count(tweet_df):
    tweet_df['isHashtagUnique']    = np.where(tweet_df['NumOfHashtags'] == 1, 1, 0)
    tweet_df['isHashtagDuplicate'] = np.where(tweet_df['NumOfHashtags'] > 1, 1, 0)
    num_unique_hashtags            = tweet_df.groupby('UserID')['isHashtagUnique'].sum()
    num_duplicate_hashtags         = tweet_df.groupby('UserID')['isHashtagDuplicate'].sum()
    total_tweet_count              = num_duplicate_hashtags = tweet_df.groupby('UserID')['Tweet'].count()
    feature = num_duplicate_hashtags/(num_unique_hashtags*total_tweet_count)
    feature = feature.replace(np.inf, 0)
    return feature


def num_hashtags_per_tweet(tweet_df):
    tweet_df['hasHashtag']     = tweet_df[tweet_df['NumOfHashtags'] > 0]
    total_tweet_count          = tweet_df.groupby('UserID')['Tweet'].count()
    num_tweets_with_hashtag    = tweet_df.groupby('UserID')['hasHashtag'].sum()
    feature = num_tweets_with_hashtag/total_tweet_count
    return feature
    


#tweet_df.drop(columns='NumOf#', inplace=True)

#### URL features extraction

In [5]:
#add feature: num of mentions in tweet
tweet_df['NumOfURLs'] = tweet_df['URL'].map(lambda x: len(ast.literal_eval(x)))


#average number of URLs per tweet
def avg_num_URLs(tweet_df):
    count_URL_per_user    = tweet_df.groupby('UserID')['NumOfURLs'].sum()
    count_Tweets_per_user = tweet_df.groupby('UserID')['Tweet'].count()
    return count_URL_per_user/count_Tweets_per_user

def avg_same_URL_count(tweet_df):
    tweet_df['isURLUnique']    = np.where(tweet_df['NumOfURLs'] == 1, 1, 0)
    tweet_df['isURLDuplicate'] = np.where(tweet_df['NumOfURLs'] > 1, 1, 0)
    num_unique_URLs            = tweet_df.groupby('UserID')['isURLUnique'].sum()
    num_duplicate_URLs         = tweet_df.groupby('UserID')['isURLDuplicate'].sum()
    total_tweet_count          = num_duplicate_URLs = tweet_df.groupby('UserID').Tweet.count()
    feature = num_duplicate_URLs/(num_unique_URLs*total_tweet_count)
    feature = feature.replace(np.inf, 0)
    return feature



#tweet_df.drop(columns='NumOfURLs#', inplace=True)

### Combining features into a single-view matrices

In [317]:
#Content-based view
content_view_df = pd.DataFrame(dict(AvgLengthOfTweets = avg_length_of_tweet(tweet_df), 
                                    #RetweetRate       = retweet_rate(tweet_df),
                                    AvgNumMentions    = avg_num_mentions_per_tweet(tweet_df)
                                    
                                   ))

#URL-based view
URL_view_df = pd.DataFrame(dict(AvgNumURLs            = avg_num_URLs(tweet_df),
                                AvgSameURLCount       = avg_same_URL_count(tweet_df)))

#Hashtag-based view
hashtag_view_df = pd.DataFrame(dict(AvgNumHashtags = avg_num_hashtags(tweet_df),
                                    AvgSamHashtagCount   = avg_same_hashtag_count(tweet_df)
                                   ))

#### Unshorten URLs and extract domains and suffixes

In [10]:
def get_URL_domain(url, session):
    if pd.isnull(url):
        return ''
    else:
        try:            
            url_response = session.head(url, allow_redirects = True).url
            return tldextract.extract(url_response).domain
        except ConnectionError as e:
            err_url = re.search(r'host=\'([\w\-\.]+)\'', str(e))
            try:
                return tldextract.extract(err_url.group(1)).domain
            except:
                return err_url
        except MissingSchema as e:
            err_url = 'http://'+ re.search('http://([\w\-\.]+)?', str(e)).group(1)
            get_URL_domain(err_url, session)
        except:
            return url

#session = requests.Session()
#url1 = tweet_df['URL'].apply(lambda x: get_URL_domain(x, session))

In [318]:
users_legitimate_df = pd.read_csv('data\social_honeypot\legitimate_users.txt', 
                                  sep = '\t',
                                  names = ['UserID',
                                           'CreatedAt',
                                           'CollectedAt',
                                           'NumberOfFollowings',
                                           'NumberOfFollowers',
                                           'NumberOfTweets',
                                           'LengthOfScreenName',
                                           'LengthOfDescriptionInUserPro'])
users_polluters_df = pd.read_csv('data/social_honeypot/content_polluters.txt', 
                                  sep = '\t',
                                  names = ['UserID',
                                           'CreatedAt',
                                           'CollectedAt',
                                           'NumberOfFollowings',
                                           'NumberOfFollowers',
                                           'NumberOfTweets',
                                           'LengthOfScreenName',
                                           'LengthOfDescriptionInUserPro'])

tweet_df['isLegitimate'] = np.where(tweet_df['UserID'].isin(list(users_legitimate_df['UserID'])), 1, 0)
tweet_df['isSpammer'] = np.where(tweet_df['UserID'].isin(list(users_polluters_df['UserID'])), 1, 0)
class_label_df = tweet_df[['UserID','IsLegitimate', 'isSpammer']].drop_duplicates(['UserID']).sort_values('UserID').set_index('UserID')



## Algorithm

#### Initialization

In [645]:
"""Independent variables"""

#Multiview 
n_v           = 3                            #number of views
lambda_v      = np.ones(n_v)                 #regularisation coefficients
lambda_star_f = 1
lambda_f      = 1
beta          = np.array([-np.log(5), np.log(3), 2])         #view weights
eta           = 1                            #learning rate
K             = 2                            #number of latent features
N             = content_view_df.shape[0]     #number of users
U=U0          = [None]*n_v
V=V0          = [None]*n_v 


training_set_frac = .80
Y = np.array(class_label_df)     #labeled data matrix

#SVM
alpha = 1
W = np.zeros((2,K))
lambda_W = 1




#stack multiple-view feature matrices into list
X_nv = [content_view_df.values, URL_view_df.values, hashtag_view_df.values]
X_nv = [np.transpose(X_nv[v]) for v in range(n_v)]
for v in range(n_v):
    num_attr = X_nv[v].shape[0]
    U[v]     = np.random.random((num_attr, K))
    V[v]     = np.random.random((N, K))
    V_star   = np.random.random((N, K))


#normalize each view
X_nv = [X/scipy.linalg.norm(X, ord = 'fro') for X in X_nv]
U_old = U0
V_old = V0
    
    
def hinge_loss(z):
    if (z <= 0):
        return 1/2 - z
    elif (z >= 1):
        return 0
    else:
        return 1/2 * (1 - z)**2
    
def hinge_loss_derivative(z):
    if (z <= 0):
        return -z
    elif (z >= 1):
        return 0
    else:
        return z - 1
    

In [627]:
total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f)  

36352.632157084714

#### Optimisation

In [320]:
"""DEFINING OBJECTIVE FUNCTION
Total Objective Function is O = O_M + O_SVM
"""

def total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f):
    """Calculate Q from U and V"""
    Q = [None]*(n_v)
    #Q = np.zeros((n_v, 1))
    for v in range(n_v):
        diag_vector =  [sum(U[v][:,i]) for i in range(K)]   #i -column index 
        Q[v]        = np.diag(diag_vector)
    
    """Calculate multiview term O_M of the objective function"""
    term_1      = [X_nv[v] - np.linalg.multi_dot([U[v],
                                        np.linalg.inv(Q[v]), 
                                        Q[v], 
                                        np.transpose(V[v])]) 
                   for v in range (n_v)]
    term_1_norm = list(map(lambda X: scipy.linalg.norm(X, ord = 'fro')**2, term_1))
    term_2      = [V[v].dot(Q[v]) - V_star for v in range (n_v)]
    term_2_norm = list(map(lambda X: scipy.linalg.norm(X, ord = 'fro')**2, term_2))  
    term_3      = lambda_star_f/2 * np.linalg.norm(V_star, ord = 'fro')
    term_4      = [np.linalg.norm(U[v], ord = 'fro')**2 + np.linalg.norm(V[v], ord = 'fro')**2 for v in range (n_v)]
    
    O_M = 1/2 * np.sum(beta    * term_1_norm +   lambda_v * term_2_norm    ) + lambda_star_f * term_3 +lambda_f/2 * np.sum(term_4)
    
    
    """SVM Objective Function Term"""
    l = Y.shape[0]
    S = 0
    for i in range(l):
        S += hinge_loss(Y[i,:].dot(W.dot(np.transpose(V_star[i,:]))))

    O_SVM = alpha * S + lambda_W/2 * np.linalg.norm(W, ord = 'fro')
    
    return O_M + O_SVM

    """USE"""
#total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f)






In [650]:
def optimize_towards_U_and_V(U, V): 
    iter_count = 0
    max_iter = 1000
    func_val_old = 1e100
    func_val = total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f) 
    tolerance = 1e-5
               
    while (iter_count < max_iter) and (abs(func_val - func_val_old)/func_val > tolerance):
        iter_count += 1;
        func_val_old = func_val
        for v in range(n_v):           
            """UPDATE U"""
            A = lambda_v[v] * beta[v] * np.transpose(V[v]).dot(V_star)
            """TODO: Calculate coefficient B"""               
            numerator_U = beta[v]*(X_nv[v].dot(V[v])) 
            denominator_U = beta[v] * multi_dot([U[v], np.transpose(V[v]), V[v]])
            U[v] = U_old[v] * numerator_U/denominator_U

            U[v] = U[v]/scipy.linalg.norm(U[v], ord = 'fro')
            V[v] = V[v]/scipy.linalg.norm(U[v], ord = 'fro')

            """UPDATE V"""
            numerator_V = beta[v] * np.transpose(X_nv[v]).dot(U[v]) + lambda_v[v] * beta[v] * V_star
            denominator_V = beta[v] * multi_dot([V[v], np.transpose(U[v]), U[v]]) + lambda_v[v] * beta[v] * V[v] + lambda_f * V[v]
            V[v] = V_old[v] * numerator_V/denominator_V

            """Calculate new """  
            V_old[v] = V[v]
            U_old[v] = U[v]
            
        func_val = total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f)   
        print("Iter:  {};   Old Value {}; Current Value: {}".format(iter_count, func_val_old, func_val)) 
            
        
    return iter_count, func_val_old, func_val
            
            
optimize_towards_U_and_V(U, V)

Iter:  1;   Old Value 19588.90280623677; Current Value: 19588.732333802727


(1, 19588.90280623677, 19588.732333802727)

In [None]:
def optimize_towards_V_star_and_W(V_star, W):
    iter_count = 0
    max_iter = 1000
    func_val_old = 1e100
    func_val = total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f) 
    tolerance = 1e-5
               
    while (iter_count < max_iter) and (abs(func_val - func_val_old)/func_val > tolerance):
        iter_count += 1;
        func_val_old = func_val
        

            
        func_val = total_obj_func(beta, U, V, V_star, W, lambda_v, lambda_star_f, lambda_f)   
        print("Iter:  {};   Old Value {}; Current Value: {}".format(iter_count, func_val_old, func_val)) 
            
        
    return iter_count, func_val_old, func_val