In [1]:
import numpy as np
import pandas as pd

[MIB dataset](http://mib.projects.iit.cnr.it/dataset.html)

In [2]:
def add_tweet_props(user_df, tweets_df, num_recent=25):
    """Add per-user tweet statistics computed from ``tweets_df`` to ``user_df``.

    Tweets are matched to users by comparing ``tweets_df['user_id']`` with
    ``user_df['id']``.  ``user_df`` is modified in place; ``tweets_df`` is
    only read.  The following columns are (re)created on ``user_df``:

    * ``num_tweets_dataset``, ``num_retweets_post_dataset``,
      ``num_reply_post_dataset`` -- number of the user's tweets found in
      ``tweets_df``, and how many of those have ``retweeted_status_id > 0``
      or ``in_reply_to_status_id > 0`` respectively.
    * ``retweet_post_percent``, ``reply_post_percent`` -- the retweet/reply
      counts divided by the tweet count (a fraction, not multiplied by 100).
      NaN for users with no tweets.
    * ``avg_hashtags``, ``avg_urls``, ``avg_mentions``, ``avg_retweets_cnt``,
      ``avg_reply_cnt``, ``avg_favorite_cnt`` -- per-tweet means of
      ``num_hashtags``, ``num_urls``, ``num_mentions``, ``retweet_count``,
      ``reply_count`` and ``favorite_count``.  NaN for users with no tweets.
    * ``recent_tweet_text`` -- list with the ``text`` of the user's
      ``num_recent`` newest tweets (ordered newest first by ``timestamp``),
      or ``None`` when the user has fewer than ``num_recent`` tweets.

    Parameters
    ----------
    user_df : pandas.DataFrame
        One row per user; must contain an ``id`` column.
    tweets_df : pandas.DataFrame
        One row per tweet; must contain ``user_id``, ``text``, ``timestamp``
        and the columns named above.
    num_recent : int, default 25
        Both the minimum number of tweets a user needs for
        ``recent_tweet_text`` to be filled and the number of texts kept.

    Returns
    -------
    None
    """
    # Output column -> tweet column whose per-user mean it stores.
    average_sources = {
        'avg_hashtags': 'num_hashtags',
        'avg_urls': 'num_urls',
        'avg_mentions': 'num_mentions',
        'avg_retweets_cnt': 'retweet_count',
        'avg_reply_cnt': 'reply_count',
        'avg_favorite_cnt': 'favorite_count',
    }
    numeric_columns = [
        'num_tweets_dataset',
        'num_retweets_post_dataset',
        'num_reply_post_dataset',
        'retweet_post_percent',
        'reply_post_percent',
        *average_sources,
    ]

    # Create every output column up front so the resulting schema does not
    # depend on the data (e.g. the ratio columns exist even if no user has
    # any tweets).
    user_df['recent_tweet_text'] = None
    for column in numeric_columns:
        user_df[column] = np.nan

    # Group the tweets once instead of scanning the whole tweet table for
    # every user.  `indices` maps each user_id to the row positions of that
    # user's tweets, in their original order.
    tweet_positions = tweets_df.groupby('user_id', sort=False).indices
    no_positions = np.empty(0, dtype=np.intp)

    for index, user_id in zip(user_df.index, user_df['id']):
        user_tweets = tweets_df.iloc[tweet_positions.get(user_id, no_positions)]
        total_tweets = len(user_tweets)
        total_retweet_posts = int((user_tweets['retweeted_status_id'] > 0).sum())
        total_reply_posts = int((user_tweets['in_reply_to_status_id'] > 0).sum())

        # How many tweets of each kind this user has in the dataset
        user_df.at[index, 'num_tweets_dataset'] = total_tweets
        user_df.at[index, 'num_retweets_post_dataset'] = total_retweet_posts
        user_df.at[index, 'num_reply_post_dataset'] = total_reply_posts

        if total_tweets > 0:
            # Share of the user's tweets that are retweets / replies
            user_df.at[index, 'retweet_post_percent'] = total_retweet_posts / total_tweets
            user_df.at[index, 'reply_post_percent'] = total_reply_posts / total_tweets

        # Average of various properties per tweet (NaN when there are none)
        for out_column, tweet_column in average_sources.items():
            user_df.at[index, out_column] = user_tweets[tweet_column].mean()

        if total_tweets >= num_recent:
            # Parse and sort on a fresh frame: assigning to / sorting the
            # selection in place would be chained assignment on a slice of
            # tweets_df.
            newest_first = (
                user_tweets[['text']]
                .assign(timestamp=pd.to_datetime(user_tweets['timestamp']))
                .sort_values(by='timestamp', ascending=False)
            )
            user_df.at[index, 'recent_tweet_text'] = (
                newest_first['text'].head(num_recent).tolist()
            )


In [3]:
# Fake-follower split: every account in this file is labelled 'bot'.
mib_fake_follower_users = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/fake_followers.csv/users.csv'
).assign(identification='bot')
mib_fake_follower_tweets = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/fake_followers.csv/tweets.csv'
)
# Adds the per-user tweet statistics columns to the users frame in place.
add_tweet_props(user_df=mib_fake_follower_users, tweets_df=mib_fake_follower_tweets)

In [4]:
# Social spambots #1 split: every account in this file is labelled 'bot'.
mib_social_spambots_1_users = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/social_spambots_1.csv/users.csv'
).assign(identification='bot')
mib_social_spambots_1_tweets = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/social_spambots_1.csv/tweets.csv'
)
# Adds the per-user tweet statistics columns to the users frame in place.
add_tweet_props(user_df=mib_social_spambots_1_users, tweets_df=mib_social_spambots_1_tweets)

In [5]:
# Social spambots #2 split: every account in this file is labelled 'bot'.
mib_social_spambots_2_users = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/social_spambots_2.csv/users.csv'
).assign(identification='bot')
mib_social_spambots_2_tweets = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/social_spambots_2.csv/tweets.csv'
)
# Adds the per-user tweet statistics columns to the users frame in place.
add_tweet_props(user_df=mib_social_spambots_2_users, tweets_df=mib_social_spambots_2_tweets)

In [6]:
# Social spambots #3 split: every account in this file is labelled 'bot'.
mib_social_spambots_3_users = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/social_spambots_3.csv/users.csv'
).assign(identification='bot')
mib_social_spambots_3_tweets = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/social_spambots_3.csv/tweets.csv'
)
# Adds the per-user tweet statistics columns to the users frame in place.
add_tweet_props(user_df=mib_social_spambots_3_users, tweets_df=mib_social_spambots_3_tweets)

In [7]:
# Traditional spambots #1 split: every account in this file is labelled 'bot'.
mib_traditional_spambots_1_users = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/traditional_spambots_1.csv/users.csv'
).assign(identification='bot')
mib_traditional_spambots_1_tweets = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/traditional_spambots_1.csv/tweets.csv'
)
# Adds the per-user tweet statistics columns to the users frame in place.
add_tweet_props(user_df=mib_traditional_spambots_1_users, tweets_df=mib_traditional_spambots_1_tweets)

In [8]:
# Genuine-accounts split: every account in this file is labelled 'human'.
mib_humans_users = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/genuine_accounts.csv/users.csv'
).assign(identification='human')
# Unlike the other splits, the tweet columns are named explicitly here and
# backslash is used as the escape character.
# NOTE(review): `names=` is passed without `header=`, so pandas treats the
# first line of the file as data -- confirm this CSV really has no header row.
mib_humans_tweets = pd.read_csv(
    '../datasets/MIB/datasets_full.csv/genuine_accounts.csv/tweets.csv',
    names=[
        "id",
        "text",
        "source",
        "user_id",
        "truncated",
        "in_reply_to_status_id",
        "in_reply_to_user_id",
        "in_reply_to_screen_name",
        "retweeted_status_id",
        "geo_x",
        "geo_y",
        "place",
        "contributors",
        "retweet_count",
        "reply_count",
        "favorite_count",
        "favorited",
        "retweeted",
        "possibly_sensitive",
        "num_hashtags",
        "num_urls",
        "num_mentions",
        "created_at",
        "timestamp",
        "crawled_at",
        "updated",
    ],
    escapechar='\\',
    index_col=False,
)
# Adds the per-user tweet statistics columns to the users frame in place.
add_tweet_props(user_df=mib_humans_users, tweets_df=mib_humans_tweets)

Combine all the users into a single table and keep only those with at least 25 tweets in the dataset

In [9]:
# Stack the bot and human user tables, then keep only the users that have
# at least 25 tweets in the dataset.
users = pd.concat([
    mib_fake_follower_users,
    mib_social_spambots_1_users,
    mib_social_spambots_2_users,
    mib_social_spambots_3_users,
    mib_traditional_spambots_1_users,
    mib_humans_users,
]).query('num_tweets_dataset >= 25')

In [10]:
# Replace missing values in these profile flag columns with 0.
# This is done with one frame-level fillna whose result is bound back to
# `users`: calling `fillna(..., inplace=True)` on a selected column is chained
# assignment, which is deprecated and does not update the frame when pandas
# Copy-on-Write is active.
users = users.fillna({
    'geo_enabled': 0,
    'default_profile': 0,
    'default_profile_image': 0,
    'verified': 0,
})


In [11]:
# Persist the combined, filtered user table. No `index` argument is given,
# so the DataFrame index is written as the first column of the CSV.
users.to_csv(path_or_buf='../datasets/MIB/mib_processed.csv')