# 0. Initialize

## 0.1. Import Libraries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
from datetime import datetime, timezone

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

## 0.2. DEFINE VARIABLES 

In [2]:
DATA_PATH = '/Users/sudekipri/Downloads/data/' # '<insert-your-training-data-path-here>'

ROUND = 3 # This project will have 3 rounds of predictions: 1,2,3
STUDENT_ID = '28368'#'<insert-your-id-here>'
PROJECT_CODE = 'CS4124d092904f01e'#'<insert-your-code-here>' # Same code for the annotation eg. CS412xxxxx

## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [3]:
#trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH))
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})
trainingTweetDf

Unnamed: 0,tweet_id,isPolitical
0,1597170281545551872,Yes
1,1431700027471192069,No
2,1566035577090281472,Yes
3,1591538690869940225,Yes
4,1583898169238167554,Yes
...,...,...
2995,1593539327623151619,Yes
2996,1393886554062524418,No
2997,1597925615092764672,Yes
2998,1585291418616176640,Yes


In [4]:
trainingTweetDf.isPolitical.value_counts()

Yes    2003
No      997
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [5]:
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH))
#trainingUserDf = pd.read_csv('training-user.csv')
trainingUserDf

Unnamed: 0,screen_name,isBot
0,koftecancaddy,No
1,ahaber,No
2,selahat03949652,No
3,erdin06357062,No
4,bhct__necatii,No
...,...,...
2995,djblumenberg,No
2996,mel1sq,No
2997,eren_yz1,Yes
2998,ergnyildiz4,No


In [6]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

### 0.3.3. Expand your dataset with metadata and tweets

In [7]:
# You can also expand training data by downloading your own labeled datasets following the link
# Download the documents under "Link to training data"

print('http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{}.html'.format(PROJECT_CODE))

http://www.onurvarol.com/Annotation-CS412-202201/reports/report_CS4124d092904f01e.html


# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we firstly collect raw data for processing. We then combine some of them (total_interactions = num_favorites + num_retweets) or use them to extract features (whether the tweet has one of the political entities @meralaksener, @kilicdarogluk etc.).

We expect you to collect more raw data from **tweet_metadata**, **user_profiles** and **user_tweets** files by creating a function as shown in below examples such as *check_if_retweet()* and using it while iterating over data as shown under *Merge Collected Features*.

We also expect you to create new variables as much as you can from the data in order to make your predictions more accurate. For example, you may want to check:

- The tweet sources that a user frequently uses
- Whether the user is a verified account or not

...

to assess whether **a user is a bot or not** and whether **a tweet is political or not**.

In [8]:
PATH_TO_DOWNLOADED = DATA_PATH # 'D:/Users/sudekipri/Downloads/data/'

## 1.1. Political Tweet Detection
This part stands for the feature extraction of tweets. We start with collecting the raw data from *tweet_metadata*, then use some of them to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [9]:
def check_if_retweet(tweet_metadata_line):
    is_retweet = 0
    retweeted_username = None

    try:
        tweet_metadata_line['retweeted_status']
        retweeted_username = tweet_metadata_line['retweeted_status']['user']['screen_name'].lower()
        is_retweet = 1

    except KeyError:
        pass

    return is_retweet

#### 1.1.1.2. Get Tweet Text

In [10]:
def get_tweet_text(tweet_metadata_line):
    text = tweet_metadata_line['text']
    
    return text

#### 1.1.1.3. Get Tweet ID

In [11]:
def get_tweet_id(tweet_metadata_line):
    id_str = tweet_metadata_line['id_str']
    
    return id_str

#### 1.1.1.4. Get Number of Mentions and Hashtags

In [12]:
def get_number_mentions_hashtags(tweet_metadata_line):
    num_mentions = len(tweet_metadata_line['entities']['user_mentions'])
    num_hashtags = len(tweet_metadata_line['entities']['hashtags'])

    return num_mentions, num_hashtags

#### 1.1.1.5. Get Number of Retweets and Favorites

In [13]:
def get_number_retweets_favorites(tweet_metadata_line):
    retweet_count = tweet_metadata_line['retweet_count']
    favorite_count = tweet_metadata_line['favorite_count']
    
    return retweet_count, favorite_count

#### 1.1.1.6. Get User Info

In [14]:
def get_user_info(tweet_metadata_line):
    id = tweet_metadata_line['user']['id_str']
    screen_name = tweet_metadata_line['user']['screen_name'].lower()
    description = tweet_metadata_line['user']['description']

    return id, screen_name, description

#### 1.1.1.7. Check if the User is Verified

In [15]:
def check_if_verified(tweet_metadata_line):
    verified = tweet_metadata_line['user']['verified']
    if verified == 'true':
        verified = 1
    else:
        verified = 0
    return verified

#### 1.1.1.8. Check if the Tweet is a Quote Tweet, Reply and Get Number of Friends and Followers

In [16]:
def check_if_quote(tweet_metadata_line):
    quote = tweet_metadata_line['is_quote_status']
    if quote == 'true':
        quote = 1
    else:
        quote = 0
    return quote

In [17]:
def check_if_reply(tweet_metadata_line):
    is_reply = 0
    parent_username = None

    try:
        parent_username = tweet_metadata_line['in_reply_to_screen_name']
        if parent_username:
            parent_username = parent_username.lower()
            is_reply = 1

    except KeyError:
        pass

    return is_reply

In [18]:
def get_friends_followers(tweet_metadata_line):
    friends = tweet_metadata_line['user']['friends_count']
    followers = tweet_metadata_line['user']['followers_count']

    return friends, followers

#### 1.1.1.9. Get the Numbers of the URLs, Symbols and Statuses.... Get Age of the Account, Listed Count and Location

In [19]:
def get_number_urls(tweet_metadata_line):
    num_urls = len(tweet_metadata_line['entities']['urls'])
        
    return num_urls

In [20]:
def get_number_symbols(tweet_metadata_line):
    num_symbols = len(tweet_metadata_line['entities']['symbols'])
        
    return num_symbols

In [21]:
def get_number_statuses(tweet_metadata_line):
    statuses_count = tweet_metadata_line['user']['statuses_count']
        
    return statuses_count

In [22]:
def get_age(tweet_metadata_line):
    str_age = tweet_metadata_line['user']['created_at'] 
    age = datetime.strptime(str_age, '%a %b %d %H:%M:%S %z %Y')
    return age

In [23]:
def get_listed_count(tweet_metadata_line):
    listedCount = tweet_metadata_line['user']['listed_count']
    return listedCount

In [24]:
def get_user_location(tweet_metadata_line):
    location = tweet_metadata_line['user']['location']

    return location.lower()

### 1.1.2. Derive Manually Crafted Features

#### 1.1.2.1. Check for political entity in text, get the numbers and special characters in the username

In [25]:
def check_political_ent(text):
    
    # the list below can be modified and some new names may be added (or removed)
    list_of_entities = ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag',
                        'eczozgurozel', 'ekrem_imamoglu', 'mansuryavas06', 'suleymansoylu', 'T_Karamollaoglu',
                        'erkbas', 'tuncsoyer', 'dbdevletbahceli', 'bybekirbozdag', 'NureddinNebati', 'mkulunk',
                        'drfahrettinkoca', 'alibabacan', 'ahmet_davutoglu', 'erbakanfatih', 'akp', 'chp', 'ak parti', 'mhp',
                        'iyi parti', 'hdp', 'zafer partisi', 'atama', 'atanma', 'ekonomi', 'reis']
    
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

In [26]:
def UsernameNumberCount(displayName):
    numberCount = 0
    
    for i in displayName:
        if i.isnumeric():
            numberCount += 1

    return numberCount

In [27]:
def UsernameSpecialCharacterCount(displayName):
    specialCount = 0
    
    for i in displayName:
        if not i.isalpha():
            specialCount += 1

    return specialCount

#### 1.1.2.2. Number of total interactions, check the activeness of the account, friends to followers ratio

In [28]:
def total_interactions(retweet_count, favorite_count):
    total_num_interactions = retweet_count + favorite_count
    
    return total_num_interactions

In [29]:
def TweetsPerDay(tweetCount, age):
    now = datetime.now(timezone.utc)
    deltaTime = now-age

    return tweetCount/deltaTime.days

In [30]:
def FriendsVFollowers(friends, followers):
    return (friends+1)/(followers+1)

### 1.1.2. Collect data using the functions above and transform into a Pandas DataFrame

In [31]:
dfPolitical = {'tweet_id':[],
              'is_retweet':[],
              'text':[],
              'num_mentions':[],
              'num_hashtags':[],
              'num_retweets':[],
              'num_favorites':[],
              'user_id':[],
              'is_quote':[],
              'num_statuses':[],
              'num_urls':[],
              'num_symbols':[],
              'is_verified':[],
              'user_screen_name':[],
              'friend_ratio':[],
              'account_activeness':[],
              'user_description':[],
              'num_political_entities':[],
              'total_interactions':[],
              'account_age':[],
              'location':[],
              'total_num_in_username':[],
              'total_special_char_in_username':[],
              'tweets_listed_count':[],
              'friends':[],
              'followers':[],
              'is_reply':[]}


with open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons", "rb") as f:
    for line in f:
        line = json.loads(line)
        
        # raw data:
        id_str = get_tweet_id(line)
        is_retweet = check_if_retweet(line)
        text = get_tweet_text(line)
        num_mentions, num_hashtags = get_number_mentions_hashtags(line)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description = get_user_info(line)
        quote = check_if_quote(line)
        location = get_user_location(line)
        num_urls = get_number_urls(line)
        num_symbols = get_number_symbols(line)
        num_statuses = get_number_statuses(line)
        account_age = get_age(line)
        verified = check_if_verified(line)
        friend_count, follower_count = get_friends_followers(line)
        tweets_listed_count = get_listed_count(line)
        is_reply = check_if_reply(line)

        

        # manually crafted data:
        num_political_entities = check_political_ent(text)
        total_num_interactions = total_interactions(retweet_count, favorite_count)
        total_num_in_username = UsernameNumberCount(screen_name)
        friend_ratio = FriendsVFollowers(friend_count, follower_count)
        total_special_char_in_username = UsernameSpecialCharacterCount(screen_name)
        account_activeness = TweetsPerDay(num_statuses, account_age)
        dfPolitical['tweet_id'].append(id_str)
        dfPolitical['is_retweet'].append(is_retweet)
        dfPolitical['text'].append(text)
        dfPolitical['num_mentions'].append(num_mentions)
        dfPolitical['num_hashtags'].append(num_hashtags)
        dfPolitical['num_retweets'].append(retweet_count)
        dfPolitical['num_favorites'].append(favorite_count)
        dfPolitical['user_id'].append(user_id_str)
        dfPolitical['user_screen_name'].append(screen_name)
        dfPolitical['user_description'].append(user_description)
        dfPolitical['num_political_entities'].append(num_political_entities)
        dfPolitical['is_quote'].append(quote)
        dfPolitical['friend_ratio'].append(friend_ratio)
        dfPolitical['num_urls'].append(num_urls)
        dfPolitical['num_symbols'].append(num_symbols)
        dfPolitical['num_statuses'].append(num_statuses)
        dfPolitical['is_verified'].append(verified)
        dfPolitical['location'].append(location)
        dfPolitical['friends'].append(friend_count)
        dfPolitical['followers'].append(follower_count)
        dfPolitical['account_activeness'].append(account_activeness)
        dfPolitical['total_interactions'].append(total_num_interactions)
        dfPolitical['total_num_in_username'].append(total_num_in_username)
        dfPolitical['total_special_char_in_username'].append(total_special_char_in_username)
        dfPolitical['tweets_listed_count'].append(tweets_listed_count)
        dfPolitical['account_age'].append(account_age)
        dfPolitical['is_reply'].append(is_reply)

In [32]:
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,is_quote,num_statuses,num_urls,num_symbols,is_verified,user_screen_name,friend_ratio,account_activeness,user_description,num_political_entities,total_interactions,account_age,location,total_num_in_username,total_special_char_in_username,tweets_listed_count,friends,followers,is_reply
0,1588568792984346624,0,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,0,2638,1,0,0,maviruh_,1.894737,1.373958,shu/\nburaya afilli bir söz yazdığımı varsayın,0,147,2017-10-19 10:43:55+00:00,ankara,0,1,0,539,284,0
1,1588452263047069697,0,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,0,6647,0,0,0,mtfdan,4.469697,1.710499,,0,0,2012-05-31 14:08:26+00:00,,0,0,3,589,131,1
2,1569589330544398336,0,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,0,2924,1,0,0,ahsucilginuzman,1.950820,4.072423,Vatan Sevdalisi,0,0,2021-02-01 22:56:06+00:00,,0,0,0,118,60,0
3,1570428119609139201,0,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,0,1783,1,0,0,hamitelkelle,23.333333,4.692105,HighOne,0,0,2022-01-05 17:08:49+00:00,,0,0,0,69,2,1
4,1551163840368414722,0,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,0,7559,1,0,0,sed58417690,1.391667,26.155709,,1,0,2022-04-07 07:58:42+00:00,,8,8,0,166,119,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,0,41342,1,0,0,ardanzenturk,0.024201,10.578813,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,0,90,2012-05-10 11:47:23+00:00,,0,0,379,4147,171399,0
33529,1584027427696959488,0,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,0,52340,1,0,0,ozgul_61,0.819914,11.415485,Bridge design engineer Yaay hesabı : dilfiruz,1,9,2010-07-03 07:58:43+00:00,payi̇taht 🇹🇷,2,3,4,3227,3936,1
33530,1585945783307730945,0,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,0,1225,0,0,0,ladrekova,3.385246,0.300098,,0,1,2011-11-17 20:15:55+00:00,,0,0,0,412,121,1
33531,1569748909521801221,1,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,0,153819,0,0,0,yapikytgrivrlsn,1.695946,319.790021,,1,6,2021-09-26 13:53:55+00:00,,0,0,0,1003,591,0


## 1.2. From Users

### 1.2.1. Get user metadata from user_profiles.jsons.gz

#### 1.2.1.1. Get user info metadata

In [33]:
def get_user_info_metadata(user_metadata_line):
    
    user_id = user_metadata_line['id_str']
    user_name = user_metadata_line['name']
    user_screen_name = user_metadata_line['screen_name'].lower()
    user_location = user_metadata_line['location']
    user_description = user_metadata_line['description']
    user_followers_count = user_metadata_line['followers_count']
    user_friends_count = user_metadata_line['friends_count']
    user_statuses_count = user_metadata_line['statuses_count']
    user_is_verified = user_metadata_line['verified']
    user_favourites_count = user_metadata_line['favourites_count']
    user_has_default_photo = user_metadata_line['default_profile_image']
    listed_count = user_metadata_line['listed_count']
    
    dictionary = {'user_id':user_id, 'user_name': user_name, 'user_screen_name':user_screen_name, 'user_location':user_location,
     'user_description':user_description, 'user_followers_count':user_followers_count, 'user_has_default_photo': user_has_default_photo, 'user_is_verified':user_is_verified, 'user_friends_count':user_friends_count, 'user_statuses_count': user_statuses_count, 'user_favourites_count': user_favourites_count, 'listed_count': listed_count}

    return dictionary

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [34]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    
    if user_friends_count + user_followers_count == 0:
        followers_all_ratio = 0

    else:
        followers_all_ratio =  user_followers_count / (user_friends_count + user_followers_count)

    return followers_all_ratio

#### 1.2.1.3. Get description length, check if the user has a photo, is verified and get the age

In [35]:
def get_desc_len(user_description):
    
    description_len = len(user_description)

    return description_len

In [36]:
def user_has_photo(user_has_default_photo):
   if user_has_default_photo == 'true':
       user_has_default_photo = 1
   else:
       user_has_default_photo = 0
       
   return user_has_default_photo

In [37]:
def user_is_verified(user_is_verified):
    if user_is_verified == 'true':
        user_is_verified = 1
    else:
        user_is_verified = 0
    return user_is_verified

In [38]:
def get_age_user(tweet_metadata_line):
    str_age = tweet_metadata_line['created_at'] 
    age = datetime.strptime(str_age, '%a %b %d %H:%M:%S %z %Y')

    return age

In [39]:
dfBot = {'user_id':[],
         'user_name':[],
         'user_screen_name':[],
         'user_location':[],
         'user_description':[],
         'user_followers_count':[],
         'user_friends_count':[],
         'user_statuses_count':[],
         'user_favourites_count':[],
         'description_len':[],
         'followers_to_all_ratio':[],
         'user_is_verified':[],
         'user_has_default_photo':[],
         'listed_count':[],
         'total_num_in_username':[],
         'account_age':[],
         'account_activeness':[],
         'UsernameSpecialCharacterCount':[]}

with open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons", "rb") as f:
    for line in f:
        line = json.loads(line)

        dictionary = get_user_info_metadata(line)
        for k,v in dictionary.items():
            dfBot[k].append(v)
        account_age = get_age_user(line)

        dfBot['account_age'].append(account_age)
        
        # manually crafted data:
        description_len = get_desc_len(dictionary['user_description'])
        total_num_in_username = UsernameNumberCount(screen_name)
        followers_all_ratio = get_followers_all_ratio(dictionary['user_followers_count'], 
                                                      dictionary['user_friends_count'])
        account_activeness = TweetsPerDay(dictionary['user_statuses_count'], account_age)
        specialCharacters = UsernameSpecialCharacterCount(dictionary['user_screen_name'])
        
        dfBot['description_len'].append(description_len)
        dfBot['followers_to_all_ratio'].append(followers_all_ratio)
        dfBot['account_activeness'].append(account_activeness)
        dfBot['total_num_in_username'].append(total_num_in_username)
        dfBot['UsernameSpecialCharacterCount'].append(specialCharacters)
        

In [40]:
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,user_statuses_count,user_favourites_count,description_len,followers_to_all_ratio,user_is_verified,user_has_default_photo,listed_count,total_num_in_username,account_age,account_activeness,UsernameSpecialCharacterCount
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,2551,17676,7,0.260000,False,False,0,8,2021-08-27 13:07:30+00:00,4.992172,3
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,42771,15474,19,0.732260,False,False,11,8,2020-09-11 08:45:44+00:00,49.618329,0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,14300,18220,64,0.192308,False,False,0,8,2019-04-10 18:15:31+00:00,10.354815,8
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,21303,26999,65,0.325203,False,False,5,8,2016-01-29 11:01:25+00:00,8.357395,0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,1629,2179,23,0.505051,False,False,0,8,2013-12-01 18:16:41+00:00,0.488163,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,2396,10820,0,0.513453,False,False,0,8,2020-10-26 21:08:22+00:00,2.936275,1
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,75178,36671,100,0.975088,False,False,279,8,2010-02-03 18:39:01+00:00,15.880439,0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,6482,7389,28,0.451362,False,False,0,8,2009-05-01 13:56:23+00:00,1.293296,1
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,121113,140095,116,0.314431,False,False,1,8,2014-03-14 18:05:09+00:00,37.449907,0


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

#### 1.2.2.1. Check ratio of retweets to all tweets

In [41]:
def get_retweet_tweet_ratio(line):
    number_retweets = 0
    number_original_tweets = 0

    for tweet in line['tweets']:
        try:
            tweet['retweeted_status']
            number_retweets += 1
                
        except:
            number_original_tweets += 1
            
    total_tweets = number_retweets + number_original_tweets
    
    if total_tweets == 0:
        retweet_total_ratio = None
    else:
        retweet_total_ratio = number_retweets/(total_tweets)
    
    return retweet_total_ratio

#### 1.2.2.2. Check median number of favorites

In [42]:
def get_median_number_favorites(line):
    num_median_favorites = np.median([tweet['favorite_count'] for tweet in line['tweets']])

    return num_median_favorites

### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [43]:
dfBotTweets = {'user_id':[],
               'retweet_total_ratio':[],
               'num_median_favorites':[],
               'num_of_tweets':[]
              }

i = 0

with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    for line in f:

        line = json.loads(line)

        user_id = line['user_id']
        dfBotTweets['user_id'].append(user_id)
        
        retweet_total_ratio = get_retweet_tweet_ratio(line)
        dfBotTweets['retweet_total_ratio'].append(retweet_total_ratio)
        
        num_median_favorites = get_median_number_favorites(line)
        dfBotTweets['num_median_favorites'].append(num_median_favorites)
        
        dfBotTweets['num_of_tweets'].append(len(line['tweets']))

        i += 1
        if i % 1000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [44]:
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_tweets
0,594642154,0.115000,2.0,200
1,525600289,0.005025,1.0,199
2,931895965501534209,0.900000,0.0,200
3,1591543462746329088,0.185000,0.0,200
4,734801354749796352,1.000000,0.0,200
...,...,...,...,...
28310,1591370361488252928,0.800000,0.0,200
28311,1475272459616235525,0.825000,0.0,200
28312,1096753792731750401,0.051020,1.0,196
28313,1269527617687953409,0.095000,2.0,200


### 1.2.3. Merge dfBot and dfBotTweets

In [45]:
dfBotAll = dfBot.merge(dfBotTweets,
                       how='left')

dfBotAll[['retweet_total_ratio', 'num_median_favorites']] = dfBotAll[['retweet_total_ratio', 'num_median_favorites']].fillna(0)

dfBotAll

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,user_statuses_count,user_favourites_count,description_len,followers_to_all_ratio,user_is_verified,user_has_default_photo,listed_count,total_num_in_username,account_age,account_activeness,UsernameSpecialCharacterCount,retweet_total_ratio,num_median_favorites,num_of_tweets
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,2551,17676,7,0.260000,False,False,0,8,2021-08-27 13:07:30+00:00,4.992172,3,0.395939,0.0,197.0
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,42771,15474,19,0.732260,False,False,11,8,2020-09-11 08:45:44+00:00,49.618329,0,0.125000,0.0,200.0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,14300,18220,64,0.192308,False,False,0,8,2019-04-10 18:15:31+00:00,10.354815,8,0.910000,0.0,200.0
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,21303,26999,65,0.325203,False,False,5,8,2016-01-29 11:01:25+00:00,8.357395,0,0.015306,1.0,196.0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,1629,2179,23,0.505051,False,False,0,8,2013-12-01 18:16:41+00:00,0.488163,0,0.659898,0.0,197.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,2396,10820,0,0.513453,False,False,0,8,2020-10-26 21:08:22+00:00,2.936275,1,0.015000,1.0,200.0
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,75178,36671,100,0.975088,False,False,279,8,2010-02-03 18:39:01+00:00,15.880439,0,0.291457,2.0,199.0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,6482,7389,28,0.451362,False,False,0,8,2009-05-01 13:56:23+00:00,1.293296,1,0.061538,0.0,195.0
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,121113,140095,116,0.314431,False,False,1,8,2014-03-14 18:05:09+00:00,37.449907,0,0.995000,0.0,200.0


# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

### 2.1.1. Merge dfPolitical data with labels

In [46]:
dfPoliticalAll_train = dfPolitical.merge(trainingTweetDf,
                                         on='tweet_id')

dfPoliticalAll_train.head()

Unnamed: 0,tweet_id,is_retweet,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,is_quote,num_statuses,num_urls,num_symbols,is_verified,user_screen_name,friend_ratio,account_activeness,user_description,num_political_entities,total_interactions,account_age,location,total_num_in_username,total_special_char_in_username,tweets_listed_count,friends,followers,is_reply,isPolitical
0,1585955683513798656,0,@AvOzlemZengin YüzüncüYıla YakışanGenelAf adli...,1,0,3,2,1564992353168941058,0,25730,0,0,0,zehra78231638,0.513514,181.197183,,0,5,2022-08-31 15:04:18+00:00,,8,8,0,113,221,1,Yes
1,1597631718479261696,0,#TCYüzyılıÜcretliÖgrtKadro\n#TCYüzyılıÜcretliÖ...,0,2,30,28,1324630334416297985,0,12427,1,0,0,nurozguler,0.923246,15.418114,,0,58,2020-11-06 08:31:04+00:00,"i̇zmir, türkiye",0,0,0,420,455,0,Yes
2,1572522789948751874,0,Ekrem İmamoğlu davayı değerlendirdi. 'Boş işle...,0,0,5,66,407597071,0,161123,1,0,0,onediocom,1.8e-05,39.375122,Türkiye'nin ilk ve tek sosyal içerik sitesi ht...,0,71,2011-11-08 08:39:55+00:00,türkiye,0,0,679,12,735643,0,Yes
3,1591412481561624577,0,Sayın Bakanım @suleymansoylu POMEM önlisans er...,1,0,0,0,1394789887073738753,0,1595,1,0,0,buckybarnestr,4.181818,2.606209,...,1,0,2021-05-18 23:00:15+00:00,,0,0,0,45,10,0,Yes
4,1596914274907348992,0,"@varank Sayın bakanım, Bodrumdaki bu araziyi ...",1,0,0,0,1586083256088371201,0,1251,0,0,0,sayariahmet,1.0,14.892857,,0,0,2022-10-28 19:52:07+00:00,,0,0,0,6,6,1,Yes


### 2.1.2. Separate X and y values
We only use 3 features here to create a baseline model. However, it is not enough to get good results.

In [47]:
X = dfPoliticalAll_train[['num_political_entities','total_interactions','num_hashtags', 'num_symbols', 'num_urls', 'num_mentions', 'num_retweets', 'num_favorites', 'friends', 'followers', 'total_num_in_username', 'tweets_listed_count', 'num_statuses', 'is_verified', 'is_quote', 'is_reply', 'is_retweet', 'account_activeness', 'friend_ratio']]
y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

### 2.1.3. Train - validation split

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.1.4. Train the model

Here, you may use different models such as neural networks, XGBoost, AdaBoost, RandomForest, Linear Regression, Logistic Regression etc. to see which model does the best. Also, you can use grid_search_cv() or a basic for loop to optimize the hyperparameters of your model.

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, make_scorer
import xgboost as xgb

mse = make_scorer(mean_squared_error, greater_is_better=False)
params = {
    'max_depth': range(3, 10, 2),
    'min_child_weight':range(1,6,2),
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'gamma':[i/10.0 for i in range(0,5)]
}

# create an instance
xgb_reg = xgb.XGBRegressor(
    objective = 'binary:logistic'
)

# grid search the model
grid_search_political = GridSearchCV(estimator = xgb_reg, param_grid= params, n_jobs = 4, cv = 5, verbose = True, scoring=mse)

# fit your model
grid_search_political.fit(X_train, y_train)

grid_search_political.best_estimator_

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


In [54]:
# make predictions
preds = grid_search_political.predict(X_valid)

# evaluate on validation set
mse_political = mean_squared_error(y_valid, preds)

print("MSE:", mse_political)

MSE: 0.16270240423864488


## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [55]:
dfBotAll.user_screen_name = dfBotAll.user_screen_name.str.lower()

In [56]:
dfBotAll_train = dfBotAll.merge(trainingUserDf,
                               left_on='user_screen_name',
                               right_on='screen_name')

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,user_statuses_count,user_favourites_count,description_len,followers_to_all_ratio,user_is_verified,user_has_default_photo,listed_count,total_num_in_username,account_age,account_activeness,UsernameSpecialCharacterCount,retweet_total_ratio,num_median_favorites,num_of_tweets,screen_name,isBot
0,1512081815292432394,sezgin,sezgin953116371,,,46,430,1535,9627,0,0.096639,False,True,0,8,2022-04-07 14:56:21+00:00,5.329861,9,0.050251,0.0,199.0,sezgin953116371,No
1,1425452291428077571,Adem Koç,gogoadem61,,,14,171,113,74,0,0.075676,False,False,0,8,2021-08-11 13:42:03+00:00,0.214421,2,0.761062,0.0,113.0,gogoadem61,No
2,328164303,Necmettin Balıkçı,dewil511,,,21,49,219,25,0,0.300000,False,False,1,8,2011-07-02 21:19:26+00:00,0.051896,3,0.010101,0.0,198.0,dewil511,Yes
3,1343666971368431622,Night Bird⁷🦉,midnight__bird,,"La vie est un sommeil, l’amour en est le rêve...",422,260,15191,84933,48,0.618768,False,False,9,8,2020-12-28 21:16:19+00:00,20.173971,2,0.085000,1.0,200.0,midnight__bird,No
4,1240932880488038400,Samed Pınarcı,samedpinarci,,Orman Mühendisi - Orman İşletme Şefi - Orman G...,133,202,3734,15783,60,0.397015,False,False,0,8,2020-03-20 09:27:20+00:00,3.600771,0,0.780000,0.0,200.0,samedpinarci,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,5288,5347,0,0.712264,False,False,0,8,2020-07-24 21:08:34+00:00,5.810989,4,0.000000,1.0,200.0,anka6054,No
2996,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,311,196,0,0.320370,False,False,0,8,2022-11-30 19:13:03+00:00,6.098039,0,0.580000,0.0,200.0,atamabekleyenzz,No
2997,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,1029,725,71,0.498215,False,False,0,8,2016-08-01 22:06:45+00:00,0.435463,0,0.040000,36.0,200.0,memrahinci,No
2998,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,18,38,26,0.090909,False,False,0,8,2022-08-01 05:19:56+00:00,0.104046,2,0.769231,0.0,13.0,muratkkk18,No


In [57]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

### 2.2.2. Separate X and y values
We use only 4 features here to create a baseline model. However, it is not enough to get good results.

In [58]:
X = dfBotAll_train[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites', 'user_followers_count', 'user_friends_count', 'num_of_tweets', 'user_statuses_count', 'user_favourites_count', 'user_has_default_photo', 'user_is_verified', 'listed_count', 'UsernameSpecialCharacterCount']]
y = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)

### 2.2.3. Train-test split

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.2.4. Train the model

In [60]:
mse = make_scorer(mean_squared_error, greater_is_better=False)
params = {
    'max_depth': range(3, 10, 2),
    'min_child_weight':range(1,6,2),
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'gamma':[i/10.0 for i in range(0,5)]
}

# create an instance
xgb_reg = xgb.XGBRegressor(
    objective='binary:logistic'
)

# grid search the model
grid_search_bot = GridSearchCV(estimator = xgb_reg, param_grid= params, n_jobs = 4, cv = 5, verbose = True, scoring=mse)

# fit your model
grid_search_bot.fit(X_train, y_train)

grid_search_bot.best_estimator_

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


In [61]:
# make predictions
preds = grid_search_bot.predict(X_valid)

mse_bot = mean_squared_error(y_valid, preds)

print("MSE:", mse_bot)

MSE: 0.12759678299948457


# 3. MAKE PREDICTIONS

Here, you will make predictions with the models that you have trained above.

## 3.1. Predictions for Tweets (Political or Not)

In [62]:
# read the evaluation file as follows
evaluationTweetDf = pd.read_csv('{}evaluation-round{}-tweet.csv'.format(DATA_PATH, ROUND), dtype={0: str}, header=None, names=['tweet_id'])
evaluationTweetDf = evaluationTweetDf.dropna()
evaluationTweetDf

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfPolitical_test = dfPolitical.merge(evaluationTweetDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfPolitical_test[['num_political_entities','total_interactions','num_hashtags', 'num_symbols', 'num_urls', 'num_mentions', 'num_retweets', 'num_favorites', 'friends', 'followers', 'total_num_in_username', 'tweets_listed_count', 'num_statuses', 'is_verified', 'is_quote', 'is_reply', 'is_retweet', 'account_activeness', 'friend_ratio']]

# make predictions based on these variables
predictions_political = grid_search_political.predict(X)
#predictions_political_binary = (predictions_political > 0.5).astype(int)

### This part is important! We expect you to return your predictions in the following format:

In [63]:
modelPredTweet = dict([(x,float(y)) for x,y in zip([*dfPolitical_test.tweet_id], predictions_political)])
modelPredTweet

{'1434787703783051264': 0.08863162994384766,
 '1367571642604544000': 0.552153468132019,
 '1589993032975544320': 0.827063262462616,
 '1565312596135354373': 0.959409773349762,
 '1579558096833511424': 0.9622066617012024,
 '1439547067337256967': 0.6179110407829285,
 '1559963768372740098': 0.9744182825088501,
 '1562853131251118081': 0.7711088061332703,
 '1586021183958704128': 0.37216004729270935,
 '1585766233491886081': 0.9466023445129395,
 '1427746815420604417': 0.13753816485404968,
 '1352635736537882629': 0.9192801713943481,
 '1415032260571680768': 0.42709314823150635,
 '1548636597628899328': 0.976285457611084,
 '1564926450096013313': 0.20389819145202637,
 '1585634359612420101': 0.9442832469940186,
 '1597138789108895744': 0.6992308497428894,
 '1391681495622995971': 0.05149704962968826,
 '1389951943343316995': 0.09824992716312408,
 '1452348722810138646': 0.7927939891815186,
 '1595829502021623812': 0.7209789156913757,
 '1413108476348354562': 0.09835157543420792,
 '1579408398894137344': 0.56

## 3.2. Predictions for Users (Bot or Not)

In [64]:
evaluationUserDf = pd.read_csv('{}evaluation-round{}-user.csv'.format(DATA_PATH, ROUND), dtype={0: str}, header=None, names=['user_screen_name'])
evaluationUserDf = evaluationUserDf.dropna()

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfBot_test = dfBotAll.merge(evaluationUserDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfBot_test[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites', 'user_followers_count', 'user_friends_count', 'num_of_tweets', 'user_statuses_count', 'user_favourites_count', 'user_has_default_photo', 'user_is_verified', 'listed_count', 'UsernameSpecialCharacterCount']]

# make predictions based on these variables
predictions_bot = grid_search_bot.predict(X)
#predictions_bot_binary = (predictions_bot > 0.5).astype(int)

In [65]:
modelPredUser = dict([(x,float(y)) for x,y in zip([*dfBot_test.user_screen_name], predictions_bot)])
modelPredUser

{'biologselim': 0.5778451561927795,
 'omerakdag34': 0.026165897026658058,
 'bilgin21604923': 0.39957481622695923,
 '_sydneycarton_': 0.030275676399469376,
 'denizlihabercom': 0.01461754459887743,
 'burakerbaychp': 0.0027628252282738686,
 'mvnez': 0.028512228280305862,
 'qara118': 0.015255783684551716,
 'nabiyonyevrum': 0.02010774053633213,
 'farukhalit2': 0.06682301312685013,
 'harlunoshi': 0.12569516897201538,
 'heritagepaix': 0.017760789021849632,
 'nuranwolf': 0.024941593408584595,
 'politikgundem': 0.18848557770252228,
 'isakethudax': 0.0036067632026970387,
 'enveraysevera': 0.07026198506355286,
 'ilaydejaneiro': 0.12816491723060608,
 '1905anason': 0.402643620967865,
 'eraydurgut03': 0.055439818650484085,
 'dasiskein': 0.0673208013176918,
 'ercan_bas29': 0.11211752146482468,
 'mett_1907': 0.026196470484137535,
 'ondemir066': 0.29389244318008423,
 'semihyeteer': 0.16124778985977173,
 'haberinyokcokk': 0.025056779384613037,
 'meleky_ozaydin': 0.042539145797491074,
 'mehmetaltay64': 0

# PREPARE SUBMISSION

You will need to submit exact same file produced by using the following code. Any deviation from the desired format willbe marked as 0.

In [66]:
# Explain your approach

data_explanations = ''' The collection and the organization of data were already performed. During the annotations, I performed data acquisition. Here on the notebook, we removed the missing (NA) values. I converted is_verified, is_quote feautures and is_reply to booleans. If it's verified the dataset gives a true but I decided to use 1 for true in order to normalize the data to a common scale and transformed these true/false values into a suitable format. I performed this method for other true/false values. After that I split the data into training and testing sets. The portions are mentioned on the model explanation part. I performed data splitting: for political tweet detection, X training values = 'num_political_entities','total_interactions','num_hashtags', 'num_symbols', 'num_urls', 'num_mentions', 'num_retweets', 'num_favorites', 'friends', 'followers', 'total_num_in_username', 'tweets_listed_count', 'num_statuses', 'is_verified', 'is_quote', 'is_reply', 'is_retweet', 'account_activeness', 'friend_ratio' and Y = isPolitical. For bot detection, X training values = 'description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites', 'user_followers_count', 'user_friends_count', 'num_of_tweets', 'user_statuses_count', 'user_favourites_count', 'user_has_default_photo', 'user_is_verified', 'listed_count', 'UsernameSpecialCharacterCount' and Y = isBot. '''

feature_explanations = ''' I created more features: is_verified, is_quote functions to see if the user is verified and the tweet is a quote tweet. I added more political words and names to political entity list. Furthermore, I defined is_reply, account_age, account_activeness, friends, followers, total_num_in_username, usernamespecialcharactercount, friend_ratio, and tweets_listed_count functions to detect various features and extract them. I extracted the number of URLs, symbols in the tweets to use along with mentions and hashtags. I also extracted the number of statuses of users to use it on bot detection part. For the user data, I extracted followers_count, friends_count, num_statuses and favorites_count of the users. I extracted features from the user metadata too. I defined user_has_default_photo function because most bots have the default twitter profile image. I used the number of retweets and favorites. I also defined a function to check the numbers in the username because it gives an idea about whether it's a bot account. '''

model_explanations = ''' I split the test set by 0.2 because it's the optimal ratio. Then I tried using various machine learning algorithms such as Decision Tree, grid search cv, AdaBoost, XGBoost, Linear Regression, Logistic Regression and so on. I got the highest score with XGBoost for political tweet detection and bot detection. This step involved selecting the appropriate algorithm to achieve the highest accuracy. I also tried and selected different hyperparameters such as learning rate while using some algorithms. After that, I evaluated the model on the validation set to get an idea of how well the model is able to generalize to new data. Whenever the model's performance is not satisfactory, I went back to the step of trying ML models and adjusting the parameters or choosing a different algorithm. I repeated until I got a high accuracy score. I performed grid search cross validation for hyperparameter optimization using XGBoost which is a model for binary classification. I defined the obejective as binary logistic. '''

additional_explanations = ''' I got the predictions as float numbers in the range [0, 1] to get more accurate predictions. I tried to analyze the data and add features as much as I can and I used the same way of thinking I used on the annotations to train a model and get a high accuracy score. Political Tweet Detection: 0.85 accuracy. Bot Detection: 0.88 '''


In [67]:
predictions = {
    'round': ROUND,
    'student_id': STUDENT_ID,
    'user_predictions': modelPredUser,
    'tweet_predictions': modelPredTweet,
    'explanations': {
        'data': data_explanations,
        'feature': feature_explanations,
        'model': model_explanations,
        'other': additional_explanations,
    }
}


with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'w') as fl:
    fl.write(json.dumps(predictions, indent=4))

In [68]:
# Test your submission file

submission = json.load(open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'r'))
submission

{'round': 3,
 'student_id': '28368',
 'user_predictions': {'biologselim': 0.5778451561927795,
  'omerakdag34': 0.026165897026658058,
  'bilgin21604923': 0.39957481622695923,
  '_sydneycarton_': 0.030275676399469376,
  'denizlihabercom': 0.01461754459887743,
  'burakerbaychp': 0.0027628252282738686,
  'mvnez': 0.028512228280305862,
  'qara118': 0.015255783684551716,
  'nabiyonyevrum': 0.02010774053633213,
  'farukhalit2': 0.06682301312685013,
  'harlunoshi': 0.12569516897201538,
  'heritagepaix': 0.017760789021849632,
  'nuranwolf': 0.024941593408584595,
  'politikgundem': 0.18848557770252228,
  'isakethudax': 0.0036067632026970387,
  'enveraysevera': 0.07026198506355286,
  'ilaydejaneiro': 0.12816491723060608,
  '1905anason': 0.402643620967865,
  'eraydurgut03': 0.055439818650484085,
  'dasiskein': 0.0673208013176918,
  'ercan_bas29': 0.11211752146482468,
  'mett_1907': 0.026196470484137535,
  'ondemir066': 0.29389244318008423,
  'semihyeteer': 0.16124778985977173,
  'haberinyokcokk': 