## 3. make conversation list
Create a conversation pickle file that contains all conversations according to the definition  
**input**: database  
**output**: conversation pickle file

In [None]:
import pandas as pd
import sqlite3
import time
import pickle
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Twitter user id -> airline name. A tweet counts as "from an airline"
# when its user id is one of these keys.
airlines = {
    56377143: 'KLM',
    106062176: 'Air France',
    18332190: 'British Airways',
    22536055: 'American Airlines',
    124476322: 'Lufthansa',
    26223583: 'Air Berlin',
    2182373406: 'Air Berlin Assist',
    38676903: 'easyJet',
    1542862735: 'Ryanair',
    253340062: 'Singapore Airlines',
    218730857: 'Qantas',
    45621423: 'Etihad Airways',
    20626359: 'Virgin Atlantic',
}

# Path of the SQLite database (.db) the tweets are read from.
file = 'core.db'

In [None]:
def LoadDatabase(file, all_tweets):
    '''in: database file (.db), all_tweets (boolean, True or False)
      out: a dataframe indexed by tweet id, with either all tweets or only the tweets that are replies

    This function loads the 'twitter' table of a SQLite database into a Pandas DataFrame.
    With all_tweets=True it selects id, created_at and text for every tweet. With all_tweets=False
    it selects only tweets whose in_reply_to_status_id is neither NULL nor empty, and additionally
    returns user_id, in_reply_to_status_id and in_reply_to_user_id.

    The elapsed time is printed. The database connection is always closed before returning,
    also when the query fails.
    '''
    stamp1 = time.time()

    query_all = """
    select id, created_at, text
    from twitter
    """

    # The empty string is written with single quotes: a double-quoted "" is an identifier in SQL
    # and is only accepted as a string by SQLite through a legacy fallback that can be disabled.
    query_replies = """
    select id, user_id, in_reply_to_status_id, in_reply_to_user_id, created_at, text
    from twitter
    where in_reply_to_status_id IS NOT NULL AND in_reply_to_status_id != ''
    """

    query = query_all if all_tweets else query_replies

    conn = sqlite3.connect(file)
    try:
        dataframe = pd.read_sql(query, conn)
    finally:
        # sqlite3 connections are not closed automatically; release the file handle explicitly.
        conn.close()

    dataframe.set_index('id', inplace=True)

    stamp2 = time.time()
    print('{:>7.3f} (read in database)'.format(stamp2-stamp1))

    return dataframe

In [None]:
def BinarySearch(alist, item):
    '''in: a list of conversations ordered by the id of their first tweet, the number to find
          (in our case, 'in_reply_to_status_id')
      out: (found, index). found is a boolean. If found is True, index is the position of the
           matching conversation; otherwise it is the last position that was inspected
           (0 for an empty list).

    This function performs binary search on an ordered list to find an element in O(log n) time.
    Please note that the elements of the list are conversations, which consist of tweets. The function
    compares against the first field of the first tweet of every conversation (alist[i][0][0]).
    '''
    first = 0
    last = len(alist) - 1
    # Initialised up front so that an empty list returns (False, 0) instead of
    # raising UnboundLocalError on the return statement.
    midpoint = 0

    while first <= last:
        midpoint = (first + last) // 2
        key = alist[midpoint][0][0]
        if key == item:
            return True, midpoint
        if item < key:
            last = midpoint - 1
        else:
            first = midpoint + 1

    return False, midpoint

In [None]:
def MakeConversations(tweets, *lowest_tweet_id):
    '''in: a dataframe of reply tweets indexed by tweet id, optionally the lowest tweet id to consider
      out: list of basic 'conversations'

    The rows are visited in dataframe order. When a tweet replies to another tweet, the following is added
    to the conversations list: [(current_tweet_id, current_user_id, created_at, text), (mentioned_tweet_id, mentioned_user_id)]
    or as an example: [(34839, 12, 23rd May 12:34, 'your customer service is bad'), (32322, 14)]

    When the BinarySearch function finds a conversation that starts with the mentioned tweet, the current tweet
    (current_tweet_id, current_user_id, created_at, text) is inserted at the front of that conversation and the
    conversation is moved to the end of the list. For example, the earlier mentioned list now becomes:
    [(35997, 124476322, 23rd May 12:51, 'Thank you for your message, we are aware of that.'),
     (34839, 12, 23rd May 12:34, 'your customer service is bad'), (32322, 14)]

    A tweet is only considered if the id of the mentioned tweet is at least the LOWEST tweet id in the given set,
    because there is no point in looking for tweet 5429 when the lowest in your set is 9000. If lowest_tweet_id is
    not passed, the index of the first row is used. Also note that the mentioned tweet lacks information. This
    information is looked up in another function. In every conversation, there is exactly one tweet lacking
    this information (the last one).

    NOTE: this assumes the rows arrive in ascending tweet id order; BinarySearch needs the conversations list to
    stay ordered by the id of each conversation's first tweet, and the default lowest id is read from the first row.
    '''
    stamp1 = time.time()

    if lowest_tweet_id:
        # Extra positional arguments arrive as a tuple; comparing a number with that tuple
        # raises TypeError, so take the value out of it.
        threshold = lowest_tweet_id[0]
    elif len(tweets) > 0:
        threshold = int(tweets.head(n=1).index.values[0])
    else:
        # Empty dataframe: there is no first row to read, and the loop below does not run.
        threshold = None

    convos = []

    for tweet_id, tweet in tweets.iterrows():
        reply_to_id = tweet['in_reply_to_status_id']
        if reply_to_id >= threshold:
            current = (tweet_id, tweet['user_id'], tweet['created_at'], tweet['text'])

            # Nothing can be found in an empty list, so the search is skipped in that case.
            if convos:
                found, position = BinarySearch(convos, reply_to_id)
            else:
                found, position = False, 0

            if found:
                # The conversation now starts with the newest tweet, so it belongs at the end.
                thread = convos.pop(position)
                thread.insert(0, current)
                convos.append(thread)
            else:
                convos.append([current, (reply_to_id, tweet['in_reply_to_user_id'])])

    stamp2 = time.time()
    print('{:>7.3f} (make conversations list)'.format(stamp2-stamp1))

    return convos

In [None]:
def FilterConversations(all_conversations):
    '''in: a list of unprocessed 'conversations' (see MakeConversations() function)
      out: a list with only the conversations that satisfy the airline rule

    A conversation is kept when it has more than 2 entries and at least one entry in its middle part
    (everything except the first and the last entry) was written by a user id that is a key of the
    global 'airlines' dictionary, e.g.;
    [some other stuff]* ([some user] [airline] [some user])^1 [some other stuff]*
    Only the middle part is required. The elapsed time is printed.
    '''
    started = time.time()

    def airline_in_middle(thread):
        # Drop the first and the last entry, then look for an airline user id (second field).
        return any(int(entry[1]) in airlines for entry in thread[1:-1])

    kept = [
        thread
        for thread in all_conversations
        if len(thread) > 2 and airline_in_middle(thread)
    ]

    finished = time.time()
    print('{:>7.3f} (filter conversations list)'.format(finished-started))
    return kept

In [None]:
def CompleteTexts(conversations, all_tweets):
    '''in: a list of filtered conversations, a dataframe of all the tweets (indexed by tweet id)
      out: the same list, with every 2-field entry extended by (created_at, text)

    MakeConversations() leaves entries of the form (tweet_id, user_id) without date and text.
    This function looks those tweets up in all_tweets and appends the missing fields. A conversation
    whose incomplete tweet cannot be found in all_tweets is removed. The list is changed in place
    and also returned. The elapsed time is printed.
    '''
    started = time.time()
    unresolved = []

    for convo_pos, convo in enumerate(conversations):
        for entry_pos, entry in enumerate(convo):
            # Only entries that still miss created_at and text need a lookup.
            if len(entry) != 2:
                continue
            try:
                row = all_tweets.loc[entry[0]]
                body = row['text']
                posted = row['created_at']
                convo[entry_pos] = entry + (posted, body)
            except KeyError:
                # The tweet is not in the dataframe: drop the whole conversation later.
                unresolved.append(convo_pos)
                break

    # Positions were collected in ascending order; delete from the back so they stay valid.
    for convo_pos in reversed(unresolved):
        del conversations[convo_pos]

    finished = time.time()
    print('{:>7.3f} (complete texts)'.format(finished-started))

    return conversations

In [None]:
def AddScores(conversations):
    '''in: a list of completed filtered conversations
      out: the same list, where every tweet tuple has its sentiment score appended

    This function uses the Vader package: for every tweet, the 'compound' value of
    polarity_scores() on the tweet text (field 3) is appended to the tweet tuple.
    The list is changed in place and also returned. The elapsed time is printed.
    '''
    started = time.time()
    vader = SentimentIntensityAnalyzer()

    for convo in conversations:
        for position, entry in enumerate(convo):
            compound = vader.polarity_scores(entry[3])['compound']
            convo[position] = entry + (compound,)

    finished = time.time()
    print('{:>7.3f} (adding scores to {} conversations)'.format(finished - started, len(conversations)))

    return conversations

In [None]:
def Combine(file):
    '''in: a database file (.db)
      out: a list of conversations with sentiment scores

    Runs the whole pipeline so that only a database file is needed: load all tweets and the reply
    tweets, build the conversations, filter them, complete the missing texts and add the scores.
    The total elapsed time is printed.

    According to the original authors this takes approximately 10 minutes for the whole database
    on a slow laptop.
    '''
    started = time.time()

    # Both views of the table are needed: replies to build the threads,
    # all tweets to look up the tweet each thread started from.
    every_tweet = LoadDatabase(file, True)
    replies = LoadDatabase(file, False)

    threads = MakeConversations(replies)
    airline_threads = FilterConversations(threads)
    completed = CompleteTexts(airline_threads, every_tweet)
    scored = AddScores(completed)

    finished = time.time()
    print('{:>7.3f} (total time)\n'.format(finished - started))

    return scored

In [None]:
# Run the whole pipeline on the database named by `file` and keep the scored conversations.
conversations_with_scores = Combine(file)

In [None]:
# Store the conversations list in a pickle file in the obj folder.
# The `with` block closes the file, so the data is flushed to disk and the handle is released.
with open("obj/conversations_with_scores.p", "wb") as pickle_file:
    pickle.dump(conversations_with_scores, pickle_file)

In [None]:
# Completion marker printed by the last cell.
print('Done')