### Task 2: Top 10 hashtags and users based on their number of tweets in your data set.

#### Imports

In [1]:
# Imports
import pandas as pd

#### Utility functions

In [2]:
# Count one more occurrence of a key in a dictionary
def addToDict(hashtags, hash):
    """Increment the counter stored under ``hash`` in ``hashtags``.

    A key that is not present yet is created with a count of 1. The
    dictionary is modified in place and nothing is returned.
    """
    hashtags[hash] = hashtags.get(hash, 0) + 1

# Parse a tweet and add the hashtags to the hashtags dictionary
def parse_tweet_for_hashtags(hashtags, tweet):
    """Count every hashtag found in ``tweet`` into ``hashtags``.

    A hashtag starts at a '#' and runs until the next terminator character
    (punctuation, whitespace, another '#', ...) or the end of the tweet.
    The stored key keeps its leading '#' and its original casing.

    Args:
        hashtags: dictionary mapping hashtag -> count, updated in place.
        tweet: text of the tweet. Anything that is not a string (for
            example the float NaN pandas yields for an empty CSV cell)
            is ignored.

    Returns:
        None. Results are accumulated in ``hashtags``.
    """
    # Nothing to scan in a missing / non-text value
    if not isinstance(tweet, str):
        return

    # Characters that end a hashtag (a set gives O(1) membership tests)
    terminators = frozenset([',', '.', '!', '?', ':', ';', '"', '#', '\'', '\\', '/', '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '+', '=', '{', '}', '[', ']', '<', '>', '~', '`', ' ', '\n', '\t', '\xa0'])

    def flush(chars):
        # A lone "#" with no text behind it is not a hashtag
        if len(chars) > 1:
            tag = "".join(chars)
            hashtags[tag] = hashtags.get(tag, 0) + 1

    current = None  # Characters of the hashtag being parsed, None when outside a hashtag
    for letter in tweet:

        # A '#' always begins a new hashtag; it also ends the one in progress
        # so that "#a#b" yields "#a" and "#b" rather than a single "#a#b"
        if letter == "#":
            if current is not None:
                flush(current)
            current = ["#"]

        # Not parsing a hashtag
        elif current is None:
            continue

        # End parsing a hashtag
        elif letter in terminators:
            flush(current)
            current = None

        # Continue parsing a hashtag
        else:
            current.append(letter)

    # If the tweet ended with a hashtag, add it to the hashtags dictionary
    if current is not None:
        flush(current)

# Count the hashtags of every tweet in the given datasets
def get_hashtags(datasets):
    """Build a hashtag -> count dictionary over all the given datasets.

    For each dataset name, ``data/<dataset>/tweets.csv`` is read and every
    value of its 'text' column is handed to ``parse_tweet_for_hashtags``.

    Returns:
        A new dictionary with the accumulated counts (empty when
        ``datasets`` is empty).
    """
    counts = {}

    for name in datasets:
        csv_path = "data/" + name + "/tweets.csv"
        texts = pd.read_csv(csv_path)['text']

        # Each tweet adds its own hashtags to the shared dictionary
        for text in texts:
            parse_tweet_for_hashtags(counts, text)

    return counts

# Count the number of tweets per author id
def get_author_ids(datasets):
    """Build an author id -> tweet count dictionary over all the datasets.

    For each dataset name, ``data/<dataset>/tweets.csv`` is read and every
    value of its 'author_id' column counts as one tweet for that author.

    Returns:
        A new dictionary with the accumulated counts (empty when
        ``datasets`` is empty).
    """
    tweet_counts = {}

    for name in datasets:
        csv_path = "data/" + name + "/tweets.csv"
        author_column = pd.read_csv(csv_path)['author_id']

        # One row of tweets.csv == one tweet of that author
        for author_id in author_column:
            tweet_counts[author_id] = tweet_counts.get(author_id, 0) + 1

    return tweet_counts

# Look up an author by id in the authors files of the datasets
def author_by_id(id, datasets):
    """Find the rows of an authors file whose 'id' column equals ``id``.

    The datasets are searched in the given order, reading
    ``data/<dataset>/authors.csv`` for each one on every call.

    Returns:
        The DataFrame of matching rows from the first dataset that has at
        least one match, or None when no dataset contains the id.
    """
    for name in datasets:
        table = pd.read_csv("data/" + name + "/authors.csv")
        matches = table[table['id'] == id]

        # Try the next dataset when this one does not know the id
        if len(matches) == 0:
            continue
        return matches

    return None

# Return the 10 (key, value) pairs with the largest values
def top10(dict):
    """Return up to ten ``(key, value)`` tuples, largest value first.

    Pairs with equal values keep the order they have in the dictionary.
    Fewer than ten pairs are returned when the dictionary is smaller.
    """
    ranked = list(dict.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:10]

#### Main functions

In [3]:
# Print the top 10 hashtags in the given datasets
def print_top_10_hashtags(datasets):
    """Print the most used hashtags of the given datasets, one per line.

    Each line has the form ``<rank> : <hashtag> : <count>``. When the
    datasets contain fewer than 10 distinct hashtags, only the available
    ones are printed.

    Args:
        datasets: names of the datasets to read (see ``get_hashtags``).
    """
    # Get the hashtags and their counts, then keep the 10 most frequent
    top = top10(get_hashtags(datasets))

    # Print the top 10 hashtags
    print("Top 10 hashtags:")
    # Iterate over what top10 actually returned: indexing range(10) would
    # raise IndexError when there are fewer than 10 hashtags
    for rank, (hashtag, count) in enumerate(top, start=1):
        print(rank, ":", hashtag, ":", count)

# Print the top 10 authors in the given datasets
def print_top_10_authors(datasets):
    """Print the authors with the most tweets in the given datasets.

    Each line has the form ``<rank> : <name> (@<username>) : <count>``.
    When the datasets contain fewer than 10 distinct authors, only the
    available ones are printed. An author id that cannot be found in any
    authors file is printed as ``<unknown author <id>>`` instead of
    aborting the whole listing.

    Args:
        datasets: names of the datasets to read (see ``get_author_ids``
            and ``author_by_id``).
    """
    # Get the authors and their counts, then keep the 10 most active
    top = top10(get_author_ids(datasets))

    # Print the top 10 authors
    print("Top 10 authors:")
    # Iterate over what top10 actually returned: indexing range(10) would
    # raise IndexError when there are fewer than 10 authors
    for rank, (author_id, count) in enumerate(top, start=1):
        author = author_by_id(author_id, datasets)

        # author_by_id returns None when no authors file knows this id
        if author is None:
            print(rank, " : <unknown author ", author_id, "> : ", count, sep="")
            continue

        print(rank, " : ", author['name'].values[0], " (@", author['username'].values[0], ") : ", count, sep="")

#### Example usage

In [4]:
# Print the top 10 hashtags, then the top 10 authors, of the "Paris" dataset
# (the helpers read data/Paris/tweets.csv and data/Paris/authors.csv)
datasets = ["Paris"]
print_top_10_hashtags(datasets)
print("\n")  # Blank lines separating the two listings
print_top_10_authors(datasets)

Top 10 hashtags:
1 : #Paris : 1003
2 : #paris : 484
3 : #France : 170
4 : #london : 90
5 : #france : 80
6 : #Marseille : 80
7 : #Toulouse : 76
8 : #DLP : 75
9 : #Berlin : 67
10 : #London : 65


Top 10 authors:
1 : DLP Stats (@DlpStats) : 56
2 : Nabaat (@Nahbaat) : 51
3 : Paris d'autrefois (@ParisAntan) : 42
4 : Radio75.org // Xv3radio18.com (@Xv3Radio75) : 29
5 : SmoothJazzNola (@smoothjazznola) : 28
6 : Chicago Secret Society (@Chicago________) : 19
7 : Valurank (@valurank) : 16
8 : Dimitra Kourkoulakou (@dkourkoulakou) : 16
9 : Paris Liebe (@liebe_paris) : 15
10 : Doffou Radio Bordeaux (@DoffouRadio) : 12
