# Functions
These are all the functions that will be used throughout the project.

In [1]:
def load_album_json(album_names_list, artist_name, dir_path, ori_path):
    '''look up each album with the module-level ``genius`` client and save its lyrics in dir_path

    Every album is saved through ``save_lyrics`` under the name
    "<year> <month> <day> <album name>", built from the album's
    ``release_date_components``.

    Args:
        album_names_list -- iterable of album names to look up
        artist_name      -- artist name handed to ``genius.search_album``
        dir_path         -- directory the process changes into before saving
        ori_path         -- directory the process changes back to afterwards

    Return:
        None
    '''
    os.chdir(dir_path)
    try:
        for alb_name in album_names_list:
            alb_obj = genius.search_album(alb_name, artist_name)
            released = alb_obj.release_date_components
            alb_date = " ".join(
                str(part) for part in (released.year, released.month, released.day)
            )
            alb_obj.save_lyrics(alb_date + " " + alb_name, overwrite=True)
            print('This album has been successfully loaded:', alb_name)
    finally:
        # Always restore the working directory, even when a lookup or a save
        # fails part-way through, so later relative paths still resolve.
        os.chdir(ori_path)

In [2]:
def album_list_maker(album_json_list, artist):
    '''build, for every album, a list of per-song dictionaries that include the lyrics

    Args:
        album_json_list -- iterable of album dictionaries; each one is read
                           through its 'name' key and its 'tracks' list, whose
                           items expose ['song']['title']
        artist          -- artist name handed to ``genius.search_song``

    Return:
        A list with one inner list per album. Each inner list holds
        dictionaries with the keys 'album_title', 'song_title' and 'lyrics'.
        Songs for which the search returns nothing are reported and skipped.
    '''
    final_album_list = list()
    for album_json in album_json_list:
        # Read the title once per album so the message below is correct even
        # when the album has no tracks.
        album_title = album_json['name']
        album_list = list()
        for track in album_json['tracks']:
            song_title = track['song']['title']
            song = genius.search_song(song_title, artist)
            if not song:
                # Same guard as chart_list_maker: a failed search must not
                # abort the whole run with an AttributeError.
                print("Lyrics could not be found for this song:", song_title)
                continue

            album_list.append({
                'album_title': album_title,
                'song_title': song_title,
                'lyrics': song.lyrics.strip(),
            })
        final_album_list.append(album_list)
        print("The album was successfully added to the album list:", album_title)

    return final_album_list

In [3]:
def load_bbt100_json(year, dir_path, ori_path):
    '''scrape one Billboard Year-End Hot 100 table from Wikipedia and save it as JSON

    The first table of the page is read row by row (the first row is skipped
    as the header); every remaining row contributes its rank, title and
    artist. The result is written to
    'data/mainstream_music_bbt100/<year>_top100_chart_data.json', relative to
    the current working directory.

    Args:
        year     -- chart year, as a string or an integer
        dir_path -- not used by this function (kept for existing callers)
        ori_path -- not used by this function (kept for existing callers)

    Return:
        None
    '''
    # Accept 2019 as well as '2019'; the URL is built by string concatenation.
    year = str(year)
    chart_url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + year

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status surfaces a missing page as an HTTP error instead of a
    # confusing parsing failure further down.
    resp = requests.get(chart_url, timeout=30)
    resp.raise_for_status()
    html = resp.text

    doc = BeautifulSoup(html, 'html.parser')
    table = doc.find('table')
    table_rows = table.find('tbody').find_all('tr')

    year_chart = list()

    # table_rows[0] is skipped as the header row.
    for table_row in table_rows[1:]:
        row_info = table_row.get_text().strip().split('\n')
        rank = row_info[0]
        title = row_info[1].strip('""')
        artist = row_info[2]

        each_song = {
            'rank': rank,
            'title': title,
            'artist': artist
        }

        year_chart.append(each_song)

    int_year = int(year)
    fname = 'data/mainstream_music_bbt100/{}_top100_chart_data.json'.format(int_year)
    with open(fname, 'w', encoding='utf-8') as out:
        out.write(json.dumps(year_chart, indent=4))

    print("This function is done!", '{}_top100_chart_data.json was made'.format(int_year))
    

In [4]:
def chart_list_maker(chart_json):
    '''look up the lyrics of every song of one chart with the module-level ``genius`` client

    Args:
        chart_json -- iterable of song dictionaries with the keys 'rank',
                      'title' and 'artist'
                      NOTE(review): the original loop header could not run, so
                      the intended input shape is inferred from how each item
                      is read; confirm against the callers.

    Return:
        A list holding one year chart: a list of dictionaries with the keys
        'rank', 'title', 'artist' and 'lyrics'. Songs for which the search
        returns nothing are left out and reported on standard output.
    '''
    top_chart_final = list()
    year_chart = list()
    error_processing = list()
    # Iterate the chart itself; the previous header indexed chart_json with
    # the loop variable before it existed and always raised.
    for index, item in enumerate(chart_json):
        song = genius.search_song(item['title'], item['artist'])
        if song:
            song_dict = {
                'rank' : item['rank'],
                'title' : item['title'],
                'artist' : item['artist'],
                'lyrics' : song.lyrics
            }
            year_chart.append(song_dict)
        else:
            error_dict = {
                'rank': item['rank'],
                'title': item['title'],
                'artist': item['artist'],
                'error_number' : index
            }
            error_processing.append(error_dict)

    # Add the finished chart once; appending inside the loop stored the same
    # list object once per song.
    top_chart_final.append(year_chart)

    if error_processing:
        # The failures are not part of the return value, so make them visible.
        print("Lyrics could not be found for these songs:", error_processing)

    return top_chart_final

In [5]:
def get_cleaned_song_all(all_album, token_ctr, song_freq_ctr, ngram_number):
    '''tokenize every song, turn the tokens into n-grams and tally them

    Args:
        all_album     -- iterable of lyrics strings, one per song
        token_ctr     -- object whose ``update`` receives every n-gram of
                         every song (presumably a Counter)
        song_freq_ctr -- object whose ``update`` receives each song's set of
                         distinct n-grams (presumably a Counter)
        ngram_number  -- window size handed to ``get_ngram_tokens``

    Return:
        One flat list with the n-grams of all songs, in song order
    '''
    collected = []
    for raw_lyrics in all_album:
        words = tokenize(
            raw_lyrics.strip('\n'),
            lowercase=True,
            strip_chars='().[]!,"-',
        )
        grams = get_ngram_tokens(words, n=ngram_number)
        collected += grams
        # Total occurrences across all songs.
        token_ctr.update(grams)
        # Each n-gram counts once per song here.
        song_freq_ctr.update(set(grams))

    return collected

In [6]:
def get_cleaned_album_all(album, token_ctr, song_freq_ctr, ngram_number):
    '''tokenize the songs of one album, turn the tokens into n-grams and tally them

    Punctuation is removed according to the module-level
    ``characters_to_strip`` string.

    Args:
        album         -- iterable of lyrics strings, one per song
        token_ctr     -- object whose ``update`` receives every n-gram of
                         every song (presumably a Counter)
        song_freq_ctr -- object whose ``update`` receives each song's set of
                         distinct n-grams (presumably a Counter)
        ngram_number  -- window size handed to ``get_ngram_tokens``

    Return:
        One flat list with the n-grams of all songs, in song order
    '''
    album_ngrams = []
    for lyrics_text in album:
        trimmed = lyrics_text.strip('\n')
        song_ngrams = get_ngram_tokens(
            tokenize(trimmed, lowercase=True, strip_chars=characters_to_strip),
            n=ngram_number,
        )
        album_ngrams.extend(song_ngrams)
        token_ctr.update(song_ngrams)
        # A set, so that an n-gram is counted once per song.
        distinct_ngrams = set(song_ngrams)
        song_freq_ctr.update(distinct_ngrams)

    return album_ngrams

In [7]:
def get_cleaned_chart_tokens(year_chart, token_ctr, song_freq_ctr, ngram_number):
    '''tokenize the lyrics of every chart entry, turn the tokens into n-grams and tally them

    Punctuation is removed according to the module-level
    ``characters_to_strip`` string.

    Args:
        year_chart    -- iterable of song dictionaries; the text is read from
                         each dictionary's 'lyrics' key
        token_ctr     -- object whose ``update`` receives every n-gram of
                         every song (presumably a Counter)
        song_freq_ctr -- object whose ``update`` receives each song's set of
                         distinct n-grams (presumably a Counter)
        ngram_number  -- window size handed to ``get_ngram_tokens``

    Return:
        One flat list with the n-grams of all chart entries, in chart order
    '''
    chart_ngrams = []
    for entry in year_chart:
        trimmed = entry['lyrics'].strip('\n')
        words = tokenize(trimmed, lowercase=True, strip_chars=characters_to_strip)
        entry_ngrams = get_ngram_tokens(words, n=ngram_number)
        chart_ngrams += entry_ngrams
        token_ctr.update(entry_ngrams)
        # Each n-gram counts once per song here.
        song_freq_ctr.update(set(entry_ngrams))

    return chart_ngrams

In [8]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''split a string into whitespace-separated tokens, with optional normalization

    Args:
        text        -- the string to tokenize
        lowercase   -- when true, the text is lowercased first (default: False)
        strip_chars -- characters to delete from the text before splitting,
                       e.g. punctuation (default: empty string)

    Return:
        A list of tokens
    '''
    # Translation table that maps every character of strip_chars to "deleted".
    removal_table = str.maketrans('', '', strip_chars)
    normalized = text.lower() if lowercase else text
    return normalized.translate(removal_table).split()

In [9]:
def get_ngram_tokens(tokens, n=1):
    '''slide a window of n tokens over a token list and join each window into one string

    Args:
        tokens -- a list of tokens
        n      -- the window size (default: 1)

    Returns:
        A list of n-gram strings whose n tokens are separated by single
        spaces. When n is below 2 or larger than the number of tokens, the
        input list itself is returned unchanged.
    '''
    if n < 2 or n > len(tokens):
        return tokens

    window_starts = range(len(tokens) - n + 1)
    return [" ".join(tokens[start:start + n]) for start in window_starts]

In [12]:
def counter_without_stopwords_list(alb_ctr):
    '''list the entries of a counter that are not English stop words, most frequent first

    Args:
        alb_ctr -- mapping from word to count (anything with ``items()``)

    Return:
        A list of (word, count) tuples sorted by count in descending order,
        without the words found in ``stopwords.words('english')``
    '''
    # Build the stop-word lookup once; fetching the list again for every
    # entry and scanning it linearly made the filter needlessly slow.
    stop_words = set(stopwords.words('english'))

    clean_list = [
        (word, num) for word, num in alb_ctr.items() if word not in stop_words
    ]
    clean_list.sort(key=lambda x: x[1], reverse=True)

    return clean_list

In [27]:
# Display the English stop-word list that counter_without_stopwords_list filters against.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each