# Word Count Analysis

In [4]:
#import libraries
import pandas as pd
from collections import Counter

In [2]:
#load the three saved CSV files into separate dataframes
#NOTE(review): the df_cty preview shows an 'Unnamed: 0' column, so the CSV appears
#to have been written with its index; consider index_col=0 -- confirm no later cell uses it
#NOTE(review): the 'words' column is displayed as the text of a list; presumably it is
#read back as a string and must be parsed into a real list before word counting -- verify
df_cty = pd.read_csv('df_cty_final.csv')
df_rb = pd.read_csv('df_rb_final.csv')
df_rock = pd.read_csv('df_rock_final.csv')

In [3]:
#preview the first five rows of df_cty
df_cty.head(n=5)

Unnamed: 0,track,artist,track_id,release_date,dance,energy,loud,speech,acoust,live,valence,tempo,sim_score,lyrics,words
0,Forever After All,Luke Combs,6IBcOGPsniK3Pso1wHIhew,2020-10-23,0.487,0.65,-5.195,0.0253,0.191,0.0933,0.456,151.964,-37.173907,[Verse 1]\nA cold beer's got twelve ounces\nA ...,"['cold', 'beer', 'got', 'twelve', 'ounce', 'go..."
1,The Good Ones,Gabby Barrett,3hLuHKzG1cmlRpq53ZVWd8,2020-06-19,0.519,0.552,-5.023,0.0259,0.18,0.149,0.331,89.957,-22.853968,[Verse 1]\nHe's a phone call to his parents\nH...,"['phone', 'call', 'parent', 'bible', 'bed', 't..."
2,Made For You,Jake Owen,7vF3xkCMvZjAe2nTWY0uQZ,2019-03-29,0.581,0.441,-6.829,0.0268,0.77,0.111,0.337,82.125,-30.690424,[Verse 1]\nWater towers are made for hearts an...,"['water', 'tower', 'made', 'heart', 'name', 'f..."
3,Breaking Up Was Easy In The 90's,Sam Hunt,4sf2L157iEgAR7yrCNLgSq,2020-04-03,0.562,0.649,-5.4,0.0494,0.231,0.341,0.376,145.913,-31.120918,"[Intro]\nYeah, man, oh man, oh man\n\n[Verse 1...","['yeah', 'man', 'oh', 'man', 'oh', 'man', 'bar..."
4,Singles You Up - Ryan Riback Remix,Jordan Davis,3clfDJ1mvmbSTg9jw40wan,2018-08-30,0.644,0.817,-6.118,0.0421,0.00363,0.193,0.656,115.94,-1.203636,[Verse 1]\nI ain't heard you laugh like that i...,"['heard', 'laugh', 'like', 'long', 'time', 'wo..."


### Functions

In [None]:
def word_count(word_list):
    '''
    Build a word-frequency table over a collection of songs.

    parameters:
        word_list-->list of word lists for each song in dataframe
                    (each element must be an iterable of word tokens; a plain
                    string would be counted character by character)
    returns:
        dataframe sorted by rank with one row per distinct word and columns:
            word           --> the word
            count          --> number of times the word appears in all songs
            appears_in     --> number of songs in which the word appears
            rank           --> rank by count (1 = most frequent, ties broken
                               by order of first appearance)
            pct_total      --> count divided by the total word count
            cul_pct_total  --> running sum of pct_total in rank order
            appears_in_pct --> appears_in divided by the number of songs
    '''
    #initiate variables
    word_counts = Counter() #number of times a word appears in all songs
    appears_in = Counter() #number of songs in which a word appears
    total_docs = len(word_list) #total songs

    #iterate through lists of words in each song to update variables
    for song_words in word_list:
        word_counts.update(song_words)
        #set() removes repeats so each song adds at most 1 per word
        appears_in.update(set(song_words))

    #create new dataframe with word, count, and appears_in columns
    #both counters are looked up by word: appears_in is filled from sets, so its
    #own ordering does not match word_counts and its values() cannot be zipped in
    words = list(word_counts)
    df = pd.DataFrame(data={'word': words,
                            'count': [word_counts[w] for w in words],
                            'appears_in': [appears_in[w] for w in words]})

    #add rank column (based on word count)
    df['rank'] = df['count'].rank(method='first', ascending=False)

    #add percent total column (based on total word count)
    total = df['count'].sum()
    df['pct_total'] = df['count'] / total

    #add cumulative percent total column (must be sorted by rank first)
    df = df.sort_values(by='rank')
    df['cul_pct_total'] = df['pct_total'].cumsum()

    #add appears in percent column
    df['appears_in_pct'] = df['appears_in'] / total_docs

    #already sorted by rank above
    return df