# Word Count Analysis

In [1]:
#import libraries
import pandas as pd
from collections import Counter
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import altair as alt

In [2]:
#import dataframes
df_cty = pd.read_csv('df_cty_final.csv',converters={'words':eval})
df_rb = pd.read_csv('df_rb_final.csv',converters={'words':eval})
df_rock = pd.read_csv('df_rock_final.csv',converters={'words':eval})

### Functions

In [3]:
def word_count(word_list):
    '''
    ***REPLACED WITH TF-IDF CALCULATIONS (see below functions)***
    function returns dataframe with word count information
    parameter:
        word_list-->list of word lists for each song in dataframe
    '''
    #initiate Counter object
    word_counts = Counter() #number of times a word appears in all songs
    
    #iterate through lists of words in each song to update variable
    for l in word_list:
        word_counts.update(l)
    
    #create new dataframe with word and count columns
    df = pd.DataFrame(data={'word':word_counts.keys(),
                            'count':word_counts.values()})
    
    #add percent total column (based on total word count)
    df['count_pct'] = df['count'].apply(lambda x: x/df['count'].sum())
    
    return df.sort_values(by='count',ascending=False)

In [57]:
def calc_tf(song_list):
    '''
    function returns dataframe with term frequencies for song list
    parameter:
        song_list-->list of lists of words in each song
            *['words'] column in genre dataframe
    '''
    corpus = set([word for song in song_list for word in song]) #set of all words in dataframe
    
    dict_list = [] #variable to hold dictionaries of word counts for each song
    
    #create dictionary with term frequencies for each song
    for song in song_list:
        d = dict.fromkeys(corpus,0)
        for word in song:
            d[word]+=1 #update word count
        d = {k:v/len(song) for k,v in d.items()} #normalize word counts
        dict_list.append(d) #add dictionary to dict_list
    
    #create dataframe with term frequencies
    df=pd.DataFrame(dict_list)
    
    return df

In [None]:
def calc_idf(x):
    return None

In [58]:
calc_tf(df_cty['words'])['cold']

0       0.006211
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
1010    0.000000
1011    0.000000
1012    0.004651
1013    0.000000
1014    0.000000
Name: cold, Length: 1015, dtype: float64

In [37]:
x['cold']

0       1
1       0
2       0
3       0
4       0
       ..
1010    0
1011    0
1012    1
1013    0
1014    0
Name: cold, Length: 1015, dtype: int64

In [5]:
def remove_num(text):
    '''
    function returns text with numbers removed
    parameter:
        text-->str
    '''
    for word in text:
        try:
            int(word)
            text.remove(word)
        except:
            pass
    return text

### Create Dataframes with Word Counts

In [6]:
#remove numbers from dataframs
df_cty['words'] = df_cty['words'].apply(remove_num)
df_rb['words'] = df_rb['words'].apply(remove_num)
df_rock['words'] = df_rock['words'].apply(remove_num)

In [8]:
#country songs - top 5 words
df_cty_count.head(10)

Unnamed: 0,word,count,count_pct
52,like,3485,0.011275
78,know,2785,0.009011
77,yeah,2538,0.008212
51,love,2331,0.007542
82,one,2194,0.007099
2,got,1975,0.00639
11,get,1872,0.006057
113,oh,1790,0.005791
29,time,1729,0.005594
206,go,1597,0.005167


In [14]:
def norm_count(df,other_dfs):
    '''
    function adds column with normalized word count
        *uses TF-IDF
        *# of times word appears in this df/# of dfs in which word appears
    parameters:
        df-->dataframe to amend
        other_dfs-->list of dataframes for comparison
    '''
    norm_list = [] #initiate list for normalized word counts
    
    #divide word count by # of dataframes in which word appears
    for i in range(len(df)):
        count=1
        for d in other_dfs:
            count+=len(d[d['word']==df.iloc[i]['word']])
            #for j in range(len(d)):
             #   if df.iloc[i]['word'] == d.iloc[j]['word']:
                    #count+=1
            #if d['word'].str.contains('Mel').any()
            
            #
        norm_list.append(df.iloc[i]['count']/count)
        
    df['norm_count'] = norm_list
    
    return df

In [15]:
df_cty_count = norm_count(df_cty_count,[df_rb_count,df_rock_count])

In [17]:
df_cty_count.sort_values('norm_count',ascending=False)

Unnamed: 0,word,count,count_pct,norm_count
52,like,3485,0.011275,1161.666667
78,know,2785,0.009011,928.333333
77,yeah,2538,0.008212,846.000000
51,love,2331,0.007542,777.000000
82,one,2194,0.007099,731.333333
...,...,...,...,...
16005,tiffany,1,0.000003,0.333333
15171,gnash,1,0.000003,0.333333
26220,84,1,0.000003,0.333333
16011,maximillian,1,0.000003,0.333333


### Generate Word Clouds

In [None]:
#initialize word cloud generator
wc = WordCloud(max_words=100,background_color='white')

In [None]:
#country


#word cloud for country songs
#dictionary to generate word cloud
cty_cloud_data = dict(zip(df_cty_count['word'].tolist(), df_cty_count['count'].tolist()))

#generate word cloud
plt.figure(figsize=(10,8))
#cty_cloud = wc.generate_from_frequencies(cty_cloud_data)
plt.imshow(wc.generate_from_frequencies(cty_cloud_data),interpolation="bilinear")
plt.axis("off")
plt.show()
#plt.savefig('my_fig.png')
#plt.close()


### Generate Histograms of Top 10 Words

In [None]:
plt.figure(figsize=(8,8))
cty_bar = plt.bar(df_cty_count.iloc[0:10]['word'],df_cty_count.iloc[0:10]['count'])

In [None]:
#plt.figure(figsize = (12, 8))
#plt.imshow(x, interpolation="bilinear")
#plt.axis("off")
#plt.show()

### Save Images to File

In [None]:

rb_cloud_data = dict(zip(df_rb_count['word'].tolist(), df_rb_count['count'].tolist()))

#generate word cloud
plt.figure(figsize=(10,8))
wc.generate_from_frequencies(rb_cloud_data)
plt.imshow(x, interpolation="bilinear")
plt.axis("off")
#plt.savefig('my_fig.png')
plt.close()