# Ngram Analysis

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from py4tfidf.vectorizer import Tfidf

#### Importing data

In [3]:
# Load the per-genre lyric tables. Missing cells are replaced with the
# literal string 'NA' so every cell can later be handled as text.
df_cty, df_rock, df_rb = (
    pd.read_csv(csv_path).fillna('NA')
    for csv_path in (
        '../Data/df_for_cty_ngrams.csv',
        '../Data/df_for_rock_ngrams.csv',
        # NOTE(review): unlike the two paths above this one has no '.csv'
        # suffix -- confirm the file on disk is really named this way.
        '../Data/df_for_rb_ngrams',
    )
)

## Cleaning data

In [46]:
# Runs of non-whitespace characters that start and end on a word boundary.
_TOKEN_RE = re.compile(r'\b\S+\b')


def remove_stop(line):
    """Tokenize *line* and scrub escaped-unicode residue from each token.

    Despite the name, no stop-word filtering is performed: every token
    matched by the pattern is kept. Each token is lower-cased, a literal
    backslash-u is turned into a space, the substrings "205f" and "2005"
    are deleted, and any remaining backslashes are removed.

    Args:
        line: The text to tokenize.

    Returns:
        A list of cleaned tokens, in order of appearance. A token may
        contain a space or be empty after the clean-up.
    """
    cleaned = []
    for match in _TOKEN_RE.finditer(line):
        word = match.group().lower()
        # Order matters: the backslash-u marker is handled first, then the
        # leftover code-point digits, then stray backslashes.
        word = word.replace('\\u', ' ')
        for residue in ("205f", "2005", "\\"):
            word = word.replace(residue, '')
        cleaned.append(word)
    return cleaned

In [165]:
def get_cleaned_lines(df):
    """Tokenize every cell of *df*, column by column.

    Leading and trailing '[' / ']' characters are stripped from each cell
    before it is passed to ``remove_stop``.

    Args:
        df: DataFrame whose cells are strings.

    Returns:
        A list with one token list per cell, ordered by column and then by
        row within each column.
    """
    return [
        remove_stop(cell.strip(']['))
        for col_name in df
        for cell in df[col_name]
    ]

## TfidfVectorizer

- TF-IDF stands for term frequency × inverse document frequency
- It helps standardize scores when a word/phrase shows up a lot in one document, which would otherwise inflate its frequency and make it seem more important than it is
- Ngram range (3,5): I looked at ngrams of size 3,4,5
- Sorted by tfidf value

In [456]:
def final_output(words, *, ngram_range=(3, 5), min_df=5, top_n=5):
    """Rank the highest-scoring n-grams of each length in a tokenized corpus.

    Every entry of *words* is treated as one document. For each n-gram that
    occurs in at least *min_df* documents, the score is its total number of
    occurrences in the corpus multiplied by its inverse document frequency.

    Args:
        words: Iterable of token lists, one list per document (for example
            the output of ``get_cleaned_lines``).
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract, inclusive.
        min_df: Minimum number of documents an n-gram must appear in.
        top_n: Number of rows kept for each n-gram size.

    Returns:
        A DataFrame with columns ``phrase``, ``idf``, ``counts``, ``tfidf``
        and ``length`` holding up to *top_n* rows per n-gram size, ordered
        by size and then by descending ``tfidf``. The index keeps the row
        positions from the full phrase table.

    Raises:
        ValueError: Propagated from scikit-learn when no n-gram survives
            the *min_df* filter (e.g. an empty or very small corpus).
    """
    # Tokens may contain embedded spaces; joining with a single space gives
    # the same text as splitting each token on ' ' first and re-joining.
    text = [" ".join(song) for song in words]

    vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)
    vectorizer.fit(text)

    # vocabulary_ maps phrase -> column index and idf_ is ordered by that
    # column index, so the phrases must be sorted by index before pairing
    # them with idf_ (dict order is NOT index order).
    vocabulary = vectorizer.vocabulary_
    phrases = sorted(vocabulary, key=vocabulary.get)

    # Real occurrence counts: re-use the fitted vocabulary so the count
    # matrix columns line up with `phrases` and `idf_`.
    counter = CountVectorizer(ngram_range=ngram_range, vocabulary=vocabulary)
    counts = np.asarray(counter.transform(text).sum(axis=0)).ravel()

    data = pd.DataFrame({
        'phrase': phrases,
        'idf': vectorizer.idf_,
        'counts': counts,
    })
    data['tfidf'] = data.idf * data.counts
    data['length'] = [len(phrase.split(' ')) for phrase in data.phrase]

    smallest, largest = ngram_range
    top_per_length = [
        data[data.length == size]
        .sort_values('tfidf', ascending=False)
        .head(top_n)
        for size in range(smallest, largest + 1)
    ]
    return pd.concat(top_per_length)

### RB output for Viz

In [465]:
# Song sections in the order they are analysed. 'Chorus' is listed three
# times, so its n-grams are computed and emitted once per occurrence.
features_for_analysis = ['Intro', 'Chorus' ,'Verse 1', 'Chorus', 'Verse 2' ,'Chorus']

# The empty seed frame stays first so the output column order starts with
# phrase / idf / tfidf / length.
rb_frames = [pd.DataFrame(columns = ['phrase','idf','tfidf','length'])]
for song_part in features_for_analysis:
    part_ngrams = final_output(get_cleaned_lines(df_rb[[song_part]]))
    part_ngrams['Song_Part'] = song_part
    rb_frames.append(part_ngrams)
df_rb_last = pd.concat(rb_frames).reset_index(drop = True)

### Rock output for Viz

In [471]:
# Song sections in the order they are analysed. Repeated entries
# ('Pre-Chorus', 'Chorus') are computed and emitted once per occurrence.
features_for_analysis =['Verse 1', 'Pre-Chorus' ,'Chorus', 'Verse 2' ,'Pre-Chorus' ,'Chorus' ,'Bridge' ,'Chorus']

# The empty seed frame stays first so the output column order starts with
# phrase / idf / tfidf / length.
rock_frames = [pd.DataFrame(columns = ['phrase','idf','tfidf','length'])]
for song_part in features_for_analysis:
    part_ngrams = final_output(get_cleaned_lines(df_rock[[song_part]]))
    part_ngrams['Song_Part'] = song_part
    rock_frames.append(part_ngrams)
df_rock_last = pd.concat(rock_frames).reset_index(drop = True)

### Country output for Viz

In [466]:
# Song sections in the order they are analysed. 'Chorus' is listed three
# times, so its n-grams are computed and emitted once per occurrence.
features_for_analysis = ['Verse 1' ,'Chorus', 'Verse 2' ,'Chorus', 'Bridge' ,'Chorus']

# The empty seed frame stays first so the output column order starts with
# phrase / idf / tfidf / length.
cty_frames = [pd.DataFrame(columns = ['phrase','idf','tfidf','length'])]
for song_part in features_for_analysis:
    part_ngrams = final_output(get_cleaned_lines(df_cty[[song_part]]))
    part_ngrams['Song_Part'] = song_part
    cty_frames.append(part_ngrams)
df_cty_last = pd.concat(cty_frames).reset_index(drop = True)

#### Exporting to CSV

In [None]:
# Write each genre's n-gram table to disk without the row index.
for genre_frame, csv_target in (
    (df_rb_last, '../Data/ngrams_df_rb.csv'),
    (df_rock_last, '../Data/ngrams_df_rock.csv'),
    (df_cty_last, '../Data/ngrams_df_cty.csv'),
):
    genre_frame.to_csv(csv_target, index = False)