# Lyrics classifier

## Load packages

In [None]:
#load "Paket.fsx"

Paket.Package 
  ["XPlot.Plotly"
   "MathNet.Numerics"
   "MathNet.Numerics.FSharp"
   "FSharp.Data"
   "Microsoft.ML"]
   
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
#load "Paket.Generated.Refs.fsx"

## Namespaces import and type definition

In [None]:
open System
open System.Linq
open System.IO
open MathNet.Numerics
open MathNet.Numerics.Distributions
open MathNet.Numerics.LinearAlgebra
open MathNet.Numerics.Random
open FSharp.Data



/// One song from the lyrics dataset.
/// Instances are built from the CSV columns "song", "artist", "genre",
/// "lyrics" and "year", and are later handed to ML.NET through
/// `mlContext.Data.LoadFromEnumerable`.
/// `CLIMutable` makes the F# compiler emit a default constructor and property
/// setters so that .NET libraries can instantiate and populate the record.
[<CLIMutable>]
type LyricsInput = 
    {
        // Value of the "song" column.
        Song : string
        // Value of the "artist" column.
        Artist : string
        // Value of the "genre" column (compared against labels such as "Rock", "Pop").
        Genre : string
        // Value of the "lyrics" column with line breaks replaced by ", ".
        Lyrics : string
        // Value of the "year" column converted to an integer.
        Year: int
    }
    

## Data cleanup

In [3]:
    // Location of the lyrics dataset, relative to the current working directory.
    let trainDataPath = Path.Combine("../","Data","lyrics.csv")   
    // File.OpenRead asks for read-only access and lets other readers share the
    // file. File.Open(path, FileMode.Open) would request read/write access with
    // no sharing, which fails on read-only files and can make a re-run of this
    // cell fail while the previous (never closed) stream is still alive.
    let msft = CsvFile.Load(File.OpenRead(trainDataPath), separators = ",", quote = '"', hasHeaders = true)

    // Rows with a non-empty "lyrics" column, mapped to LyricsInput records.
    // The sequence is lazy: nothing is parsed until it is enumerated.
    // Note: converting "year" with `int` raises if the column is not numeric.
    let songLyrics =
        msft.Rows
        |> Seq.filter (fun row -> not (row.GetColumn "lyrics" |> String.IsNullOrEmpty))
        |> Seq.map (fun row ->
            { Song = row.GetColumn "song"
              Artist = row.GetColumn "artist"
              Genre = row.GetColumn "genre"
              // Normalise Windows line endings first so the result does not
              // depend on the platform's Environment.NewLine: every line
              // break, "\r\n" or "\n", becomes ", ".
              Lyrics = (row.GetColumn "lyrics").Replace("\r\n", "\n").Replace("\n", ", ")
              Year = row.GetColumn "year" |> int })
                              

### Visualize cleaned data

In [4]:
// Print the first rows of the cleaned dataset as a quick sanity check.
// Seq.truncate yields at most 10 rows, whereas Seq.take would raise
// InvalidOperationException if the dataset held fewer than 10 rows.
songLyrics
    |> Seq.truncate 10
    |> Seq.iter (fun row -> Console.WriteLine(row))
       

{Song = "ego-remix";
 Artist = "beyonce-knowles";
 Genre = "Pop";
 Lyrics =
  "Oh baby, how you doing?, You know I'm gonna cut right to the chase, Some women were made but me, myself, I like to think that I was created for a special purpose, You know, what's more special than you? You feel me, It's on baby, let's get lost, You don't need to call into work 'cause you're the boss, For real, want you to show me how you feel, I consider myself lucky, that's a big deal, Why? Well, you got the key to my heart, But you ain't gonna need it, I'd rather you open up my body, And show me secrets, you didn't know was inside, No need for me to lie, It's too big, it's too wide, It's too strong, it won't fit, It's too much, it's too tough, He talk like this 'cause he can back it up, He got a big ego, such a huge ego, I love his big ego, it's too much, He walk like this 'cause he can back it up, Usually I'm humble, right now I don't choose, You can leave with me or you could have the blues, Some call i

## Chart analysis

In [5]:
open XPlot.Plotly

// Pie chart of how many songs the dataset holds for each genre.
songLyrics
|> Seq.countBy (fun song -> song.Genre)
|> List.ofSeq
|> Chart.Pie
|> Chart.WithTitle "Dataset by Genre"
|> Chart.WithLegend true

In [6]:
open XPlot.Plotly

// Pie chart of how many songs the dataset holds for each release year.
songLyrics
|> Seq.countBy (fun song -> song.Year)
|> List.ofSeq
|> Chart.Pie
|> Chart.WithTitle "Dataset by Year"
|> Chart.WithLegend true

In [9]:
open Microsoft.ML
open Microsoft.ML
open Microsoft.ML.Data
open Microsoft.ML.Transforms.Text

// Custom stop words passed to RemoveStopWords in tokenizeLyrics; ML.NET's
// default stop-word list is applied afterwards as a separate step.
// The list holds a few repeated entries ("s", "t").
// NOTE(review): "ä±" looks like a mis-encoded character, possibly mirroring how
// the token appears in the dataset - confirm before changing it.
let stopwords = [|"ourselves"; "hers"; "between"; "yourself"; "but"; "again"; "there"; "about"; "once"; "during"; "out"; "very"; "having"; "with"; "they"; "own"; "an"; "be"; "some"; "for"; "do"; "its"; "yours"; "such"; "into"; "of"; "most"; "itself"; "other"; "off"; "is"; "s"; "am"; "or"; "who"; "as"; "from"; "him"; "each"; "the"; "themselves"; "until"; "below"; "are"; "we"; "these"; "your"; "his"; "through"; "don"; "nor"; "me"; "were"; "her"; "more"; "himself"; "this"; "down"; "should"; "our"; "their"; "while"; "above"; "both"; "up"; "to"; "ours"; "had"; "she"; "all"; "no"; "when"; "at"; "any"; "before"; "them"; "same"; "and"; "been"; "have"; "in"; "will"; "on"; "does"; "yourselves"; "then"; "that"; "because"; "what"; "over"; "why"; "so"; "can"; "did"; "not"; "now"; "under"; "he"; "you"; "herself"; "has"; "just"; "where"; "too"; "only"; "myself"; "which"; "those"; "i"; "after"; "few"; "whom"; "t";"ll"; "being"; "if"; "theirs"; "my"; "against"; "a"; "by"; "doing"; "it"; "how"; "further"; "was"; "here"; "than"; "s"; "t"; "m"; "'re"; "'ll";"ve";"..."; "ä±"; "''"; "``"; "--"; "'d"; "el"; "la"; "que"; "y"; "de"; "en"|]
// Separator characters handed to TokenizeWords: apostrophe, space and comma.
let symbols = [|'\''; ' '; ','|]
            
/// Draws a line chart of the most frequent words in `words`.
/// Words are counted, ordered by descending count, and at most the top 15
/// (word, count) pairs are plotted.
/// Returns the chart produced by `Chart.Line`.
let renderLineChartForWords(words: seq<string>) = 
    words
    |> Seq.countBy id
    |> Seq.sortByDescending snd
    // Seq.truncate instead of Seq.take: Seq.take raises
    // InvalidOperationException when there are fewer than 15 distinct words
    // (for example when a genre filter matches no songs).
    |> Seq.truncate 15
    |> Chart.Line
                        
/// Runs the given songs through an ML.NET text pipeline and returns all
/// remaining tokens of all songs as one flat list.
/// Pipeline steps, in order: featurize "Lyrics", normalize "Lyrics", split the
/// normalized text on `symbols`, drop the custom `stopwords`, then drop
/// ML.NET's default stop words.
let tokenizeLyrics (lyrics: seq<LyricsInput>) =
    // The context is created with a fixed seed of 0.
    let mlContext = MLContext(seed = Nullable 0)
    let textCatalog = mlContext.Transforms.Text
    let input = mlContext.Data.LoadFromEnumerable lyrics

    // NOTE(review): the "FeaturizedLyrics" column is produced but not read
    // anywhere in this function; it is kept so the chain stays as it was.
    let estimator =
        textCatalog
            .FeaturizeText("FeaturizedLyrics", "Lyrics")
            .Append(textCatalog.NormalizeText("NormalizedLyrics", "Lyrics"))
            .Append(textCatalog.TokenizeWords("TokenizedLyric", "NormalizedLyrics", symbols))
            .Append(textCatalog.RemoveStopWords("LyricsWithNoCustomStopWords", "TokenizedLyric", stopwords))
            .Append(textCatalog.RemoveDefaultStopWords("LyricsWithNoStopWords", "LyricsWithNoCustomStopWords"))

    // Fit on the same data that is then transformed.
    let model = estimator.Fit(input)
    let transformed = model.Transform(input)

    // Each row holds an array of tokens; flatten them into one list.
    transformed.GetColumn<string[]>(mlContext, "LyricsWithNoStopWords")
    |> Seq.collect id
    |> List.ofSeq


### Words by genre - Rock

In [10]:
// Keep only the songs labelled "Rock", then chart their most frequent words.
let filteredLyrics =
    songLyrics
    |> Seq.filter (fun song -> song.Genre = "Rock")

filteredLyrics
|> tokenizeLyrics
|> renderLineChartForWords

### Words by genre - Hip-Hop

In [11]:
// Keep only the songs labelled "Hip-Hop", then chart their most frequent words.
let filteredLyrics =
    songLyrics
    |> Seq.filter (fun song -> song.Genre = "Hip-Hop")

filteredLyrics
|> tokenizeLyrics
|> renderLineChartForWords

### Words by genre - Pop

In [12]:
// Keep only the songs labelled "Pop", then chart their most frequent words.
let filteredLyrics =
    songLyrics
    |> Seq.filter (fun song -> song.Genre = "Pop")

filteredLyrics
|> tokenizeLyrics
|> renderLineChartForWords

### Words by genre - Jazz

In [None]:
// Keep only the songs labelled "Jazz", then chart their most frequent words.
let filteredLyrics =
    songLyrics
    |> Seq.filter (fun song -> song.Genre = "Jazz")

filteredLyrics
|> tokenizeLyrics
|> renderLineChartForWords