# IMP Book chapter

SAFE descriptors...

In [1]:
from SAFEData2ACM import *

import numpy as np
from matplotlib import colors
from tabulate import tabulate
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from collections import Counter
from operator import itemgetter
import os
import csv
%matplotlib inline                       
import matplotlib.pylab as pylab
import re




%load_ext autoreload
%autoreload 2

## Section 2a: get some general statistics from the dataset...

In [2]:
# init the class...
folder='data/'
safe = SAFEData(folder, removeJibberish=False)

# number of instances...
plugDist = [len([1 for i in safe.instances if i.effectType[0] == plugin]) for plugin in safe.plugInNames]
for i in range(len(safe.plugInNames)):
    print safe.plugInNames[i], ': ', plugDist[i]
print 'Total:', safe.numInstances, 'instances\n---'

# number of uniques terms and users...
print 'Unique Terms:', len(safe.uniqueTerms) 
print 'Total Users:', len(np.unique([i.ip for i in safe.instances]))
print 'Terms per User:', float(len(safe.uniqueTerms))/len(np.unique([i.ip for i in safe.instances])) 

Compressor :  454
Distortion :  303
EQ :  1679
Reverb :  258
Total: 2694 instances
---
Unique Terms: 623
Total Users: 263
Terms per User: 2.36882129278


## Section 2b: more statistics
- Then get the top N descriptors, ranked by numTerms, confidence, popularity and generality

In [3]:
# Load all the terms into descriptor objects, this currently takes around 30 seconds...
import time
t1 =time.time()
descriptors = safe.loadDescriptors(mute=True, )
t2 = time.time()
print '(finished in', t2-t1, 'seconds)'

# sort the terms in the dataset by their number of entries, confidence and generality scores...
terms = safe.uniqueTerms
entries = [d.numInstances for d in descriptors]
pluginDists = [d.getPlugInDistribution for d in descriptors]

# remove terms with less than T entries and sort
T = 4
entries_s = np.vstack([[terms[i], entries[i]] for i in sorted(range(0, len(terms)), key=lambda k: entries[k], reverse=True) if terms[i] and entries[i] > T])
entries_s = np.array([i for i in entries_s if 'test' not in i[0] and 'and' not in i[0] and 'the' not in i[0]]).transpose()

# calculate the distribution across the plugins for each term...
plugInNames = ['Compressor', 'Distortion', 'EQ', 'Reverb']
dist = np.array([[len([1 for inst in safe.instances if inst.effectType[0] == plugin and term in inst.terms]) for plugin in plugInNames] for term in entries_s[0]]).transpose()

M,N = np.shape(entries_s)

# print tabulate(np.vstack( [np.array(range(N))+1, entries_s[:N, :], dist[:N,:]]).transpose(), headers=['N', 'term', 'Total', 'Comp', 'Dist', 'EQ', 'Rev'], floatfmt=".3f", tablefmt="latex")
print tabulate(np.vstack( [entries_s[:N, :], dist[:N,:]]).transpose(), headers=['term', 'Total', 'Comp', 'Dist', 'EQ', 'Rev'], floatfmt=".3f", tablefmt="latex")
                                                                                

(finished in 25.2750160694 seconds)
\begin{tabular}{lrrrrr}
\hline
 term          &   Total &   Comp &   Dist &   EQ &   Rev \\
\hline
 wawrm         &     582 &      0 &      0 &    1 &     0 \\
 brighter      &     531 &      0 &      0 &    7 &     0 \\
 tezcher       &     350 &      0 &      1 &    0 &     0 \\
 pure          &      34 &      0 &      0 &    1 &     0 \\
 rose          &      33 &      0 &      0 &    0 &     1 \\
 air           &      31 &      0 &      0 &   18 &    13 \\
 crush         &      29 &      3 &      1 &    0 &     0 \\
 snap          &      22 &      0 &      0 &    2 &     0 \\
 voix          &      22 &      0 &      0 &    2 &     0 \\
 clinin        &      21 &      0 &      0 &    1 &     0 \\
 surgical      &      21 &      0 &      0 &    1 &     0 \\
 bass          &      20 &      3 &      4 &   13 &     0 \\
 gated         &      19 &      0 &      0 &    0 &     1 \\
 njatbet       &      18 &      1 &      0 &    0 &     0 \\
 gain      

## Get the data from SocialFX 
Descriptor data from the SocialFX study (Zheng, Seetharaman and Pardo).

In [5]:
def _split_descriptors(cells):
    """Flatten comma-separated descriptor cells into a single list of words.

    Each cell is split on ','. Empty strings and the placeholder
    "none of the above" are dropped. Words are kept exactly as written
    (no stripping, no case folding), in their original order.
    """
    skipped = ("", "none of the above")
    return [word for cell in cells for word in cell.split(',') if word not in skipped]


# Read in csv files. Each file is opened in a `with` block so its handle is
# closed as soon as the rows have been copied into memory.

# EQ words: the first column of every non-empty row.
# NOTE(review): csv.reader does not skip a header line, so if this file has one
# its first cell is counted as a descriptor -- confirm.
with open('data/raw/eq_contributions.csv') as file1:
    eq_reader = csv.reader(file1)
    eq_desc = [row[0] for row in eq_reader if row]

# Reverb words: the "rawwords" and "agreed" columns of every row.
reverb_rawwords = []
reverb_agreed = []
with open('data/raw/reverb_contributions.csv') as file2:
    rev_reader = csv.DictReader(file2)
    for row in rev_reader:
        reverb_rawwords.append(row["rawwords"])
        reverb_agreed.append(row["agreed"])

# Compression words: same two columns, plus the contributing user id.
comp_rawwords = []
comp_agreed = []
compuserid = []
with open('data/raw/comp_contributions.csv') as file3:
    comp_reader = csv.DictReader(file3)
    for row in comp_reader:
        comp_rawwords.append(row["rawwords"])
        comp_agreed.append(row["agreed"])
        compuserid.append(row["userid"])

# Flatten to one word list per effect: all raw words first, then all agreed words.
reverb_desc = _split_descriptors(reverb_rawwords) + _split_descriptors(reverb_agreed)
comp_desc = _split_descriptors(comp_rawwords) + _split_descriptors(comp_agreed)

# Totals and per-word frequencies for each effect.
eq_total = len(eq_desc)
rev_total = len(reverb_desc)
comp_total = len(comp_desc)
eq_count = dict(Counter(eq_desc))
rev_count = dict(Counter(reverb_desc))
comp_count = dict(Counter(comp_desc))


In [6]:
# Pool the descriptors of all three effects and list the distinct ones.
terms = comp_desc + eq_desc + reverb_desc
uTerms = np.unique(terms)

# For every distinct term, count the pooled descriptors that contain it.
# NOTE(review): `in` on strings is a substring test, so a short term is also
# counted inside longer ones (e.g. 'war' within 'warm') -- confirm this is intended.
numEntries = [sum(1 for entry in terms if candidate in entry) for candidate in uTerms]

In [7]:
#remove 2-letter words
T = 50
entries_s = np.vstack([[uTerms[i], numEntries[i]] for i in sorted(range(0, len(uTerms)), key=lambda k: numEntries[k], reverse=True) if uTerms[i] and numEntries[i] > T])
entries_s = np.array([i for i in entries_s if len(i[0]) > 2]).transpose()

compEntries = np.array([len([1 for term in comp_desc if uniqueTerm in term]) for uniqueTerm in entries_s[0]]).transpose()
eqEntries = np.array([len([1 for term in eq_desc if uniqueTerm in term]) for uniqueTerm in entries_s[0]]).transpose()
reverbEntries = np.array([len([1 for term in reverb_desc if uniqueTerm in term]) for uniqueTerm in entries_s[0]]).transpose()
pluginDists = np.vstack( (compEntries, eqEntries, reverbEntries))
M,N = np.shape(entries_s)
print tabulate(np.vstack([np.array(range(N))+1, entries_s[:N, :], pluginDists[:N,:]]).transpose(), headers=['N', 'term', 'Total', 'Comp', 'EQ', 'Rev'], floatfmt=".3f", tablefmt='latex')

\begin{tabular}{rlrrrr}
\hline
   N & term         &   Total &   Comp &   EQ &   Rev \\
\hline
   1 & echo         &    2396 &    118 &    0 &  2278 \\
   2 & loud         &    1308 &    261 &   21 &  1026 \\
   3 & tin          &    1212 &     89 &   28 &  1095 \\
   4 & low          &    1154 &     92 &   16 &  1046 \\
   5 & war          &    1137 &    147 &   60 &   930 \\
   6 & warm         &    1057 &    135 &   59 &   863 \\
   7 & church       &    1033 &      8 &    0 &  1025 \\
   8 & big          &     934 &     55 &    1 &   878 \\
   9 & spacious     &     855 &     62 &    0 &   793 \\
  10 & distant      &     848 &     29 &    2 &   817 \\
  11 & deep         &     787 &     31 &    6 &   750 \\
  12 & muffle       &     634 &     85 &    4 &   545 \\
  13 & muffled      &     623 &     81 &    4 &   538 \\
  14 & hall         &     584 &      7 &    0 &   577 \\
  15 & clear        &     567 &    126 &    8 &   433 \\
  16 & ring         &     537 &     24 &    7 &   