# Build and Zip PNG Files - Hackathon

In this notebook, we'll take the `basic` data set, use the `ibmseti` Python package to convert each data file into a spectrogram, and then save the spectrograms as `.png` files.


Then, we'll split the data set into a training set and a test set and create a handful of zip files for each class. This will dovetail into the next tutorial, where we will train a custom Watson Visual Recognition classifier (using the zip files of PNGs) and measure its performance with the test set.

## Spark Enterprise Cluster

This notebook is currently written to run on the Spark Enterprise Cluster. That is, the variables point to the data locations on the Enterprise Cluster. 

#### PowerAI

However, if you wish to run this on the PowerAI systems at the Hackathon, read cell 3 below. You only need to uncomment a few lines so that variables point to the data locations on the PowerAI system.

In [6]:
from __future__ import division

import cStringIO
import glob
import json
import os
import pickle
import re
import subprocess
import zipfile

import ibmseti
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [2]:
### SET YOUR TEAM NAME HERE! Intermediate results are saved in a folder with this name.
teamname = 'Machine_Contact_Enterprise_SETI'

# Per-team working folder, located directly under the directory named by $PWD.
mydatafolder = os.path.join(os.environ['PWD'], teamname)

In [3]:
# Object-store locations of the test-set archive (zip) and of its public
# index file (CSV).
testset_container = 'simsignals_basic_test_hackathon_final'
testset_zip_file = 'basic4_test_hackathon_final.zip'
testset_csv_container = 'simsignals_basic_test_hackathon_final'
testset_csv_file = 'public_list_basic_test_hackathon_final.csv'

base_url = 'https://dal.objectstorage.open.softlayer.com/v1/AUTH_cdbef52bdf7a449c96936e1071f0a46b'

# The downloads are written into mydatafolder, so it has to exist before curl
# runs (on a fresh environment nothing has created it yet).
if not os.path.exists(mydatafolder):
    os.makedirs(mydatafolder)


def _download_to_datafolder(container, filename):
    """Download `filename` from `container` into `mydatafolder` using curl.

    Returns 0 on success. Raises subprocess.CalledProcessError when curl exits
    with a non-zero status; `--fail` makes HTTP errors count as failures
    instead of the error page being saved as if it were the data file.
    """
    url = '{}/{}/{}'.format(base_url, container, filename)
    destination = os.path.join(mydatafolder, filename)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being misinterpreted.
    return subprocess.check_call(['curl', '--fail', '--output', destination, url])


_download_to_datafolder(testset_container, testset_zip_file)
_download_to_datafolder(testset_csv_container, testset_csv_file)

0

In [4]:
# Folder that the downloaded zip archive is extracted into
setiDataDir = mydatafolder + '/data_test_final'
if not os.path.exists(setiDataDir):
    os.makedirs(setiDataDir)

# Folder that will receive the spectrogram PNG files
outputpng_folder = mydatafolder + '/cleanPngsTestFinal'
if not os.path.exists(outputpng_folder):
    os.makedirs(outputpng_folder)

In [7]:
# Extract Data folder
def unzip_data(dest, zipf, src_dir=None):
    """Extract every zip archive named in `zipf` into the folder `dest`.

    Parameters
    ----------
    dest : str
        Folder the archive contents are extracted into.
    zipf : list of str
        File names of the zip archives, relative to `src_dir`.
    src_dir : str, optional
        Folder that holds the archives. Defaults to the notebook-level
        `mydatafolder`, which is where the download cell saves them.
    """
    if src_dir is None:
        src_dir = mydatafolder

    for archive_name in zipf:
        # The context manager closes the archive's file handle even when
        # extraction fails part-way through.
        with zipfile.ZipFile(src_dir + '/' + archive_name) as archive:
            archive.extractall(dest)
    
# Extract the archive downloaded above; using testset_zip_file keeps the
# archive name defined in exactly one place.
unzip_data(setiDataDir, [testset_zip_file])

In [9]:
# List the contents of the extraction folder to check that the archive was unpacked
!ls $setiDataDir

00096d4c-7e03-4c9b-b5e0-671900604308.dat
00243ff7-a949-4204-ac62-ff94dd11825c.dat
00944787-9d58-4df7-9103-bb3dc8b8c1b9.dat
009ac0e0-0ddd-43a1-a003-154cfb442241.dat
0126582e-dd2e-4157-95bd-affb1683ba04.dat
0155c032-c06e-4d5b-9f64-29ca986594f8.dat
015e1d76-5810-468a-8409-df25748d336f.dat
016f4745-7434-43b6-9a33-58d43cbc2678.dat
0171b373-7d9e-4468-8ace-99205a63d256.dat
019aefd1-fbc4-4011-bfaf-fa449b64bbc0.dat
01cc5d4a-82e4-49ed-97fa-b0bfc033b68c.dat
01f59627-c60f-4d0d-882e-107a5da9bafa.dat
02102286-eeb2-4bcc-8fd4-6152ed6ed595.dat
024378bc-9ca5-4092-b4e2-de7d5508cb7b.dat
02437a5e-7eb2-4963-ae90-b77bce703c39.dat
0250400d-e0c9-467d-b28b-7715110817df.dat
0252d7fb-2f7f-49d6-8584-1a6f78f5828e.dat
0256b5a4-48bb-4ba1-9c65-811a49acb1ef.dat
0272efca-b602-4b59-afec-2139b3733e5f.dat
0278448b-2db7-42ee-b124-cff7540b3676.dat
02a1214c-abdf-4372-b205-3fb730318820.dat
02d2fd3b-6006-41c5-9948-87b2de845633.dat
02e6fdec-53d2-4e11-b9e7-e398d259c5b9.dat
031f0c50-e323-4395-b8f0-69f4d6fd5a20.dat
03516bb0-8d86-45

In [10]:
# Choose your data set!
# workingDataDir holds the extracted .dat files; workingIndexFile is the CSV
# index that was downloaded next to the archive.
workingDataDir = setiDataDir
workingIndexFile = os.path.join(mydatafolder, testset_csv_file)

In [11]:
# Load the average noise spectrogram that is subtracted from every spectrogram
# before it is drawn.
# NOTE: pickle.load can execute arbitrary code -- only load a noise model that
# you produced yourself. The file is opened in binary mode because pickle data
# is bytes, not text.
with open(os.path.join(os.environ['PWD'],"Machine_Contact_Enterprise_SETI/data/noise_model", 'avg_noise_spec.pickle'), 'rb') as f:
    ave_noise_spec = pickle.load(f)
ave_noise_spec

array([[ 1882824.09552799,  1838358.5686519 ,  1869188.49484221, ...,
         2038357.60989169,  1893195.59065295,  1960076.585088  ],
       [ 1974230.7996238 ,  1941939.1676425 ,  1919398.73029401, ...,
         2050268.56220764,  1953144.86416195,  1961007.43239373],
       [ 1871181.80982976,  1905521.36430901,  2024956.35432492, ...,
         2025629.76995671,  1906223.3568083 ,  1982021.0262462 ],
       ..., 
       [ 2000243.12418183,  1944909.19270913,  2005465.52434639, ...,
         1922927.53579912,  1984581.82233909,  1914908.2979095 ],
       [ 2030607.66873829,  1933643.62808185,  1933235.04925246, ...,
         1923729.8729299 ,  1920887.96130031,  2072378.2660778 ],
       [ 1902755.62468942,  1933049.40425005,  1932979.38558076, ...,
         1983142.21288803,  1936541.57227182,  1940510.06554566]])

In [12]:
#Use `ibmseti`, or other methods, to draw the spectrograms

def draw_spectrogram(data):
    """Draw the noise-subtracted spectrogram of one raw data file.

    `data` is the raw content of a simulation data file. The spectrogram is
    computed with `ibmseti`, the notebook-level `ave_noise_spec` is subtracted
    from it, and the result is drawn on a new 10x5 matplotlib figure.

    Returns a tuple `(fig, header)`: the figure (the caller is responsible for
    saving and closing it) and the header reported by `ibmseti` for the file.
    """
    compamp = ibmseti.compamp.SimCompamp(data)
    spectrogram = compamp.get_spectrogram()

    ## Custom signal processing
    #
    #  Rather than calling SimCompamp.get_spectrogram, you can do your own
    #  signal processing here before the spectrogram is created.
    #
    #  get_spectrogram is relatively simple. This code reproduces it:
    #
    #  header, raw_data = r.content.split('\n',1)
    #  complex_data = np.frombuffer(raw_data, dtype='i1').astype(np.float32).view(np.complex64)
    #  complex_data = complex_data - complex_data.mean()  # have to subtract off any DC offset
    #  shape = (32, 6144)
    #  spectrogram = np.abs( np.fft.fftshift( np.fft.fft( complex_data.reshape(*shape) ), 1) )**2
    #
    #  Instead of that last line, can you manipulate `complex_data` with
    #  time-domain signal processing techniques (windowing? de-chirp?), or
    #  manipulate the output of np.fft.fft in a way that improves the signal to
    #  noise (Welch periodogram, subtract noise model)?
    #
    #  Example: apply a Hanning window
    #  complex_data = complex_data.reshape(*shape)
    #  complex_data = complex_data * np.hanning(complex_data.shape[1])
    #  spectrogram = np.abs( np.fft.fftshift( np.fft.fft( complex_data ), 1) )**2

    ## Noise subtraction
    #
    #  If you build an average noise spectrogram model for subtraction, this is
    #  where it gets applied (see the example that builds an average noise
    #  spectrogram).
    #
    #  Important: any signal processing applied to the raw data above must be
    #  applied in exactly the same way when the average noise spectrogram is
    #  calculated.
    spectrogram = spectrogram - ave_noise_spec

    figure, axes = plt.subplots(figsize=(10, 5))

    # Do different color mappings affect Watson's classification accuracy?
    # Alternatives to try (log scale with an explicit colormap):
    #
    # axes.imshow(np.log(spectrogram), aspect=aspect_ratio, cmap='hot')
    # axes.imshow(np.log(spectrogram), aspect=aspect_ratio, cmap='gray')
    # axes.imshow(np.log(spectrogram), aspect=aspect_ratio, cmap='Greys')
    aspect_ratio = 0.5*float(spectrogram.shape[1]) / spectrogram.shape[0]
    axes.imshow(spectrogram, aspect=aspect_ratio)

    ##
    ## For other ways to create images, see:
    ## tutorials/Step_5c_Convert_TS_to_unit8Dataset_DSX.ipynb
    ##

    return figure, compamp.header()


def convert_to_spectrogram_and_save(row):
    """Turn one index-file row into a spectrogram PNG saved in `outputpng_folder`.

    Parameters
    ----------
    row : str
        One line of the index file. Its first comma-separated field is the
        UUID of the data file; the test-set index has no further fields,
        other index files also carry a SIGNAL_CLASSIFICATION field, which is
        not needed here.

    Returns
    -------
    tuple
        `(filename, header, png_file_name)`: the name of the `.dat` file that
        was read from `workingDataDir`, the header returned by
        `draw_spectrogram`, and the name of the PNG file that was written.

    Raises
    ------
    IOError
        If the data file for the UUID cannot be opened.
    """
    # Only the UUID is needed. Taking the first field handles rows with and
    # without a classification column; strip() drops stray whitespace such as
    # a trailing carriage return.
    uuid = row.split(',')[0].strip()

    # Path to the local data file
    filename = uuid + '.dat'
    filepath = os.path.join(workingDataDir, filename)

    # The .dat files are binary: read them as bytes and close the handle
    # promptly (this function runs once per file on the workers).
    with open(filepath, 'rb') as datafile:
        rawdata = datafile.read()

    fig, header = draw_spectrogram(rawdata)

    png_file_name = filename + '.png'
    try:
        fig.savefig(os.path.join(outputpng_folder, png_file_name))
    finally:
        # Always release the figure, even if saving fails, so figures do not
        # pile up in memory.
        plt.close(fig)

    return (filename, header, png_file_name)

In [13]:
# Read the index file as an RDD of lines (30 is passed to textFile as its second
# argument) and drop the header row, i.e. the line that starts with 'UUID'.
rdd = sc.textFile(workingIndexFile, 30).filter(lambda line: not line.startswith('UUID'))

In [14]:
# Number of index rows left after the header line was filtered out
rdd.count()

1498

In [15]:
# Run convert_to_spectrogram_and_save on every index row and gather the
# returned (filename, header, png_file_name) tuples.
results = rdd.map(convert_to_spectrogram_and_save).collect()

In [16]:
# Show the first result tuple as a quick check
results[0]

(u'00096d4c-7e03-4c9b-b5e0-671900604308.dat',
 {u'uuid': u'00096d4c-7e03-4c9b-b5e0-671900604308'},
 u'00096d4c-7e03-4c9b-b5e0-671900604308.dat.png')