In [1]:
import pandas as pd
import requests as rq
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
import random
import numpy as np
import h5py
import os
from __future__ import print_function
from shapely import geometry
import fiona
import ipyparallel as ipp
import geopandas as gpd
from geopandas.geoseries import *
from geopandas.tools import sjoin
import shapefile
from shapely.geometry import shape, Point
import pickle

### Directory paths to the placepulse images and the augmentation data

In [2]:
imgDir = "/work/sagarj/Work/BellLabs/streetview/PPImages/"
AugDir = "/work/sagarj/Work/BellLabs/streetview/USAEasternAugImages/"

### Load placepulse vote data

In [3]:
df = pd.read_csv("../streetview/votes.csv")

### Shape file for localizing datasets

In [4]:
londonShp = "../shapeFiles/London_Ward_CityMerged.shp"
USAUAShp = "../shapeFiles/cb_2016_us_ua10_500k.shp"

In [5]:
df.keys()

Index([u'left_id', u'right_id', u'winner', u'left_lat', u'left_long',
       u'right_lat', u'right_long', u'category'],
      dtype='object')

In [None]:
df.head()

### check if a lat long falls in the shapefile polygon

In [None]:
def check(point, polygon):
    """Tell whether `point` lies inside at least one geometry of `polygon`.

    `polygon.contains(point)` must return an iterable of truthy/falsy flags
    (e.g. the element-wise result of a GeoSeries selection); the flags are
    reduced with `any`, so an empty result yields False.
    """
    return any(polygon.contains(point))

def checkPoly(point,polygonArray):
    """Return True if `point` falls inside any entry of `polygonArray`.

    Every entry is passed to `check` together with `point`. A generator is fed
    to `any` so evaluation stops at the first hit instead of running the
    containment test against every remaining polygon.
    """
    return any(check(point, poly) for poly in polygonArray)

### Create a dictionary of localized data

In [None]:
# Map every image id that appears on either side of a vote to its location.
# Points are built as (longitude, latitude); the first occurrence of an id wins.
geometryDict = {}
for idx, row in df.iterrows():
    for side in ('left', 'right'):
        image_id = row[side + '_id']
        if image_id in geometryDict:
            continue
        geometryDict[image_id] = Point(row[side + '_long'], row[side + '_lat'])

In [None]:
# Peek at the x coordinate of one stored point. The key view is materialised
# with list() because dict views cannot be indexed on Python 3.
geometryDict[list(geometryDict.keys())[2]].x

In [None]:
usa = gpd.read_file("../shapeFiles/cb_2016_us_ua10_500k.shp")

In [None]:
# london = gpd.read_file("../shapeFiles/statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp")

In [None]:
usa.head()

In [None]:
# pd.set_option('display.max_rows', len(usa['NAME10']))
# print(usa['NAME10'])
# pd.reset_option('display.max_rows')

In [None]:
# us_selected_poly = ['Washington, DC--VA--MD' , 'New York--Newark, NY--NJ--CT' , 'Boston, MA--NH--RI' , 'Seattle, WA' , 'Portland, OR--WA' , 
#                     'Denver--Aurora, CO' , 'San Diego, CA' , 'San Francisco--Oakland, CA']
us_selected_poly = ['Washington, DC--VA--MD' , 'New York--Newark, NY--NJ--CT' , 'Boston, MA--NH--RI']
# london_selected_poly = ['Kensington and Chelsea','Westminster','Lambeth','Southwark','Hammersmith and Fulham','City of London','Islington','Camden']

In [None]:
# london_poly = [london[(london['NAME'] == k)]['geometry'] for k in london_selected_poly]

### Select specific images from polygons

In [None]:
us_poly = [usa[(usa['NAME10'] == k)]['geometry'] for k in us_selected_poly]

In [None]:
print(us_poly)

In [None]:
# Smoke-test the containment check on a single image; list() keeps the
# key lookup working on Python 3, where dict views cannot be indexed.
checkPoly(geometryDict[list(geometryDict.keys())[1]] , us_poly )

In [None]:
us_poly_Ids = [k for k in geometryDict if checkPoly(geometryDict[k],us_poly)]

In [None]:
# london_poly_Ids = [k for k in geometryDict if checkPoly(geometryDict[k],london_poly)]

In [None]:
len(us_poly_Ids)# ,len(london_poly_Ids)

In [None]:
us_poly_Ids[1]

### Group the votes by category, as we plan to analyse each perceptual dimension of the images separately

In [None]:
grouped = df.groupby('category')

In [None]:
# One DataFrame of votes per category, keyed by the category name
categoryPosts = dict(
    (name, grouped.get_group(name)) for name in grouped.groups.keys()
)

In [None]:
categoryPosts.keys()

In [None]:
DimensionDF = categoryPosts['beautiful']

In [None]:
len(DimensionDF)

In [None]:
numbers = DimensionDF.groupby(['left_id'])

In [None]:
VotesDistribution = numbers.size()

### Use TrueSkill to convert images from a particular dimension to an ordinal scale

In [None]:
from trueskill import Rating , rate_1vs1 , rate
def trueSkillRate(df):
    """Replay every vote in `df` as a 1-vs-1 match and rate the images.

    `df` must provide the columns 'left_id', 'right_id' and 'winner'. A winner
    of 'left' or 'right' is a decisive match; any other value is rated as a
    draw. Images start from a fresh `Rating()` the first time they are seen.

    Returns a tuple `(skills, ratingTable)`:
      * skills      -- dict mapping image id to its latest rating
      * ratingTable -- list with one {left_id: rating, right_id: rating} dict
                       per vote, holding the ratings right after that vote
    """
    skills = {}
    ratingTable = []
    for _, vote in df.iterrows():
        left = vote['left_id']
        right = vote['right_id']
        if left not in skills:
            skills[left] = Rating()
        if right not in skills:
            skills[right] = Rating()

        outcome = vote['winner']
        if outcome == 'left':
            # rate_1vs1 takes the winner first and returns (winner, loser)
            new_left, new_right = rate_1vs1(skills[left], skills[right])
        elif outcome == 'right':
            new_right, new_left = rate_1vs1(skills[right], skills[left])
        else:
            new_right, new_left = rate_1vs1(skills[right], skills[left], drawn = True)

        skills[left] = new_left
        skills[right] = new_right
        ratingTable.append({left: new_left, right: new_right})

    return skills , ratingTable

### Code to do stability analysis of the TrueSkill rating (can be used for other methods) to see how many samples change ordinal ratings as you consider more and more competitions

In [None]:
def stabilityAnalysis(df , start , end):
    """Count how many images switch class as the vote threshold is raised.

    For a threshold t, only the votes whose 'left_id' occurs more than t times
    in `df` are rated with `trueSkillRate`; an image is then placed in the
    upper class when its mu is above 25 and in the lower class when it is
    below 25 (25 is the mu of a fresh trueskill Rating, assuming the default
    environment). An image sitting exactly at 25 belongs to neither class.

    Parameters
    ----------
    df : DataFrame of votes with 'left_id', 'right_id' and 'winner' columns.
    start, end : int, inclusive range of thresholds to walk through.

    Returns
    -------
    (classflips, rootflips) : two lists with one entry per threshold i in
        [start, end]. classflips counts images whose class at threshold i+1
        differs from the one at i; rootflips compares threshold i+1 with the
        `start` threshold.
    """
    # Group the frame that was passed in (previously the global DimensionDF
    # was used and the `df` argument was silently ignored).
    by_left_id = df.groupby('left_id')
    classes_by_threshold = {}

    def classes_at(threshold):
        # Rating is deterministic for a given subset, so each threshold is
        # rated once and reused (i+1 of one step is i of the next).
        if threshold not in classes_by_threshold:
            subset = by_left_id.filter(lambda x: len(x) > threshold)
            skills , _ = trueSkillRate(subset)
            upper = set(k for k in skills if skills[k].mu > 25)
            lower = set(k for k in skills if skills[k].mu < 25)
            classes_by_threshold[threshold] = (upper , lower)
        return classes_by_threshold[threshold]

    classflips = []
    rootflips = []
    root1 , root2 = classes_at(start)

    for i in range(start,end+1):
        r1 , r2 = classes_at(i)
        class1 , class2 = classes_at(i+1)

        flips_root = len(root1 & class2) + len(root2 & class1)
        flips_consecutive = len(r1 & class2) + len(r2 & class1)
        print ("Total class flips at threshold level %d from %d : %d , %d from root" %(i,i+1,flips_consecutive , flips_root ))
        rootflips.append(flips_root)
        classflips.append(flips_consecutive)

    return classflips , rootflips
        

In [None]:
from math import sin, cos, sqrt, atan2, radians
def linearDist( point1 , point2 ):
    """Great-circle (haversine) distance between two points, in kilometres.

    Both arguments must expose `.x` (longitude) and `.y` (latitude) in decimal
    degrees, which is how the points in this notebook's `geometryDict` are
    built. The result is `R * c` with R given in kilometres, so thresholds
    compared against it are kilometres as well.
    """
    # Equatorial radius of the earth in kilometres (not metres)
    R=6378.137
    # Coordinates converted to radians
    lat1 = radians(point1.y)
    lon1 = radians(point1.x)
    lat2 = radians(point2.y)
    lon2 = radians(point2.x)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c
 

In [None]:
# rollingFlips, rootFlips  = stabilityAnalysis(DimensionDF, 3 , 11)

In [None]:
# _, ax1 = plt.subplots(figsize=(15, 10))
# ax1.plot(range(4,13), rollingFlips  , 'b' )
# ax1.plot(range(4,13) , rootFlips  , 'g')

# ax1.set_xlabel('Votes Threshold', fontsize = 20)
# ax1.set_ylabel('Samples that switch classes' , fontsize = 20)
# plt.title("Trueskill Stability" , fontsize = 20)
# ax1.legend(["Flips compared to last threshold" , "Flips compared to threshold of 4 votes"])




In [None]:
# Distribution of the number of votes per left-hand image
sns.set(font_scale=1, rc={"figure.figsize": (16, 12)})
ax = sns.distplot(VotesDistribution)
ax.set(xlabel='Votes', ylabel='Population')


### Find the final Dataframe that we use for rankings

In [None]:
TargetDf = numbers.filter(lambda x: len(x) > 1) 

In [None]:
usa_df = TargetDf[(TargetDf['left_id'].isin(us_poly_Ids))]

In [None]:
len(list(set(usa_df['left_id'])))

In [None]:
# usa_df.to_csv("../streetview/easternCities2Votes.csv")

In [None]:
# fivevotes = pd.read_csv("../streetview/5votes.csv")
# len(fivevotes)

In [None]:
len(list(set(TargetDf['left_id'])))

In [None]:
len(TargetDf)

In [None]:
len(TargetDf[(TargetDf['winner'] == 'equal')])

In [None]:
from trueskill import Rating

In [None]:
from trueskill.backends import available_backends
from trueskill import setup
if 'mpmath' in available_backends():
    # mpmath can be used in the current environment. It is imported only
    # inside this guard: an unconditional import would raise ImportError on
    # machines without mpmath and make the availability check pointless.
    from mpmath import mp
    setup(backend='mpmath')
    # Working precision (decimal places) used by mpmath
    mp.dps = 32

In [None]:
depressingSkills , depRatingTable = trueSkillRate(TargetDf)

In [None]:
# Filter the final ratings down to the images located in the selected US cities
# leftids = list(set(TargetDf['left_id']))
leftids = us_poly_Ids
# Membership is tested once per rated image; a set makes each test O(1)
# instead of scanning the whole id list every time.
leftIdLookup = set(leftids)
finalSkills = {}
for k in depressingSkills:
    if k in leftIdLookup:
        finalSkills[k] = depressingSkills[k]

In [None]:
from itertools import combinations
def doSampleTesting(skills , sampleSize, iters , distThresh):
    """Repeatedly sample images and summarise rating gaps between nearby pairs.

    Each of the `iters` rounds draws `sampleSize` ids from `skills` without
    replacement, looks at every pair whose `linearDist` (looked up through the
    global `geometryDict`) is below `distThresh`, and collects the absolute
    difference of the two `.mu` values.

    Returns `(median, variance)`: two lists with one entry per round holding
    the median and the variance of those differences. A round in which no
    pair is close enough contributes NaN to both lists.
    """
    median = []
    variance = []
    # random.sample needs a real sequence; dict key views are rejected by
    # recent Python 3 releases.
    population = list(skills.keys())
    for i in range(iters):
        sample = random.sample(population , sampleSize)
        deltaRating = [
            abs(skills[first].mu - skills[second].mu)
            for first, second in combinations(sample, 2)
            if linearDist(geometryDict[first] , geometryDict[second]) < distThresh
        ]
        if deltaRating:
            median.append(np.median(deltaRating))
            variance.append(np.var(deltaRating))
        else:
            # Same value numpy would produce for an empty list, without the
            # "mean of empty slice" RuntimeWarning.
            median.append(np.nan)
            variance.append(np.nan)
    return median , variance

In [None]:
# Draw 300 rated images; list() is required because random.sample no longer
# accepts dict key views on recent Python 3 releases.
sample = random.sample(list(finalSkills.keys()),300)

In [None]:
from itertools import combinations

# For every pair of sampled images closer than 10 (in the units returned by
# linearDist), record the distance and the absolute gap between their mu values.
distances = []
deltaRating = []
for first, second in combinations(sample, 2):
    dist = linearDist(geometryDict[first] , geometryDict[second])
    if not dist < 10:
        continue
    distances.append(dist)
    deltaRating.append(abs(finalSkills[first].mu - finalSkills[second].mu))

In [None]:
deltaMedians , deltaVar = doSampleTesting(finalSkills , 300 , 100 , 10)

In [None]:
len(deltaMedians)

In [None]:
#g = sns.jointplot(np.asarray(distances), np.asarray(deltaRating), kind="reg")
ax = sns.distplot(deltaMedians)
ax = sns.distplot(deltaVar)


In [None]:
len(list(set(finalSkills.keys())))

In [None]:
depRatingTable[1]

In [None]:
# rated_depSkills = rate(depRatingTable)

In [None]:
# skill = {}
# for s in rated_depSkills:
#     for k in s:
#         if k in skill:
#             skill[k].append(s[k])
#         else:
#             skill[k] = []
#             skill[k].append(s[k])

In [None]:
# Image ids ordered (ascending) by the ratings from plain pairwise matches:
# once by mean (mu) and once by uncertainty (sigma).
sortedImagesSkillsMu = list(finalSkills)
sortedImagesSkillsMu.sort(key=lambda image_id: depressingSkills[image_id].mu)
sortedImagesSkillsSigma = list(finalSkills)
sortedImagesSkillsSigma.sort(key=lambda image_id: depressingSkills[image_id].sigma)

In [None]:
#Sorted by ranking tables
# sortedImages =  sorted(skill, key=lambda k: skill[k][-1].mu)

In [None]:
# sortedImagesByVar =  sorted(skill, key=lambda k: skill[k][-1].sigma)

In [None]:
# skill[sortedImages[-1]][0].mu

In [None]:
#Sorted by ranking tables
# imgid = sortedImages[-500]
# Image(imgDir + imgid + ".jpg")

In [None]:
def getPointMapping(df):
    """Build an {image id: Point} lookup from a votes DataFrame.

    `df` must provide 'left_id', 'left_lat', 'left_long', 'right_id',
    'right_lat' and 'right_long'. The first occurrence of an id decides its
    point.

    Points are created as (x, y) = (longitude, latitude). This matches the
    notebook-level `geometryDict` and `linearDist`, which reads `.x` as the
    longitude and `.y` as the latitude; the previous (latitude, longitude)
    order produced points with swapped axes.
    """
    geometryDict = {}
    for idx, row in df.iterrows():
        for side in ('left', 'right'):
            image_id = row[side + '_id']
            if image_id not in geometryDict:
                geometryDict[image_id] = Point(row[side + '_long'] , row[side + '_lat'])
    return geometryDict

    

In [None]:
len(sortedImagesSkillsMu)

In [None]:
# Sorted by plain pairvise matches
imgid = sortedImagesSkillsMu[-2]
Image(imgDir + imgid + ".jpg")

In [None]:
# dir(rated_depSkills[1][1])

In [None]:
skillMeans = [finalSkills[k].mu for k in finalSkills]
#skillMeans = [depressingSkills[k].mu for k in depressingSkills]

In [None]:
maxSkill= np.max(skillMeans)

In [None]:
print (maxSkill)

In [None]:
skillSigmas = [finalSkills[k].sigma for k in finalSkills]

In [None]:
# sigmas = [skill[k][-1].sigma for k in skill]

In [None]:
# Distribution of the mean rating (mu) over the selected images
sns.set(font_scale=1, rc={"figure.figsize": (16, 12)})
# ax = sns.distplot(skillMeans , hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True) )
ax = sns.distplot(skillMeans)
ax.set(xlabel='Score', ylabel='Population')

In [None]:
selectedDf = dict((k, [finalSkills[k].mu ]) for k in finalSkills if (finalSkills[k].mu < 22 or finalSkills[k].mu > 28))

In [None]:
#selectedDf = dict((k, [depressingSkills[k].mu ]) for k in depressingSkills if (depressingSkills[k].mu < 17 or depressingSkills[k].mu > 32))

In [None]:
len(selectedDf.keys())

In [None]:
# Binary-labelled records for the images at the extremes of the rating scale:
# label 1 when mu > 28, label 0 when mu < 22, anything in between is skipped.
testDf = {}
for k, rating in selectedDf.items():
    path = imgDir + k + ".jpg"
    score = rating[0]
    if score > 28:
        label = 1
    elif score < 22:
        label = 0
    else:
        continue
    testDf[k] = {'key': k, 'trueSkill': rating, 'label': label, 'path': path}




# fringeDf = {}
# for k in selectedDf:
#     path = imgDir + k + ".jpg"
#     if selectedDf[k][0] <19 :
#         d = {'key':k , 'trueSkill' : selectedDf[k] , 'label' : 0 , 'path' : path}
#     elif selectedDf[k][0] > 33:
#         d = {'key':k , 'trueSkill' : selectedDf[k] , 'label' : 1 , 'path' : path}
#     else:
#         continue
#     fringeDf[k] = dict()
#     fringeDf[k] = d

In [None]:
len(testDf)

In [None]:
# with open('../Data/cityDf.pkl', 'wb') as handle:
#     pickle.dump(testDf, handle , protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# sampledFringe = random.sample( testDf.items(), 5000 )

In [None]:
# Disabled: sampledFringe is only created by the commented-out sampling cell
# above, so evaluating it raises NameError on a fresh kernel.
# sampledFringe[1][1]

In [None]:
# labels = [sampledFringe[i][1]['label'] for i in range(len(sampledFringe))]
# np.sum(labels)

In [None]:
# with open('../Data/testImages.pkl', 'wb') as handle:
#     segnetLabels = pickle.dump(sampledFringe, handle , protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Show one of the selected images. The key view is turned into a list first
# because dict views cannot be indexed on Python 3.
imgid = list(selectedDf.keys())[-100]
#imgid = "51422739fdc9f04926008637"
Image(imgDir + imgid + ".jpg")
#print( selectedDf[list(selectedDf.keys())[-1]])

In [None]:
len(selectedDf)

In [None]:
# Ratings restricted to the images inside the selected US urban areas.
# A set gives constant-time membership tests instead of scanning the whole
# id list once per rated image.
usPolyIdSet = set(us_poly_Ids)
filtSkills = dict((k, depressingSkills[k]) for k in depressingSkills if k in usPolyIdSet)

In [None]:
# with open("beautyKeys.txt", "w") as f:
#     for key in depressingSkills:
#         f.write(key+"\n")

In [None]:
# Inspect one of the filtered ids (list() keeps this working on Python 3)
list(filtSkills.keys())[10]

In [None]:
skillsCity = [filtSkills[i].mu for i in filtSkills]

In [None]:
# Distribution of the mean rating (mu) over the city-filtered images
sns.set(font_scale=1, rc={"figure.figsize": (16, 12)})
# ax = sns.distplot(skillMeans , hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True) )
ax = sns.distplot(skillsCity)
ax.set(xlabel='Score', ylabel='Population')

In [None]:
# sns.distplot(means )

In [None]:
# sns.distplot(sigmas ,kde_kws={"color": "b", "lw": 2, "label": "Paiwise Trueskill Std. Deviation"} )
sns.distplot(skillSigmas , kde_kws={"color": "g", "lw": 2, "label": "Ranked table Trueskill Std. Deviation"},)

In [None]:
# Qdf = numbers.filter(lambda x: len(x) > 0) 

In [None]:
def QscoreRating(df):
    """Compute a Q-score for every image that appears as 'left_id' in `df`.

    Only votes in which an image is on the left side are counted for that
    image. For each such image k:

      * W[k] = share of its contests with winner == 'left'
      * L[k] = share of its contests with winner == 'right'
      * S1   = mean W of the right-hand images it beat (0.0 when that sum is 0)
      * S2   = mean L of the right-hand images it lost to (0.0 when that sum is 0)

    and the score is (10/3) * (W[k] + S1 - S2 + 1). Opponents that never
    appear on the left contribute 0 to S1 and S2. Any other `winner` value
    only increases the contest count.

    Returns a dict mapping image id to its score.
    """
    contests = {}
    wins = {}
    losses = {}
    beaten = {}       # image -> right-hand images it won against
    beaten_by = {}    # image -> right-hand images it lost to

    for _, vote in df.iterrows():
        left = vote['left_id']
        if left not in contests:
            contests[left] = 0
            wins[left] = 0
            losses[left] = 0
            beaten[left] = []
            beaten_by[left] = []

        contests[left] += 1

        outcome = vote['winner']
        if outcome == 'left':
            wins[left] += 1
            beaten[left].append(vote['right_id'])
        elif outcome == 'right':
            losses[left] += 1
            beaten_by[left].append(vote['right_id'])

    # Win and loss ratios per left-hand image
    W = dict((k, float(wins[k]) / float(contests[k])) for k in contests)
    L = dict((k, float(losses[k]) / float(contests[k])) for k in contests)

    rating = {}
    for k in contests:
        S1 = sum([W.get(other, 0) for other in beaten[k]])
        S1 = float(S1) / float(wins[k]) if S1 != 0 else 0.0

        S2 = sum([L.get(other, 0) for other in beaten_by[k]])
        S2 = float(S2) / float(losses[k]) if S2 != 0 else 0.0

        rating[k] = (10.0/3.0)*(W[k] + S1 - S2 + 1)
    return rating
            

In [None]:
# Qscores = QscoreRating(Qdf)

In [None]:
# len(set(Qdf['left_id']))

In [None]:
# finalQ = {}
# for index, row in TargetDf.iterrows():
#     finalQ[row['left_id']] = Qscores[row['left_id']]

In [None]:
# len(finalQ)

In [None]:
# sns.set(rc={"figure.figsize": (16, 12)})
# sns.distplot(finalQ.values() )
# #sns.distplot(finalQ.values() , hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True) )

In [None]:
# sortedSimple =  sorted(finalQ, key=lambda k: finalQ[k])

In [None]:
# len(sortedSimple)

In [None]:
# imgid = sortedSimple[3500]
# # imgid = "50f43ba4fdc9f065f000326b"
# Image(imgDir + imgid + ".jpg")

In [None]:
import os
import glob
import random
import numpy as np
import pickle

import cv2
import sys
caffe_root = '/work/sagarj/caffe-rc5/'  # this file should be run from {caffe_root}/examples (otherwise change this line)
sys.path.insert(0, caffe_root + 'python')

import caffe
from caffe.proto import caffe_pb2
import lmdb

#Size of images
IMAGE_WIDTH = 227
IMAGE_HEIGHT = 227

def transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT):
    """Equalize each channel of `img` and resize it to the requested size.

    The histogram of channels 0, 1 and 2 is equalized one by one and written
    back into `img`, so the array passed in is modified. The equalized image
    is then resized to (img_width, img_height) with bicubic interpolation and
    returned.
    """
    # Per-channel histogram equalization, stored back into the input array
    for channel in range(3):
        img[:, :, channel] = cv2.equalizeHist(img[:, :, channel])

    # Resize to the target dimensions
    resized = cv2.resize(img, (img_width, img_height), interpolation = cv2.INTER_CUBIC)
    return resized


def make_datum(img, label):
    """Wrap an image and its label in a caffe `Datum`.

    `img` is a numpy.ndarray with the channel axis last (BGR rather than RGB,
    as loaded by OpenCV). The channel axis is moved to the front and the raw
    bytes are stored in `data`. The datum is always declared with 3 channels
    and IMAGE_WIDTH x IMAGE_HEIGHT pixels, so `img` is presumably already
    resized to that shape -- this is not checked here.
    """
    return caffe_pb2.Datum(
        channels=3,
        width=IMAGE_WIDTH,
        height=IMAGE_HEIGHT,
        label=label,
        # tobytes() returns the same bytes as the deprecated tostring(),
        # which has been removed from recent numpy releases.
        data=np.rollaxis(img, 2).tobytes())

def save_obj(obj, name ):
    """Pickle `obj` to the file `name + '.pkl'` using the highest protocol.

    The '.pkl' suffix is always appended, whatever `name` already ends with.
    """
    target = name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Load and return the object pickled in the file `name + '.pkl'`.

    NOTE: unpickling can execute arbitrary code; only use this on files that
    were written by a trusted source.
    """
    source = name + '.pkl'
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


In [None]:
testKeys = random.sample(list(finalSkills.keys()), int(0.2*len(finalSkills.keys())))
#testKeys = random.sample(list(depressingSkills.keys()), int(0.2*len(depressingSkills.keys())))
#testKeys = random.sample(list(filtSkills.keys()), int(0.15*len(filtSkills.keys())))


In [None]:
# Everything that was not drawn for the test split goes to training.
# A set makes each membership test O(1) instead of scanning the test list.
testKeySet = set(testKeys)
trainKeys = [k for k in finalSkills.keys() if k not in testKeySet]
#trainKeys = [k for k in depressingSkills.keys() if k not in testKeys]
#trainKeys = [k for k in filtSkills.keys() if k not in testKeys]

In [None]:
len(set(trainKeys).intersection(testKeys))

In [None]:
len(testKeys) , len(trainKeys)

In [None]:
import random

def _image_paths_with_augmentations(keys):
    """Map each key to its original image path plus up to 20 augmented ones.

    Keys whose original image (imgDir + key + ".jpg") does not exist are left
    out. When the directory AugDir + key exists, its listing is shuffled and
    the first 20 entries are appended after the original path.
    """
    collected = {}
    for k in keys:
        original = imgDir + k + ".jpg"
        if not os.path.exists(original):
            continue
        paths = [original]
        if os.path.exists(AugDir + k):
            images = os.listdir(AugDir + k)
            random.shuffle(images)
            paths.extend(AugDir + k + "/" + name for name in images[:20])
        collected[k] = paths
    return collected

# Train is collected before test so the shuffles consume random state in the
# same order as before.
train_data = _image_paths_with_augmentations(trainKeys)
test_data = _image_paths_with_augmentations(testKeys)
    
    

In [None]:
len(train_data) , len(test_data)

In [None]:
# Sample 300 training keys; list() is required because random.sample no
# longer accepts dict key views on recent Python 3 releases.
s = random.sample(list(train_data.keys()), 300)

In [None]:
# augTest = {}
# for k in s: 
#     if len(train_data[k]) > 1:
#         augTest[k] = train_data[k]
# save_obj(augTest , "sampledAugment.pk")

In [None]:
# Inspect the paths collected for one training image (list() keeps the key
# lookup working on Python 3, where dict views cannot be indexed)
train_data[list(train_data.keys())[1]]

## Create train and test path files for transfer learning

In [None]:
# ImageList = "../Data/TrainImageListBinary_augmented.txt"
# label = 0
# for in_idx, (k, img_path) in enumerate(train_data.items()):
#     val = int(depressingSkills[k].mu)
# #     if val < 23:
# #         label = 0
# #     elif val > 22 and val < 29:
# #         label = 1
# #     elif val > 28:
# #         label = 2
#     if val < 21:
#         label = 0
#     elif val > 29:
#         label = 1
#     else:
#         continue
#     with open(ImageList,'a') as f:
#         for p in img_path:
#             f.write(p + "," + str(label) + "\n")

# ImageList = "../Data/TestImageListBinary_augmented.txt"
# label = 0
# for in_idx, (k, img_path) in enumerate(test_data.items()):
#     val = int(depressingSkills[k].mu)
# #     if val < 23:
# #         label = 0
# #     elif val > 22 and val < 29:
# #         label = 1
# #     elif val > 28:
# #         label = 2
#     if val < 21:
#         label = 0
#     elif val > 29:
#         label = 1
#     else:
#         continue
#     with open(ImageList,'a') as f:
#         for p in img_path:
#             f.write(p + "," + str(label) + "\n")
    

## LMDB creation Logic

In [None]:
train_lmdb = '../Data/train_lmdb_beauty_augmented_sparse'
validation_lmdb = '../Data/validation_lmdb_beauty_augmented_sparse'
log = "lmdblogs.log"

In [None]:
# print ('Creating train_lmdb')
# f = open(log,'w')
# in_db = lmdb.open(train_lmdb, map_size=int(1e12))
# with in_db.begin(write=True) as in_txn:
#     for in_idx, (k, img_path) in enumerate(train_data.items()):
#         val = int(depressingSkills[k].mu)
#         label = 0
# #         if val < 22.0:
# #             label = 0
# #         elif val >= 22.0 and val < 30.0:
# #             label = 1
# #         elif val >= 30.0:
# #             label = 2
        
#         if val < 20.0:
#             label = 0
#         elif val > 30.0:
#             label = 1
#         else:
#             continue
#         for p in img_path:
#             img = cv2.imread(p, cv2.IMREAD_COLOR)
#             img = transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT)
#             datum = make_datum(img, label)
#             in_txn.put('{:0>5d}'.format(in_idx), datum.SerializeToString())
#             line = '{:0>5d}'.format(in_idx) + ':' + p + "\n"
#             f.write(line)
# in_db.close()


# print ('\nCreating validation_lmdb')

# in_db = lmdb.open(validation_lmdb, map_size=int(1e12))
# with in_db.begin(write=True) as in_txn:
#     for in_idx, (k, img_path) in enumerate(test_data.items()):
        
#         val = int(depressingSkills[k].mu)
#         label = 0
#         label = 0
# #         if val < 22.0:
# #             label = 0
# #         elif val >= 22.0 and val < 30.0:
# #             label = 1
# #         elif val >= 30.0:
# #             label = 2
            
#         if val < 20.0:
#             label = 0
#         elif val > 30.0:
#             label = 1
#         else:
#             continue
        
#         for p in img_path:
#             img = cv2.imread(p, cv2.IMREAD_COLOR)
#             img = transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT)
#             datum = make_datum(img, label)
#             in_txn.put('{:0>5d}'.format(in_idx), datum.SerializeToString())
#             line = '{:0>5d}'.format(in_idx) + ':' + p + "\n"
#             f.write(line)
# in_db.close()
# f.close()
# print ('\nFinished processing all images')


## HDF5 creation logic

In [None]:
# Extract mean from the mean image file
# mean_file_binaryproto = '../Data/Safety8Mean.binaryproto' # Mean image file
# mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
# f = open(mean_file_binaryproto, 'rb')
# mean_blobproto_new.ParseFromString(f.read())
# mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
# f.close()

In [None]:
# dataset_test = []
# labels = []
# BatchSize = 5000

# for in_idx, (k, img_path) in enumerate(test_data.items()):
#     img = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     img = transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT)
#     img = np.rollaxis(img, 2) - mean_image
#     dataset_test.append(img)
#     labels.append(float(depressingSkills[k].mu / maxSkill))

    
# dataset_t = np.stack(dataset_test,axis = 0)
# dataset_t = np.squeeze(dataset_t, axis=1)
# labels = np.asarray(labels)

# i = 0
# DIR = "/work/sagarj/Work/BellLabs/Data/h5Data/"

# text_fn = os.path.join(DIR, 'test_SafetyRegression.txt')
# for start, end in zip(range(0, len(dataset_t), BatchSize), range(BatchSize, len(dataset_t), BatchSize)):
    
#     h5_fn = DIR+'test_SafetyRegression' + str(i) +'.h5'
#     with h5py.File(h5_fn, 'w') as f:
#         f['data'] = dataset_t[start:end]
#         f['label'] = labels[start:end]
        
#     with open(text_fn, 'a') as f:
#         print(h5_fn, file = f)
#     i+=1


In [None]:
# dataset_t.shape , labels.shape

In [None]:
# dataset_test = []
# labels = []
# BatchSize = 5000


# for in_idx, (k, img_path) in enumerate(train_data.items()):
#     img = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     img = transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT)
#     img = np.rollaxis(img, 2) - mean_image
#     dataset_test.append(img)
#     labels.append(float(depressingSkills[k].mu / maxSkill))


# dataset_t = np.stack(dataset_test,axis = 0)
# dataset_t = np.squeeze(dataset_t, axis=1)
# labels = np.asarray(labels)
# i = 0
# DIR = "/work/sagarj/Work/BellLabs/Data/h5Data/"
# text_fn = os.path.join(DIR, 'train_SafetyRegression.txt')
# for start, end in zip(range(0, len(dataset_t), BatchSize), range(BatchSize, len(dataset_t), BatchSize)):
    
#     h5_fn = DIR+'train_SafetyRegression' + str(i) +'.h5'
#     with h5py.File(h5_fn, 'w') as f:
#         f['data'] = dataset_t[start:end]
#         f['label'] = labels[start:end]
        
#     with open(text_fn, 'a') as f:
#         print(h5_fn, file = f)
#     i+=1


In [None]:
# dataset_t.shape , labels.shape

In [None]:
# sns.distplot(labels )

## Logic for moving images

In [None]:
# curatedDir = "../streetview/RankedDepress_4/"
# beauty = "1/"
# notBeauty = "0/"
# train = curatedDir + "train/"
# test = curatedDir + "test/"

In [None]:
# from shutil import copyfile


In [None]:
# Move train set

# for k in trainKeys:
#     src = imgDir + k + ".jpg"
#     if not os.path.exists(src):
#         continue
#     if depressingSkills[k].mu > 25:
#         destDir = curatedDir + train + beauty 
#         if not os.path.exists(os.path.dirname(destDir)):
#             os.makedirs(os.path.dirname(destDir))
#         dest = destDir+ k + ".jpg"
#         copyfile(src , dest)
    
#     if depressingSkills[k].mu < 25:
#         destDir = curatedDir + train + notBeauty 
#         if not os.path.exists(os.path.dirname(destDir)):
#             os.makedirs(os.path.dirname(destDir))
#         dest = destDir + k + ".jpg"
#         copyfile(src , dest)    

In [None]:
# Move test

# for k in keys:
#     src = imgDir + k + ".jpg"
#     if not os.path.exists(src):
#         continue
#     if depressingSkills[k].mu > 25:
#         destDir = curatedDir + test + beauty 
#         if not os.path.exists(os.path.dirname(destDir)):
#             os.makedirs(os.path.dirname(destDir))
#         dest = destDir+ k + ".jpg"
#         copyfile(src , dest)
    
#     if depressingSkills[k].mu < 25:
#         destDir = curatedDir + test + notBeauty 
#         if not os.path.exists(os.path.dirname(destDir)):
#             os.makedirs(os.path.dirname(destDir))
#         dest = destDir + k + ".jpg"
#         copyfile(src , dest)    