In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Reshape, Dropout, Dense 
from tensorflow.keras.layers import Flatten, BatchNormalization
from tensorflow.keras.layers import Activation, ZeroPadding2D
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import UpSampling2D, Conv2D
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam
import numpy as np
from PIL import Image
from tqdm import tqdm
import os 
import time
import matplotlib.pyplot as plt
import pandas as pd

In [39]:
pd.options.display.max_columns = None

In [5]:
data = pd.read_spss('../data/uktus15_diary_ep_long.sav')

In [40]:
data.head()

Unnamed: 0,serial,strata,psu,pnum,daynum,HhOut,IndOut,DMFlag,IMonth,IYear,DVAge,DayNum_DiaryDay,DPday,DiaryDate_Act,DiaryDay_Act,DiaryDateDiff,dmonth,dyear,ddayw,DiaryType,WhenDiary,AfterDiaryDay,WhereStart,WhereEnd,RushedD,Ordinary,KindOfDay,Trip,dia_wt_a,dia_wt_b,epnum,tid,eptime,whatdoing,What_Oth1,What_Oth2,What_Oth3,WhereWhen,Device,WithAlone,WithSpouse,WithMother,WithFather,WithChild,WithOther,WithOtherYK,WithMiss,WithNA,Enjoy,impuflag
0,11011202.0,110.0,117.0,1.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,13637640000.0,13637640000.0,Thursday,0.0,December,2014.0,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,2.021182,1.58118,1.0,04:00-04:10,110.0,Sleep,No answer/refused,No answer/refused,No answer/refused,Home,not using device,Not reported,Reported,Not reported,Not reported,Not reported,Reported,Not reported,Co-presence reported,main act: work/edu/sleep,very much,not missing
1,11011202.0,110.0,117.0,1.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,13637640000.0,13637640000.0,Thursday,0.0,December,2014.0,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,2.021182,1.58118,2.0,05:50-06:00,10.0,Other specified TV watching,Sleep: In bed not asleep,No answer/refused,No answer/refused,Home,not using device,Not reported,Reported,Not reported,Not reported,Not reported,Reported,Not reported,Co-presence reported,Not reported,very much,not missing
2,11011202.0,110.0,117.0,1.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,13637640000.0,13637640000.0,Thursday,0.0,December,2014.0,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,2.021182,1.58118,3.0,06:00-06:10,10.0,Other personal care: Wash and dress,No answer/refused,No answer/refused,No answer/refused,Home,not using device,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,No co-presence reported,Not reported,3.0,not missing
3,11011202.0,110.0,117.0,1.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,13637640000.0,13637640000.0,Thursday,0.0,December,2014.0,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,2.021182,1.58118,4.0,06:10-06:20,10.0,Cleaning dwelling,No answer/refused,No answer/refused,No answer/refused,Home,not using device,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,No co-presence reported,Not reported,3.0,not missing
4,11011202.0,110.0,117.0,1.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,13637640000.0,13637640000.0,Thursday,0.0,December,2014.0,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,2.021182,1.58118,5.0,06:20-06:30,10.0,Food preparation and baking,No answer/refused,No answer/refused,No answer/refused,Home,using device,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,No co-presence reported,Not reported,5.0,not missing


In [11]:
def cat_vs_num(data):
    col_names = data.columns
    cat_cols = []
    num_cols = []
    for i in col_names:
        if data[i].dtypes == 'category':
            cat_cols.append(i)
        else:
            num_cols.append(i)
    return cat_cols, num_cols

In [36]:
cat_cols, num_cols = cat_vs_num(data)

In [62]:
cat_df = data[cat_cols]
num_df = data[num_cols]

In [64]:
cat_df.head()

Unnamed: 0,strata,psu,daynum,HhOut,IndOut,DMFlag,IMonth,IYear,DVAge,DayNum_DiaryDay,DiaryDay_Act,DiaryDateDiff,dmonth,ddayw,DiaryType,WhenDiary,AfterDiaryDay,WhereStart,WhereEnd,RushedD,Ordinary,KindOfDay,Trip,tid,whatdoing,What_Oth1,What_Oth2,What_Oth3,WhereWhen,Device,WithAlone,WithSpouse,WithMother,WithFather,WithChild,WithOther,WithOtherYK,WithMiss,WithNA,Enjoy,impuflag
0,110.0,117.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,Thursday,0.0,December,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,04:00-04:10,Sleep,No answer/refused,No answer/refused,No answer/refused,Home,not using device,Not reported,Reported,Not reported,Not reported,Not reported,Reported,Not reported,Co-presence reported,main act: work/edu/sleep,very much,not missing
1,110.0,117.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,Thursday,0.0,December,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,05:50-06:00,Other specified TV watching,Sleep: In bed not asleep,No answer/refused,No answer/refused,Home,not using device,Not reported,Reported,Not reported,Not reported,Not reported,Reported,Not reported,Co-presence reported,Not reported,very much,not missing
2,110.0,117.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,Thursday,0.0,December,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,06:00-06:10,Other personal care: Wash and dress,No answer/refused,No answer/refused,No answer/refused,Home,not using device,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,No co-presence reported,Not reported,3.0,not missing
3,110.0,117.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,Thursday,0.0,December,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,06:10-06:20,Cleaning dwelling,No answer/refused,No answer/refused,No answer/refused,Home,not using device,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,No co-presence reported,Not reported,3.0,not missing
4,110.0,117.0,1.0,Productive : At least one individual interview...,"Individual interview complete, diary collected","Fully productive adult, from part prod HH",December,2014.0,48.0,1.0,Thursday,0.0,December,Mon-Fri,Version 1 Adult (14+ yrs) (with Enjoyment ques...,Now and then during the diary day,Item not applicable,At home,At home,No,2.0,A day off due to the weekend/holiday or work s...,No,06:20-06:30,Food preparation and baking,No answer/refused,No answer/refused,No answer/refused,Home,using device,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,Not reported,No co-presence reported,Not reported,5.0,not missing


In [98]:
def to_cat(data):
    for i in data.columns:
        data[i] = data[i].astype('str')
    return data

In [99]:
df = to_cat(cat_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = data[i].astype('str')


In [100]:
type(df['strata'][1])

str

In [45]:
cat_df['psu'].value_counts()

810.0                      2355
419.0                      2315
104.0                      2295
398.0                      2227
462.0                      2226
                           ... 
336.0                       138
304.0                        89
Schedule not applicable      89
205.0                        79
548.0                        56
Name: psu, Length: 585, dtype: int64

In [43]:
num_df.head()

Unnamed: 0,serial,pnum,DPday,DiaryDate_Act,dyear,dia_wt_a,dia_wt_b,epnum,eptime
0,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,1.0,110.0
1,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,2.0,10.0
2,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,3.0,10.0
3,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,4.0,10.0
4,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,5.0,10.0


In [113]:
from sklearn.preprocessing import LabelEncoder

def cat_to_num(data):
    cat_cols, num_cols = cat_vs_num(data)
    cat_df = data[cat_cols]
    cat_df = to_cat(cat_df)
    num_df = data[num_cols]
    data_enc = pd.DataFrame()
	# label encode each column
    for i in range(cat_df.shape[1]):
        le = LabelEncoder()
        le.fit(cat_df.iloc[:, i])
        cat_df_enc = le.transform(cat_df.iloc[:, i])
        data_enc[cat_cols[i]] = cat_df_enc
    data_enc = pd.concat([data_enc, num_df], axis=1)
    return data_enc

In [114]:
data_enc = cat_to_num(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = data[i].astype('str')


In [115]:
data_enc.shape

(587632, 50)

In [116]:
data_enc.head()

Unnamed: 0,strata,psu,daynum,HhOut,IndOut,DMFlag,IMonth,IYear,DVAge,DayNum_DiaryDay,DiaryDay_Act,DiaryDateDiff,dmonth,ddayw,DiaryType,WhenDiary,AfterDiaryDay,WhereStart,WhereEnd,RushedD,Ordinary,KindOfDay,Trip,tid,whatdoing,What_Oth1,What_Oth2,What_Oth3,WhereWhen,Device,WithAlone,WithSpouse,WithMother,WithFather,WithChild,WithOther,WithOtherYK,WithMiss,WithNA,Enjoy,impuflag,serial,pnum,DPday,DiaryDate_Act,dyear,dia_wt_a,dia_wt_b,epnum,eptime
0,9,15,0,1,0,1,2,0,38,0,4,26,2,0,0,3,21,0,0,0,1,0,0,21,162,73,52,22,1,1,0,1,0,0,0,1,0,0,1,8,6,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,1.0,110.0
1,9,15,0,1,0,1,2,0,38,0,4,26,2,0,0,3,21,0,0,0,1,0,0,32,83,158,52,22,1,1,0,1,0,0,0,1,0,0,0,8,6,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,2.0,10.0
2,9,15,0,1,0,1,2,0,38,0,4,26,2,0,0,3,21,0,0,0,1,0,0,33,82,73,52,22,1,1,0,0,0,0,0,0,0,1,0,1,6,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,3.0,10.0
3,9,15,0,1,0,1,2,0,38,0,4,26,2,0,0,3,21,0,0,0,1,0,0,34,24,73,52,22,1,1,0,0,0,0,0,0,0,1,0,1,6,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,4.0,10.0
4,9,15,0,1,0,1,2,0,38,0,4,26,2,0,0,3,21,0,0,0,1,0,0,35,42,73,52,22,1,2,0,0,0,0,0,0,0,1,0,3,6,11011202.0,1.0,13637640000.0,13637640000.0,2014.0,2.021182,1.58118,5.0,10.0


In [119]:
data_enc.to_csv('../data/long_encoded.csv', index= False)

In [None]:
# example of learned embedding encoding for a neural network
from numpy import unique
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers.merge import concatenate
from keras.utils import plot_model

# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	X_train_enc, X_test_enc = list(), list()
	# label encode each column
	for i in range(X_train.shape[1]):
		le = LabelEncoder()
		le.fit(X_train[:, i])
		# encode
		train_enc = le.transform(X_train[:, i])
		test_enc = le.transform(X_test[:, i])
		# store
		X_train_enc.append(train_enc)
		X_test_enc.append(test_enc)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# make output 3d
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))
# prepare each input head
in_layers = list()
em_layers = list()
for i in range(len(X_train_enc)):
	# calculate the number of unique inputs
	n_labels = len(unique(X_train_enc[i]))
	# define input layer
	in_layer = Input(shape=(1,))
	# define embedding layer
	em_layer = Embedding(n_labels, 10)(in_layer)
	# store layers
	in_layers.append(in_layer)
	em_layers.append(em_layer)
# concat all embeddings
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# plot graph
plot_model(model, show_shapes=True, to_file='embeddings.png')
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# Generation resolution - Must be square 
# Training data is also scaled to this.
# Note GENERATE_RES 4 or higher  
# will blow Google CoLab's memory and have not
# been tested extensivly.
GENERATE_RES = 3 # Generation resolution factor 
# (1=32, 2=64, 3=96, 4=128, etc.)
GENERATE_SQUARE = 32 * GENERATE_RES # rows/cols (should be square)
IMAGE_CHANNELS = 3

# Preview image 
PREVIEW_ROWS = 4
PREVIEW_COLS = 7
PREVIEW_MARGIN = 16

# Size vector to generate images from
SEED_SIZE = 100

# Configuration
DATA_PATH = '/content/drive/My Drive/projects/faces'
EPOCHS = 50
BATCH_SIZE = 32
BUFFER_SIZE = 60000

print(f"Will generate {GENERATE_SQUARE}px square images.")

In [None]:
# Image set has 11,682 images.  Can take over an hour 
# for initial preprocessing.
# Because of this time needed, save a Numpy preprocessed file.
# Note, that file is large enough to cause problems for 
# sume verisons of Pickle,
# so Numpy binary files are used.
training_binary_path = os.path.join(DATA_PATH,
        f'training_data_{GENERATE_SQUARE}_{GENERATE_SQUARE}.npy')

print(f"Looking for file: {training_binary_path}")

if not os.path.isfile(training_binary_path):
  start = time.time()
  print("Loading training images...")

  training_data = []
  faces_path = os.path.join(DATA_PATH,'face_images')
  for filename in tqdm(os.listdir(faces_path)):
      path = os.path.join(faces_path,filename)
      image = Image.open(path).resize((GENERATE_SQUARE,
            GENERATE_SQUARE),Image.ANTIALIAS)
      training_data.append(np.asarray(image))
  training_data = np.reshape(training_data,(-1,GENERATE_SQUARE,
            GENERATE_SQUARE,IMAGE_CHANNELS))
  training_data = training_data.astype(np.float32)
  training_data = training_data / 127.5 - 1.


  print("Saving training image binary...")
  np.save(training_binary_path,training_data)
  elapsed = time.time()-start
  print (f'Image preprocess time: {hms_string(elapsed)}')
else:
  print("Loading previous training pickle...")
  training_data = np.load(training_binary_path)

In [None]:
# Batch and shuffle the data
train_dataset = tf.data.Dataset.from_tensor_slices(training_data) \
    .shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
def build_generator(seed_size, channels):
    model = Sequential()

    model.add(Dense(4*4*256,activation="relu",input_dim=seed_size))
    model.add(Reshape((4,4,256)))

    model.add(UpSampling2D())
    model.add(Conv2D(256,kernel_size=3,padding="same"))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Activation("relu"))

    model.add(UpSampling2D())
    model.add(Conv2D(256,kernel_size=3,padding="same"))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Activation("relu"))
   
    # Output resolution, additional upsampling
    model.add(UpSampling2D())
    model.add(Conv2D(128,kernel_size=3,padding="same"))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Activation("relu"))

    if GENERATE_RES>1:
      model.add(UpSampling2D(size=(GENERATE_RES,GENERATE_RES)))
      model.add(Conv2D(128,kernel_size=3,padding="same"))
      model.add(BatchNormalization(momentum=0.8))
      model.add(Activation("relu"))

    # Final CNN layer
    model.add(Conv2D(channels,kernel_size=3,padding="same"))
    model.add(Activation("tanh"))

    return model


def build_discriminator(image_shape):
    model = Sequential()

    model.add(Conv2D(32, kernel_size=3, strides=2, input_shape=image_shape, 
                     padding="same"))
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dropout(0.25))
    model.add(Conv2D(64, kernel_size=3, strides=2, padding="same"))
    model.add(ZeroPadding2D(padding=((0,1),(0,1))))
    model.add(BatchNormalization(momentum=0.8))
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dropout(0.25))
    model.add(Conv2D(128, kernel_size=3, strides=2, padding="same"))
    model.add(BatchNormalization(momentum=0.8))
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dropout(0.25))
    model.add(Conv2D(256, kernel_size=3, strides=1, padding="same"))
    model.add(BatchNormalization(momentum=0.8))
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dropout(0.25))
    model.add(Conv2D(512, kernel_size=3, strides=1, padding="same"))
    model.add(BatchNormalization(momentum=0.8))
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    return model