In [65]:
import urllib.request

import numpy as np
import pandas as pd
from keras import models
from keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, GlobalMaxPooling1D, Dropout, Activation
from keras.layers import BatchNormalization, Conv2D, Input, MaxPooling2D
from keras.models import Model
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Demo: version control with Jupyter notebooks.

In [35]:
from sixty2k_crystals import utils

In [10]:
import urllib
import os
import zipfile

In [24]:
def download_data(directory=None):
    '''
    Downloads (if necessary) and unpacks the dataset for the 62k
    crystal analysis.

    The function is safe to re-run: if the extracted JSON file is
    already present it is returned immediately, and if only the zip
    archive is present it is extracted without downloading again.

    Parameters
    ----------
    directory : str or None
        Directory where the 'm1507656' data lives (or should be
        placed). If None, the current working directory is used.
        The directory is created if it does not exist.

    Returns
    -------
    filename : str
        The absolute filename of the 'df_62k.json' file
        used for the downstream analysis

    Raises
    ------
    urllib.error.URLError
        If the archive has to be downloaded and the request fails.
    zipfile.BadZipFile
        If the archive on disk is not a valid zip file.

    '''
    if directory is None:
        directory = os.getcwd()
    directory = os.path.abspath(directory)
    os.makedirs(directory, exist_ok=True)

    # Build the path with os.path.join so it is valid on every OS
    # (a hard-coded '\\' separator only works on Windows).
    json_path = os.path.join(directory, 'm1507656', 'df_62k.json')
    if os.path.isfile(json_path):
        return json_path

    archive = os.path.join(directory, 'm1507656.zip')
    if not os.path.isfile(archive):
        url = 'https://dataserv.ub.tum.de/s/m1507656/download'
        # Download to a temporary name first so an interrupted transfer
        # never leaves a truncated 'm1507656.zip' behind.
        partial = archive + '.part'
        try:
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, archive)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    with zipfile.ZipFile(archive, 'r') as zip_ref:
        zip_ref.extractall(directory)

    return json_path

In [25]:
def unpack_data(filename):
    '''
    Stores data from the json file into a pandas dataframe
    and preps SMILES strings data for downstream analysis.

    Every SMILES string is left-padded with '0' characters to the
    length of the longest string in the file and then split into a
    list of single characters.

    Parameters
    ----------
    filename : str
        File location of the data (JSON written with orient='split').

    Returns
    -------
    df_62k : pandas.DataFrame
        A pandas dataframe of the data
    molecules : list
        A list of the SMILES strings expressed as lists of
        individual characters, all of the same length. Empty if
        the file contains no rows.

    Raises
    ------
    KeyError
        If the file has no 'canonical_smiles' column.
    TypeError
        If an entry of 'canonical_smiles' is not a string
        (e.g. a missing value).

    '''

    # Unpack data into pandas dataframe
    df_62k = pd.read_json(filename, orient='split')

    # Extract SMILES strings as plain Python objects so the code does not
    # depend on which array type backs the column.
    smiles = df_62k['canonical_smiles'].tolist()
    if not smiles:
        return df_62k, []

    maxlen = max(len(s) for s in smiles)

    # Left-pad with '0' so all strings have the same length. rjust is used
    # instead of zfill because zfill treats a leading '+'/'-' as a numeric
    # sign and would insert the padding *after* it, reordering characters.
    molecules = [list(s.rjust(maxlen, '0')) for s in smiles]

    return df_62k, molecules

In [27]:
# Fetch the dataset into the current working directory and get the JSON path.
filename = download_data(directory=os.getcwd())
# df_62k: the full table; molecules: padded SMILES as lists of characters.
df_62k, molecules = unpack_data(filename)

In [30]:
def encoded_smiles(string_array):
    '''
    One-hot encode SMILES strings that were split into characters.

    Parameters
    ----------
    string_array : list
        A list of equal-length lists, each holding the individual
        characters of one SMILES string.

    Returns
    -------
    enc : OneHotEncoder
        The fitted encoder (unknown categories are ignored at
        transform time), usable to encode further strings.
    x1 : numpy.ndarray
        Dense 2D array with one row per input string containing
        its one-hot (binary) representation.

    '''
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Fit and encode in a single pass, then densify the sparse result.
    dense_codes = encoder.fit_transform(string_array).toarray()

    return encoder, dense_codes

In [31]:
# Fit the one-hot encoder on all molecules; x1 is the dense encoded matrix.
enc, x1 = encoded_smiles(molecules)

In [40]:
# Regression target: the 'total_energy_pbe' column, one value per row of df_62k.
y = df_62k.total_energy_pbe.values

In [41]:
# Sanity check: y should be 1-D with one entry per molecule.
y.shape

(61489,)

In [49]:
# Fix the seed so the train/test split is identical after a kernel restart.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    x1, y, test_size=0.33, random_state=RANDOM_STATE)

In [50]:
# NOTE(review): batch_size and epochs are re-assigned in the hyperparameter
# cell before training, and num_classes is not used in the visible cells;
# they are kept here only so nothing that may rely on them breaks.
batch_size = 128
num_classes = 10
epochs = 12

# Input dimensions for the 1-D convolution: steps x channels.
# The step count is taken from the encoded matrix instead of being
# hard-coded, so it stays correct if the dataset or encoding changes.
img_rows, img_cols = X_train.shape[1], 1

In [53]:
# Shape before reshaping: (samples, encoded features).
X_train.shape

(41197, 5495)

In [59]:
# Add a trailing channel axis so each sample is (img_rows, img_cols),
# the 3-D layout the Conv1D layer below is fed with.
X_train, X_test = (
    split.reshape(len(split), img_rows, img_cols)
    for split in (X_train, X_test)
)

In [60]:
# Confirm the trailing channel axis was added: (samples, steps, channels).
X_train.shape

(41197, 5495, 1)

In [63]:
# Hyperparameters used by the model cell below.
filters = 250       # number of Conv1D filters
kernel_size = 3     # width of each Conv1D kernel
batch_size = 32     # samples per gradient update passed to model.fit
epochs = 10         # passes over the training data passed to model.fit
hidden_dims = 250   # units in the hidden Dense layer

In [66]:
model = Sequential()

# 1-D convolution sliding over the encoded SMILES axis; each of the
# `filters` kernels spans `kernel_size` consecutive positions.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# Keep only the strongest response of each filter across all positions.
model.add(GlobalMaxPooling1D())

# Fully connected hidden layer with dropout for regularisation.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Regression head: a single *linear* unit. A sigmoid would confine the
# predictions to (0, 1), which cannot represent an unbounded energy target.
model.add(Dense(1))
model.add(Activation('linear'))

# MSE loss for regression; report mean absolute error, since classification
# accuracy is meaningless for a continuous target.
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['mean_absolute_error'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test, y_test))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 41197 samples, validate on 20292 samples
Epoch 1/10

KeyboardInterrupt: 

In [None]:
def create_cnn(width, height, depth, filters=(16, 32, 64), regress=False):
    '''
    Build a 2-D convolutional network with the Keras functional API.

    Parameters
    ----------
    width : int
        Width of the input volume.
    height : int
        Height of the input volume.
    depth : int
        Number of channels of the input volume.
    filters : tuple of int
        Number of filters for each successive
        CONV => RELU => BN => POOL stage. May be empty, in which
        case the input is fed straight into the dense layers.
    regress : bool
        If True, a single linear output unit is appended for
        regression; otherwise the model ends at the 4-unit ReLU layer.

    Returns
    -------
    model : Model
        The (uncompiled) Keras model.

    '''
    # initialize the input shape and channel dimension, assuming
    # TensorFlow/channels-last ordering
    inputShape = (height, width, depth)
    chanDim = -1

    # define the model input
    inputs = Input(shape=inputShape)

    # start from the input tensor so `x` is always defined, even when
    # `filters` is empty
    x = inputs

    # one CONV => RELU => BN => POOL stage per entry in `filters`
    for f in filters:
        x = Conv2D(f, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=chanDim)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)

    # flatten the volume, then FC => RELU => BN => DROPOUT
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation("relu")(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Dropout(0.5)(x)

    # apply another FC layer, this one to match the number of nodes
    # coming out of the MLP
    x = Dense(4)(x)
    x = Activation("relu")(x)

    # check to see if the regression node should be added
    if regress:
        x = Dense(1, activation="linear")(x)

    # construct and return the CNN
    return Model(inputs, x)