In [2]:
import pandas as pd
import tensorflow as tf
import time
from scipy.stats import pearsonr
import pyarrow
import gc

## Reading in the data

In [2]:
# Load the training set from parquet and report how long the read took.
# Wall-clock timestamps are taken immediately before and after the read.
start = time.time()
df = pd.read_parquet('Data/train_low_mem.parquet', engine='pyarrow')
end = time.time()
print('Time to read in the data: ', end - start)
# Print the frame summary (index range, column count, dtypes, memory usage).
df.info()

Time to read in the data:  17.1095929145813
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141410 entries, 0 to 3141409
Columns: 304 entries, row_id to f_299
dtypes: float32(301), object(1), uint16(2)
memory usage: 3.6+ GB


## Splitting the data into training and testing sets

In [3]:
%%time
# Model inputs are every column after the first four
# (presumably the id columns and the target -- TODO confirm against the schema).
col_loc_list = list(df.columns[4:])

# The first 80% of the rows, in file order, form the training partition.
training_data = df.iloc[:int(len(df) * 0.8)]
training_params = training_data.loc[:, col_loc_list]
training_target = training_data['target']

# The intermediate slice is no longer needed once features/target are extracted.
del training_data

CPU times: user 1.33 s, sys: 967 ms, total: 2.3 s
Wall time: 2.29 s


In [4]:
%%time
# The remaining 20% of the rows, in file order, form the held-out partition.
testing_data = df.iloc[int(len(df) * 0.8):]
testing_params = testing_data.loc[:, col_loc_list]
testing_target = testing_data['target']

# Drop the intermediate slice and the full frame, then force a collection pass
# so their memory can be reclaimed before training starts.
del testing_data, df
gc.collect()

CPU times: user 546 ms, sys: 607 ms, total: 1.15 s
Wall time: 1.17 s


20

## Neural Network

In [5]:
def pearson_r_defined(y_true, y_pred):
    """Pearson-correlation loss: returns ``1 - r`` between targets and predictions.

    Implemented entirely with TensorFlow ops so that it works under eager
    execution and inside ``tf.function`` graphs, and so that gradients can
    flow through it when it is passed as ``loss=`` to ``model.compile``.
    (``Tensor.eval()`` needs a TF1 default session, and SciPy's ``pearsonr``
    is not differentiable, so neither can be used here.)

    Args:
        y_true: Tensor (or array-like) of ground-truth values; flattened to 1-D.
        y_pred: Tensor (or array-like) of predicted values with the same
            number of elements as ``y_true``; flattened to 1-D.

    Returns:
        Scalar float32 tensor equal to ``1 - r``: 0 for perfect positive
        correlation, 1 for no correlation, 2 for perfect negative correlation.
        If either input is constant the correlation is treated as 0 (result 1)
        instead of producing NaN.
    """
    # Flatten so an (n, 1) prediction column lines up with an (n,) target vector.
    y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
    y_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1])

    true_centered = y_true - tf.reduce_mean(y_true)
    pred_centered = y_pred - tf.reduce_mean(y_pred)

    covariance = tf.reduce_sum(true_centered * pred_centered)
    scale = tf.sqrt(
        tf.reduce_sum(tf.square(true_centered))
        * tf.reduce_sum(tf.square(pred_centered))
    )

    # epsilon keeps the division finite when one of the inputs has zero variance.
    return 1.0 - covariance / (scale + tf.keras.backend.epsilon())

In [9]:
# Fully connected regression network.
# Every hidden stage is Dense(relu) -> BatchNormalization -> Dropout(0.2);
# the stage widths run from 1024 down to 16, followed by one linear output unit.
model = tf.keras.models.Sequential(
    [
        layer
        for units in (1024, 512, 512, 512, 512, 256, 128, 64, 32, 16)
        for layer in (
            tf.keras.layers.Dense(units, activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.2),
        )
    ]
    + [tf.keras.layers.Dense(1)]
)

# Adam optimizer with mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [10]:
# Train one epoch at a time and score the held-out set after each epoch,
# keeping the per-epoch Pearson coefficients in `pearson_list`.
pearson_list = []

# Convert the pandas objects to NumPy once; they do not change between epochs,
# and fit() and predict() are then fed the same kind of input.
x_train, y_train = training_params.values, training_target.values
x_test, y_test = testing_params.values, testing_target.values

for i in range(0, 1000):
    model.fit(x_train, y_train, epochs=1, verbose=1, batch_size=256)

    # The model ends in a single-unit Dense layer, so predict() yields an
    # (n_samples, 1) column. Flatten it so pearsonr compares two 1-D vectors
    # and returns a scalar coefficient rather than a one-element array.
    predictions = model.predict(x_test).ravel()
    coef = pearsonr(y_test, predictions)[0]

    # Record the score so the history is available after the loop
    # (including after a manual interrupt).
    pearson_list.append(coef)
    print("Pearson Correlation Coefficient:", coef)

Pearson Correlation Coefficient: [0.12259781169859874]


KeyboardInterrupt: 

In [11]:
# Drop the notebook's reference to the trained model and force a collection
# pass; the cell's displayed value is whatever gc.collect() returns.
# NOTE(review): Keras may keep additional global state alive; calling
# tf.keras.backend.clear_session() may be needed to fully release it -- confirm.
del model
gc.collect()

5292