# FoNS Datathon 2021 

### Looping through all training data to get the best outcome for a given model.

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import *

In [89]:
# Training CSVs that are later handed to the preprocess_* functions,
# read in the same order as before.
train_descriptors, train_mord3d, train_morgan, train_rdk = map(
    pd.read_csv,
    (
        "train_descriptors.csv",
        "train_mord3d.csv",
        "train_morgan.csv",
        "train_rdk.csv",
    ),
)

# Remaining training CSVs; train_crystals supplies the prediction target column.
train_crystals, train_distances, train_centroid_distances = map(
    pd.read_csv,
    (
        "train_crystals.csv",
        "train_distances.csv",
        "train_centroid_distances.csv",
    ),
)

In [90]:
# Test-set counterparts of the four training feature tables,
# read in the same order as before.
test_descriptors, test_mord3d, test_morgan, test_rdk = map(
    pd.read_csv,
    (
        "test_descriptors.csv",
        "test_mord3d.csv",
        "test_morgan.csv",
        "test_rdk.csv",
    ),
)

In [91]:
def preprocess_rdk(train_data, test_data, target):
    """Prepare the rdk feature tables and split off a validation set.

    Parameters
    ----------
    train_data : DataFrame whose first column is discarded (selected by
        position); every remaining column is used as a feature.
        # assumes the remaining columns contain no NA values - TODO confirm
    test_data : DataFrame that must contain every retained training column.
    target : values to predict, passed to the split together with the
        training features.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid, train_data_full, test_data_full)
    """
    # Everything except the leading column is treated as a feature.
    train_data_full = train_data.iloc[:, 1:]

    # Restrict the test table to the training feature columns, same order.
    feature_names = train_data_full.columns
    test_data_full = test_data[feature_names]

    # Fixed seed so the train/validation partition is repeatable.
    split_parts = model_selection.train_test_split(
        train_data_full, target, random_state=0
    )

    return (*split_parts, train_data_full, test_data_full)

In [92]:
def preprocess_morgan(train_data, test_data, target):
    """Prepare the morgan feature tables and split off a validation set.

    The first column of ``train_data`` is discarded by position and the
    test table is reduced to the columns that remain.
    # assumes the remaining columns contain no NA values - TODO confirm

    Parameters
    ----------
    train_data : training DataFrame; column 0 is not used as a feature.
    test_data : DataFrame that must contain every retained training column.
    target : values to predict, split alongside the training features.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid, train_data_full, test_data_full)
    """
    train_data_full = train_data.iloc[:, 1:]
    test_data_full = test_data[train_data_full.columns]

    # random_state=0 keeps the train/validation partition repeatable.
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        train_data_full, target, random_state=0
    )

    outputs = (X_train, X_valid, y_train, y_valid)
    return outputs + (train_data_full, test_data_full)

In [93]:
def preprocess_mord3d(train_data, test_data, target, pca=False):
    """Clean the mord3d feature tables, optionally apply PCA, and split.

    Unlike the previous version, ``train_data`` is NOT modified: constant
    columns are removed on a copy instead of being dropped in place.

    Parameters
    ----------
    train_data : training DataFrame. After constant columns are removed, the
        first 2 and last 4 columns are discarded by position.
        # presumably those are identifiers / non-numerical data - TODO confirm
    test_data : DataFrame that must contain every retained training column.
    target : values to predict, split alongside the training features.
    pca : when truthy, standardise the features and project them onto the
        principal components that explain 95% of the variance. The returned
        feature sets are then NumPy arrays rather than DataFrames.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid, train_data_full, test_data_full)
    """
    # Columns holding a single distinct value (NaN counts as a value, as it
    # did with ``Series.unique``) cannot contribute to a model. Select the
    # others into a new frame so the caller's DataFrame is left untouched.
    varies = (train_data.nunique(dropna=False) != 1).to_numpy()
    informative = train_data.loc[:, varies]

    # Positional trim of the non-feature columns, then drop any column that
    # still contains an NA value.
    train_data_full = informative.iloc[:, 2:-4].dropna(axis=1, how="any")

    # Make test set have same columns as training set
    test_data_full = test_data[train_data_full.columns]

    if pca:
        # NOTE(review): the scaler and PCA are fitted on the full training
        # table before the split, so the validation rows influence the
        # transform - confirm this is acceptable for the reported scores.
        scaler_for_PCA = preprocessing.StandardScaler()
        train_PCA = decomposition.PCA(n_components=.95)
        train_data_full = train_PCA.fit_transform(
            scaler_for_PCA.fit_transform(train_data_full)
        )
        test_data_full = train_PCA.transform(
            scaler_for_PCA.transform(test_data_full)
        )

    # Create training and validation sets (same call for both code paths)
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        train_data_full, target, random_state=0
    )

    return X_train, X_valid, y_train, y_valid, train_data_full, test_data_full
    

In [94]:
def preprocess_descriptors(train_data, test_data, target, pca=False):
    """Clean the descriptor feature tables, optionally apply PCA, and split.

    Unlike the previous version, ``train_data`` is NOT modified: constant
    columns are removed on a copy instead of being dropped in place.

    Parameters
    ----------
    train_data : training DataFrame. After constant columns are removed, the
        first 3 and last 2 columns are discarded by position.
        # the original comment says the last 2 columns are InchiKey and
        # SMILES; the first 3 are presumably identifiers - TODO confirm
    test_data : DataFrame that must contain every retained training column.
    target : values to predict, split alongside the training features.
    pca : when truthy, standardise the features and project them onto the
        principal components that explain 95% of the variance. The returned
        feature sets are then NumPy arrays rather than DataFrames.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid, train_data_full, test_data_full)
    """
    # Columns holding a single distinct value (NaN counts as a value, as it
    # did with ``Series.unique``) cannot contribute to a model. Select the
    # others into a new frame so the caller's DataFrame is left untouched.
    varies = (train_data.nunique(dropna=False) != 1).to_numpy()
    informative = train_data.loc[:, varies]

    # Positional trim of the non-feature columns, then drop any column that
    # still contains an NA value.
    train_data_full = informative.iloc[:, 3:-2].dropna(axis=1, how="any")

    # Make test set have same columns as training set
    test_data_full = test_data[train_data_full.columns]

    if pca:
        # NOTE(review): the scaler and PCA are fitted on the full training
        # table before the split, so the validation rows influence the
        # transform - confirm this is acceptable for the reported scores.
        scaler_for_PCA = preprocessing.StandardScaler()
        train_PCA = decomposition.PCA(n_components=.95)
        train_data_full = train_PCA.fit_transform(
            scaler_for_PCA.fit_transform(train_data_full)
        )
        test_data_full = train_PCA.transform(
            scaler_for_PCA.transform(test_data_full)
        )

    # Create training and validation sets (same call for both code paths)
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        train_data_full, target, random_state=0
    )

    return X_train, X_valid, y_train, y_valid, train_data_full, test_data_full
    

In [95]:
# Name of the train_crystals column the models are trained to predict,
# and the corresponding column of values.
prediction_target = "calculated_density"
target = train_crystals[prediction_target]

In [96]:
# Run every feature table through its matching preprocessing routine.
# PCA is requested for the descriptors and mord3d tables only.
train_descriptors_list = preprocess_descriptors(
    train_descriptors, test_descriptors, target, pca=True
)

train_mord3d_list = preprocess_mord3d(
    train_mord3d, test_mord3d, target, pca=True
)

train_morgan_list = preprocess_morgan(train_morgan, test_morgan, target)

train_rdk_list = preprocess_rdk(train_rdk, test_rdk, target)

In [97]:
# Collect the processed datasets; the order here is the order of the scores
# returned by loop_descriptors.
to_loop = [
    train_descriptors_list,
    train_mord3d_list,
    train_morgan_list,
    train_rdk_list,
]

In [98]:
def loop_descriptors(list_of_descriptor_sets, cv_folds):
    """Cross-validate one random forest on each processed dataset.

    Parameters
    ----------
    list_of_descriptor_sets : iterable of indexable datasets, where element 0
        holds the training features and element 2 the training target (the
        layout returned by the preprocess_* functions).
    cv_folds : value forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    list of float
        Mean negative mean-absolute-error per dataset, in input order.
        Values closer to zero indicate a better fit.
    """
    # scikit-learn documents max_depth as an int (or None) and recent
    # releases reject non-integer values, so the integer part of the former
    # setting (27.3442) is used.
    regressor = ensemble.RandomForestRegressor(
        n_estimators=190, max_depth=27, n_jobs=-1
    )

    # cross_val_score fits its own clones of the estimator for every fold, so
    # no separate fit is needed (the old extra fit was discarded unused).
    cv_scores = []
    for dataset in list_of_descriptor_sets:
        features, labels = dataset[0], dataset[2]
        fold_scores = model_selection.cross_val_score(
            regressor,
            features,
            labels,
            n_jobs=-1,
            scoring="neg_mean_absolute_error",
            cv=cv_folds,
        )
        cv_scores.append(fold_scores.mean())

    return cv_scores

In [99]:
# Score every processed dataset with 3-fold cross-validation.
loop_descriptors(to_loop, cv_folds=3)

[-0.06129466782960302,
 -0.08131751394105391,
 -0.08473092666708561,
 -0.1278201188152448]