# Trial of our features

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/5e/49/b95c037b717b4ceadc76b6e164603471225c27052d1611d5a2e832757945/xgboost-0.90-py2.py3-none-win_amd64.whl (18.3MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.90


In [4]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn import metrics
from sklearn import linear_model
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from IPython.display import HTML

import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
import datetime
import json
import gc

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn import metrics


In [7]:
train = pd.read_csv(r"C:\Users\xamir\Documents\UROP19\DataSets\train.csv")
test = pd.read_csv(r"C:\Users\xamir\Documents\UROP19\DataSets\test.csv")

In [10]:
# libraries required
import pandas as pd
import numpy as np
from tqdm import tqdm


def reduce_memory(df, verbose=True):
    """
    :param: df - dataframe required to decrease the memory usage
    :param: verbose - show logging output if 'Ture'

    Goal: Reduce the memory usage by decreasing the type of the value if applicable

    Return: original dataframe with lower memory usage
    """

    numerics = ['int64', 'int16', 'int32', 'float64', 'float32', 'float16']
    start_memory = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_memory = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_memory, 100 * (start_memory - end_memory) / start_memory))

    return df


def struc1_merge(df1, df2, index):
    """
    :param: df1 - training data
    :param: df2 - structure data after being added electronegativity, radius, bond_lengths, hybridization, surrounding atoms (bonds),
            position info. (x, y, z)
    :param: index - atom_index in the coupling

    Goal: Merge original training dataframe with processed structure data to form a new dataframe for further training process

    Return: Merged dataframe
    """

    struc1_train_merge = pd.merge(df1, df2, how='left',
                                  left_on=['molecule_name', f'atom_index_{index}', f'atom_{index}', f'x_{index}', f'y_{index}', f'z_{index}'],
                                  right_on=['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z'])
    
    struc1_train_merge = struc1_train_merge.drop(['n_bonds'], axis=1)
    
    struc1_train_merge = struc1_train_merge.rename(columns={'EN': f'EN_{index}',
                                                            'RD': f'RD_{index}',
                                                            'bond_lengths': f'bond_lengths_{index}',
                                                            'hybri': f'hybri_{index}',
                                                            'bonds': f'bonds_{index}',
                                                            'pi_bonds': f'pi_bonds_{index}'})
    
    return struc1_train_merge


def n_bonds(structures):
    """
    :param: structures - structure.csv from local data
    
    Goal: Calculate the number of bonds for each molecule.

    Return: Structure dataframe with number of bonds (n_bonds) and lists consisting of indexes of connecting atoms (bonds)
    """

    i_atom = structures['atom_index'].values
    p = structures[['x', 'y', 'z']].values
    p_compare = p
    m = structures['molecule_name'].values
    m_compare = m
    r = structures['RD'].values
    r_compare = r

    source_row = np.arange(len(structures))
    max_atoms = 28

    bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
    bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

    print('Calculating bonds')

    for i in tqdm(range(max_atoms-1)):
        p_compare = np.roll(p_compare, -1, axis=0)
        m_compare = np.roll(m_compare, -1, axis=0)
        r_compare = np.roll(r_compare, -1, axis=0)

        mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
        dists = np.linalg.norm(p - p_compare, axis=1) * mask
        r_bond = r + r_compare

        bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)

        source_row = source_row
        target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
        target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row

        source_atom = i_atom
        target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
        target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col

        bonds[(source_row, target_atom)] = bond
        bonds[(target_row, source_atom)] = bond
        bond_dists[(source_row, target_atom)] = dists
        bond_dists[(target_row, source_atom)] = dists

    bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
    bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
    bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
    bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

    print('Counting and condensing bonds')

    bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
    bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
    n_bonds = [len(x) for x in bonds_numeric]

    #bond_data = {'bond_' + str(i):col for i, col in enumerate(np.transpose(bonds))}
    #bond_data.update({'bonds_numeric':bonds_numeric, 'n_bonds':n_bonds})

    bond_data = {'bonds':bonds_numeric, 'n_bonds':n_bonds, 'bond_lengths':bond_lengths}
    bond_df = pd.DataFrame(bond_data)
    structures = structures.join(bond_df)
    
    return structures


def struc_merge(df, struc, index):
    """
    :param: df - The dataframe to be merged with structure data
    :param: struc - structure data
    :param: index - index of atom in the coupling

    Goal: Merger two dataframe.

    Return: a new dataframe after merged
    """

    # Merge train and structures data based on the atom index
    df_struc = pd.merge(df, struc, how='left', 
                        left_on=['molecule_name', f'atom_index_{index}'], 
                        right_on=['molecule_name', 'atom_index'])

    # Drop the atom index column
    df_struc = df_struc.drop('atom_index', axis=1)

    # Rename the columns
    df_struc = df_struc.rename(columns={'atom': f'atom_{index}',
                                        'x': f'x_{index}',
                                        'y': f'y_{index}',
                                        'z': f'z_{index}'})

    return df_struc


def distance(df, structures):
    """
    :param: df - Data that need to calculate distance

    Goal: Calculate the distance between two spins

    Return: DataFrame with distance added
    """

    # Make a copy of  the data for avoiding changing the original data
    df_copy = df.copy()

    # Merge data
    df_copy = struc_merge(df_copy, structures, 0)
    df_copy = struc_merge(df_copy, structures, 1)

    # This block for calculating the distance between two spins
    df_p_0 = df_copy[['x_0', 'y_0', 'z_0']].values
    df_p_1 = df_copy[['x_1', 'y_1', 'z_1']].values

    df_copy['distance'] = np.linalg.norm(df_p_0 - df_p_1, axis=1)

    return df_copy


def hybridization(structures):
    """
    :param: structures - structures data

    Goal: Calculate each hybridization in the structures data

    Return: structure data with hybridization column added
    """
    
    print('Calculating hybridization....')
    
    # 'C' has different types of hybridizations with different number of bonds.
    # '4' for four bonds
    hybri_dict = {'C': {'4': 3, '3': 2, '2': 2, '1': 0},
                  'N': {'4': 0, '3': 3, '2': 2, '1': 1},
                  'O': {'2': 2, '1': 1},
                  'H': {'1': 0},
                  'F': {'1': 0}}
                # 3 bonds- sp3, 2 - sp2, 1 - sp
    
    hybri = []

    for i in tqdm(range(len(structures))):
        hybri.append(hybri_dict[structures.loc[i, 'atom']][str(structures.loc[i, 'n_bonds'])])
    
    structures['hybri'] = hybri

    return structures


def pi_bonds(structures):
    """
    :param: structures - structures data

    Goal: Calculate the number of pi_bonds for each atom

    Return: structures with pi_bonds column added
    """
    
    print('Calculating pi bonds....')
    
    # The number of atoms connecting to an atom is related with the number of pi bonds.
    # Eg: In 'C', if there are 4 bonds around, then the number of pi bonds is 0.
    pi_bond = {'C': {'4': 0, '2': 2, '3': 1},
               'N': {'4': 0, '3': 0, '2': 1, '1': 2},
               'O': {'1': 1, '2': 0},
               'H': {'1': 0},
               'F': {'1': 0}}

    pi_bond_ = []

    for i in tqdm(range(len(structures))):
        pi_bond_.append(pi_bond[structures.loc[i, 'atom']][str(structures.loc[i, 'n_bonds'])])

    structures['pi_bonds'] = pi_bond_

    return structures


def electronegativity(atom_name, structures):
    """
    :param: atom_name - list or np.ndarray consisting of name of atoms
    :param: structures - structures data

    Goal: Assign an electrinegativity for each atom

    Return: structures with electrineativity column added
    """
    
    electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}
    en_ = [electronegativity[x] for x in tqdm(atom_name)]

    structures['EN'] = en_

    return structures


def radius(atom_name, structures):
    """
    :param: atom_name - list or np.ndarray consisting of name of atoms
    :param: structures - structures data

    Goal: Assign an radius for each atom

    Return: structures with radius column added
    """
    
    atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71} # Without fudge factor

    fudge_factor = 0.05
    atomic_radius = {k:v + fudge_factor for k,v in atomic_radius.items()}
    rd_ = [atomic_radius[x] for x in atom_name]

    structures['RD'] = rd_

    return structures


def map_atom_info(df_1,df_2, atom_idx):
    """
    :param: df_1 - train data
    :param: df_2 - structure data
    :param: atom_ind - atom index in coupling

    Goal: Merge two dataframe for further using

    Return: A new dataframe after merged
    """

    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)

    return df


def create_closest(df_train):
    df_temp=df_train.loc[:,["molecule_name","atom_index_0","atom_index_1","distance","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})

    df_temp=pd.concat(objs=[df_temp,df_temp_],axis=0)

    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["distance"]]

    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'distance': 'distance_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest'})

    for atom_idx in [0,1]:
        df_train = map_atom_info(df_train,df_temp, atom_idx)
        df_train = df_train.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                            'distance_closest': f'distance_closest_{atom_idx}',
                                            'x_closest': f'x_closest_{atom_idx}',
                                            'y_closest': f'y_closest_{atom_idx}',
                                            'z_closest': f'z_closest_{atom_idx}'})
    return df_train


def add_cos_features(df):
    """
    :param: df - dataframe containing necessary data for calculating the cosine value

    Goal: Calculating cosine value

    Return: dataframe with cosine data added
    """

    # The modulus of the 
    df["distance_0"]=((df['x_0']-df['x_closest_0'])**2+(df['y_0']-df['y_closest_0'])**2+(df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    
    # Unit vector along each direction
    df["vec_0_x"]=(df['x_0']-df['x_closest_0'])/df["distance_0"]
    df["vec_0_y"]=(df['y_0']-df['y_closest_0'])/df["distance_0"]
    df["vec_0_z"]=(df['z_0']-df['z_closest_0'])/df["distance_0"]
    df["vec_1_x"]=(df['x_1']-df['x_closest_1'])/df["distance_1"]
    df["vec_1_y"]=(df['y_1']-df['y_closest_1'])/df["distance_1"]
    df["vec_1_z"]=(df['z_1']-df['z_closest_1'])/df["distance_1"]
    
    # Ratio between the difference along each direction to the distance
    df["vec_x"]=(df['x_1']-df['x_0'])/df["distance"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["distance"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["distance"]

    # Cosine of each component
    df["cos_0_1"]=df["vec_0_x"]*df["vec_1_x"]+df["vec_0_y"]*df["vec_1_y"]+df["vec_0_z"]*df["vec_1_z"]
    df["cos_0"]=df["vec_0_x"]*df["vec_x"]+df["vec_0_y"]*df["vec_y"]+df["vec_0_z"]*df["vec_z"]
    df["cos_1"]=df["vec_1_x"]*df["vec_x"]+df["vec_1_y"]*df["vec_y"]+df["vec_1_z"]*df["vec_z"]

    df=df.drop(['vec_0_x','vec_0_y','vec_0_z','vec_1_x','vec_1_y','vec_1_z','vec_x','vec_y','vec_z'], axis=1)

    # Angle for each component
    df["Angle"] = df["cos_0_1"].apply(lambda x: np.arccos(x)) * 180 / np.pi
    df["cos_0"] = df["cos_0"].apply(lambda x: np.arccos(x)) * 180 / np.pi
    df["cos_1"] = df["cos_1"].apply(lambda x: np.arccos(x)) * 180 / np.pi

    return df


# File paths
train_path = r"C:\Users\xamir\Documents\UROP19\DataSets\train.csv"
structures_path = r"C:\Users\xamir\Documents\UROP19\DataSets\structures.csv"
test_path = r"C:\Users\xamir\Documents\UROP19\DataSets\test.csv"

# read data from local address
train_df_full = pd.read_csv(train_path, index_col=0)
structures_df_full = pd.read_csv(structures_path, dtype={'atom_index': np.int8})
test_df_full = pd.read_csv(test_path)

# Add distance feature to the test and trin data
train_df = distance(train_df_full, structures_df_full)
test_df = distance(test_df_full, structures_df_full)

# ndarray with names of each atom in the structures csv
atom = structures_df_full['atom'].values

# Add electronegativity and radius colmun to the structures csv
structures = electronegativity(atom, structures_df_full)
structures = radius(atom, structures)

# Add number of bonds and connecting atoms columns
structures = n_bonds(structures)

# Add hybridization column
structures = hybridization(structures)

# Add pi_bonds column
structures = pi_bonds(structures)

# Merge structures data and train data
struc_train = struc1_merge(train_df, structures, 0)
struc_train = struc1_merge(struc_train, structures, 1)

struc_train = struc_train.drop(['atom_index_x', 'atom_x', 'x_x', 'y_x', 'z_x',
                                'atom_index_y', 'atom_y','x_y', 'y_y', 'z_y'], axis=1)

# Add bond angle column
struc_train = create_closest(struc_train)
struc_train = add_cos_features(struc_train)

# The list of type for further training
type_list = list(struc_train['type'].unique())

# Drop the target column for training
y = struc_train['scalar_coupling_constant']
struc_train = struc_train.drop(['scalar_coupling_constant'], axis=1)

# Select features for training
X = struc_train[['molecule_name',
                           'type',
                           'distance',
                           'EN_0',
                           'RD_0',
                           'hybri_0',
                           'pi_bonds_0',
                           'EN_1',
                           'RD_1',
                           'hybri_1',
                           'pi_bonds_1',
                           'Angle']]

struc_test = struc1_merge(test_df, structures, 0)
struc_test = struc1_merge(struc_test, structures, 1)

struc_test = struc_test.drop(['atom_index_x', 'atom_x', 'x_x', 'y_x', 'z_x',
                                'atom_index_y', 'atom_y','x_y', 'y_y', 'z_y'], axis=1)


struc_test = create_closest(struc_test)
struc_test = add_cos_features(struc_test)

# The list of type for further training
type_list = list(struc_test['type'].unique())


# Select features for training
X_test = struc_test[['molecule_name',
                           'type',
                           'distance',
                           'EN_0',
                           'RD_0',
                           'hybri_0',
                           'pi_bonds_0',
                           'EN_1',
                           'RD_1',
                           'hybri_1',
                           'pi_bonds_1',
                           'Angle']]




100%|██████████| 2358657/2358657 [00:00<00:00, 3519178.47it/s]


Calculating bonds


100%|██████████| 27/27 [00:08<00:00,  3.15it/s]


Counting and condensing bonds


100%|██████████| 2358657/2358657 [00:11<00:00, 199912.12it/s]
100%|██████████| 2358657/2358657 [00:14<00:00, 157978.38it/s]


Calculating hybridization....


100%|██████████| 2358657/2358657 [01:32<00:00, 25403.29it/s]


Calculating pi bonds....


100%|██████████| 2358657/2358657 [01:33<00:00, 25194.82it/s]


In [11]:
X.head()

Unnamed: 0,molecule_name,type,distance,EN_0,RD_0,hybri_0,pi_bonds_0,EN_1,RD_1,hybri_1,pi_bonds_1,Angle
0,dsgdb9nsd_000001,1JHC,1.0919530596119,2.2,0.43,0,0,2.55,0.82,3,0,70.52868197808704
1,dsgdb9nsd_000001,2JHH,1.783119756038801,2.2,0.43,0,0,2.2,0.43,0,0,109.46840816718267
2,dsgdb9nsd_000001,2JHH,1.783147496403011,2.2,0.43,0,0,2.2,0.43,0,0,109.47131802191296
3,dsgdb9nsd_000001,2JHH,1.783156685329616,2.2,0.43,0,0,2.2,0.43,0,0,109.47206691423882
4,dsgdb9nsd_000001,1JHC,1.091951618581363,2.2,0.43,0,0,2.55,0.82,3,0,70.52765135814023


In [12]:
X_test.head()

Unnamed: 0,molecule_name,type,distance,EN_0,RD_0,hybri_0,pi_bonds_0,EN_1,RD_1,hybri_1,pi_bonds_1,Angle
0,dsgdb9nsd_000004,2JHC,2.2611780778,2.2,0.43,0,0,2.55,0.82,2,2,0.0
1,dsgdb9nsd_000004,1JHC,1.0620990942,2.2,0.43,0,0,2.55,0.82,2,2,180.0
2,dsgdb9nsd_000004,3JHH,3.323277172,2.2,0.43,0,0,2.2,0.43,0,0,180.0
3,dsgdb9nsd_000004,1JHC,1.0620990942,2.2,0.43,0,0,2.55,0.82,2,2,180.0
4,dsgdb9nsd_000004,2JHC,2.2611780778,2.2,0.43,0,0,2.55,0.82,2,2,0.0


In [22]:
X = X.drop(["molecule_name", "type"], axis = 1)

In [23]:
X_test = X_test.drop(["molecule_name", "type"], axis = 1)

In [24]:

def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Fast metric computation for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [25]:

def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    
    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': metrics.mean_squared_error}
                    }

        result_dict = {}
    
    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    
    # averaged predictions on train data
    prediction = np.zeros(len(X_test))
    
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict(X_test).reshape(-1,)
        
        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000,  eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred    
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
        
    return result_dict
    



IndentationError: unexpected indent (<ipython-input-25-2dd696f68327>, line 33)

In [15]:
params = {'num_leaves': 50,
          'min_child_samples': 79,
          'min_data_in_leaf' : 100,
          'objective': 'regression',
          'max_depth': 9,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

In [26]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=False, random_state=0)

In [27]:
y.head()

0    84.807599999999994
1   -11.257000000000000
2   -11.254799999999999
3   -11.254300000000001
4    84.807400000000001
Name: scalar_coupling_constant, dtype: float64

In [28]:
train_model_regression(X, X_test, y, params=params, folds = folds, model_type = 'lgb', eval_metric = 'mae', plot_feature_importance=False,
                      verbose = 500, early_stopping_rounds=200, n_estimators = 50000)

Fold 1 started at Sun Aug  4 15:37:04 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 1.98878	valid_1's l1: 2.0495
[1000]	training's l1: 1.9779	valid_1's l1: 2.03588
[1500]	training's l1: 1.97299	valid_1's l1: 2.0299
[2000]	training's l1: 1.97016	valid_1's l1: 2.02697
[2500]	training's l1: 1.96815	valid_1's l1: 2.02499
[3000]	training's l1: 1.96649	valid_1's l1: 2.02312
[3500]	training's l1: 1.96543	valid_1's l1: 2.02204
[4000]	training's l1: 1.96433	valid_1's l1: 2.02087
[4500]	training's l1: 1.9635	valid_1's l1: 2.02018
[5000]	training's l1: 1.96268	valid_1's l1: 2.01933
[5500]	training's l1: 1.96197	valid_1's l1: 2.01858
[6000]	training's l1: 1.9613	valid_1's l1: 2.01779
[6500]	training's l1: 1.96086	valid_1's l1: 2.01731
[7000]	training's l1: 1.96045	valid_1's l1: 2.01693
[7500]	training's l1: 1.95996	valid_1's l1: 2.01651
[8000]	training's l1: 1.95943	valid_1's l1: 2.01608
Early stopping, best iteration is:
[8258]	training's l1: 1.95908	val

{'oof': array([ 89.07343006, -10.37144516, -10.37144516, ...,   2.69567762,
          2.42718641, 123.77860968]),
 'prediction': array([ 11.45771247, 196.160502  ,   6.22213519, ...,   4.01569921,
          3.05420496, 122.81882823]),
 'scores': [2.0155685803611187,
  2.208135206984266,
  2.0629710168716135,
  1.9350599810733726,
  1.8661452631891104]}

In [29]:
sub['scalar_coupling_constant'] = X_short_test['prediction']
sub.to_csv('submission_type.csv', index=False)
sub.head()

NameError: name 'X_short_test' is not defined