In [None]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


from collections import defaultdict

In [None]:
%%time
# Both CSVs live in the same Kaggle dataset folder. Resolve them from a single
# root so the two reads cannot drift apart (previously one path was absolute
# and the other was relative to the working directory).
DATA_DIR='/kaggle/input/champs-scalar-coupling'

train_df=pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
structures_df=pd.read_csv(os.path.join(DATA_DIR, 'structures.csv'))

# Keep atoms ordered by index within each molecule: the per-molecule coordinate
# lists built later in the notebook are looked up by atom position.
structures_df=structures_df.sort_values(['molecule_name', 'atom_index'])

train_df.head()

In [None]:
# Preview the first rows of the atom-level structure table.
structures_df.head(n=5)

In [None]:
# Headline cardinalities of the two tables, printed as "label value" lines.
summary_rows = [
    ('Number Of Molecules:', train_df['molecule_name'].nunique()),
    ('Number Of Coupling Types:', train_df['type'].nunique()),
    ('Number Of Atoms:', structures_df['atom'].nunique()),
    ('Number Of Atom Index:', structures_df['atom_index'].nunique()),
    ('Number Of Training Records:', len(train_df)),
]
for label, value in summary_rows:
    print(label, value)

In [None]:
# Number of training rows per coupling type.
train_df['type'].value_counts()

In [None]:
# Show one randomly chosen atom record.
structures_df.sample(n=1)

# Number of Atoms in the Molecule

In [None]:
# One row per molecule, holding how many atom records it has in structures_df.
atoms_per_molecule = structures_df.groupby('molecule_name')['atom_index'].count()
atom_count_df = atoms_per_molecule.rename('num_atoms').reset_index()
atom_count_df.head()

In [None]:
# Bar chart of how many molecules there are for each atom count.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(x='num_atoms', data=atom_count_df, ax=ax)

ax.set_title('Number Of Atoms (vs) Number Of Molecules')
ax.set_xlabel('Number Of Atoms')
ax.set_ylabel('Number Of Molecules')
plt.show()

In [None]:
# Box plot summarising the spread of atoms-per-molecule.
ax = sns.boxplot(x='num_atoms', data=atom_count_df)
ax.set_xlabel('Number Of Atoms')
plt.show()

Does the number of atoms in a molecule have any impact on the coupling constants?

In [None]:
# Per-molecule mean / min / max of the scalar coupling constant, produced with
# named aggregation so the output columns get their final names directly.
coupling_agg_df = (
    train_df
    .groupby('molecule_name')['scalar_coupling_constant']
    .agg(coupling_mean='mean', coupling_min='min', coupling_max='max')
    .reset_index()
)

In [None]:
# Preview the per-molecule coupling statistics.
coupling_agg_df.head(n=5)

In [None]:
# Attach the atom count to the coupling statistics. The join key is left
# implicit (the columns the two frames share), so the cell can be re-run safely.
coupling_agg_df = pd.merge(coupling_agg_df, atom_count_df)
coupling_agg_df.head()

In [None]:
# Distribution of the per-molecule mean coupling constant for each atom count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='num_atoms', y='coupling_mean', data=coupling_agg_df, ax=ax)

ax.set_title('Distribution of Mean Coupling Constant with Number Of Atoms')
ax.set_xlabel('Number Of Atoms')
ax.set_ylabel('Coupling Constant Mean')
plt.show()

In [None]:
# Distribution of the per-molecule minimum coupling constant for each atom count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='num_atoms', y='coupling_min', data=coupling_agg_df, ax=ax)

ax.set_title('Distribution of Minimum Coupling Constant with Number Of Atoms')
ax.set_xlabel('Number Of Atoms')
ax.set_ylabel('Coupling Constant Minimum')
plt.show()

In [None]:
# Distribution of the per-molecule maximum coupling constant for each atom count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='num_atoms', y='coupling_max', data=coupling_agg_df, ax=ax)

ax.set_title('Distribution of Maximum Coupling Constant with Number Of Atoms')
ax.set_xlabel('Number Of Atoms')
ax.set_ylabel('Coupling Constant Maximum')
plt.show()

1. The number of atoms in a molecule has an impact on the mean of the coupling constants of the atom pairs in that molecule.
2. The mean coupling decreases as the number of atoms in the molecule increases. This could be because some of the bonds are unstable.


# Impact of the Type of Coupling

In [None]:
# Preview the training table again before looking at coupling types.
train_df.head(n=5)

In [None]:
# Number of training rows for each coupling type.
train_df['type'].value_counts()

In [None]:
# Coupling constant distribution per coupling type. Rows are sorted by type
# first so the categories appear in alphabetical order on the x axis.
fig, ax = plt.subplots(figsize=(9, 5))
train_sorted_by_type = train_df.sort_values('type')
sns.boxplot(x='type', y='scalar_coupling_constant', data=train_sorted_by_type, ax=ax)

ax.set_xlabel('Coupling Type')
ax.set_ylabel('Scalar Coupling Constant')
plt.show()

In [None]:
# Standard deviation of the coupling constant within each coupling type.
type_agg_df = (
    train_df
    .groupby('type')[['scalar_coupling_constant']]
    .std()
    .reset_index()
)
type_agg_df.head()

In [None]:
# type_agg_df holds the per-type *standard deviation* of the coupling constant,
# so label the figure accordingly. The default y label would just repeat the
# column name and read as if raw coupling values were plotted.
fig, ax = plt.subplots()
sns.barplot(data=type_agg_df,
            x='type',
            y='scalar_coupling_constant',
            ax=ax
           )

ax.set_title('Spread of Scalar Coupling Constant by Coupling Type')
ax.set_xlabel('Coupling Type')
ax.set_ylabel('Std. Dev. of Scalar Coupling Constant')
plt.show()

The scalar coupling constant decreases going from the 1J coupling types to the 3J coupling types.

1JHC and 1JHN have high coupling constants.

3JHN --> very little variation in the coupling constants, with values around zero.

# Let us check the impact of the distance between the atoms

In [None]:
%%time
# Collapse each molecule's atoms into one row whose x / y / z cells are lists
# of coordinates, then attach those lists to every training pair.
xyz_lists_df = (
    structures_df
    .groupby('molecule_name')[['x', 'y', 'z']]
    .agg(list)
    .reset_index()
)
coordinate_df = pd.merge(train_df, xyz_lists_df)

coordinate_df.head()

In [None]:
def _get_coordinate(row, axis, index_column):
    """Look up a single coordinate for one atom of a pair row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``row[axis]`` as a sequence of coordinates and
        ``row[index_column]`` as the position to read from that sequence.
    axis : str
        Name of the coordinate column (``'x'``, ``'y'`` or ``'z'``).
    index_column : str
        Name of the column holding the atom position
        (``'atom_index_0'`` or ``'atom_index_1'``).

    Returns
    -------
    The element of ``row[axis]`` at position ``row[index_column]``.
    """
    atom_idx=row[index_column]
    return row[axis][atom_idx]


# The six public accessors below only differ in which axis and which end of
# the pair they read; they all delegate to the shared helper above.

def get_coordinate_x0(row):
    """Return the x coordinate of the atom referenced by ``atom_index_0``."""
    return _get_coordinate(row, 'x', 'atom_index_0')

def get_coordinate_x1(row):
    """Return the x coordinate of the atom referenced by ``atom_index_1``."""
    return _get_coordinate(row, 'x', 'atom_index_1')


def get_coordinate_y0(row):
    """Return the y coordinate of the atom referenced by ``atom_index_0``."""
    return _get_coordinate(row, 'y', 'atom_index_0')

def get_coordinate_y1(row):
    """Return the y coordinate of the atom referenced by ``atom_index_1``."""
    return _get_coordinate(row, 'y', 'atom_index_1')

def get_coordinate_z0(row):
    """Return the z coordinate of the atom referenced by ``atom_index_0``."""
    return _get_coordinate(row, 'z', 'atom_index_0')

def get_coordinate_z1(row):
    """Return the z coordinate of the atom referenced by ``atom_index_1``."""
    return _get_coordinate(row, 'z', 'atom_index_1')

In [None]:
%%time
# (new column, atom-index column, coordinate-list column, row accessor)
# Listed in the order the columns are appended to the frame.
coordinate_lookups = (
    ('x0', 'atom_index_0', 'x', get_coordinate_x0),
    ('x1', 'atom_index_1', 'x', get_coordinate_x1),
    ('y0', 'atom_index_0', 'y', get_coordinate_y0),
    ('y1', 'atom_index_1', 'y', get_coordinate_y1),
    ('z0', 'atom_index_0', 'z', get_coordinate_z0),
    ('z1', 'atom_index_1', 'z', get_coordinate_z1),
)

# Each accessor only needs the index column and the matching coordinate list,
# so only those two columns are handed to the row-wise apply.
for target_col, index_col, axis_col, lookup in coordinate_lookups:
    coordinate_df[target_col] = coordinate_df[[index_col, axis_col]].apply(lookup, axis=1)

coordinate_df.head()

In [None]:
%%time

# Endpoint coordinates of every atom pair as NumPy arrays, so the distance is
# computed in a single vectorised pass.
x0=coordinate_df['x0'].values
x1=coordinate_df['x1'].values

y0=coordinate_df['y0'].values
y1=coordinate_df['y1'].values

z0=coordinate_df['z0'].values
z1=coordinate_df['z1'].values

# Euclidean distance between the two atoms of each pair. The square root is
# required: without it this column holds the *squared* distance, while the
# plots that use it label the axis as a distance.
# NOTE(review): numeric distance ranges quoted in the notebook's written
# observations may have been read off the squared values; re-check them after
# re-running.
dist=np.sqrt((x0-x1)**2 + (y0-y1)**2 + (z0-z1)**2)

coordinate_df['dist']=dist
# Keep only the columns the remaining plots use.
coordinate_df=coordinate_df[['type', 'dist', 'scalar_coupling_constant']]


In [None]:
# Preview the reduced frame (type, dist, scalar_coupling_constant).
coordinate_df.head(n=5)

In [None]:
# Histogram of the `dist` column over all training pairs.
sns.displot(x='dist', data=coordinate_df)
plt.gca().set_xlabel('Distance Between Atoms')
plt.show()

In [None]:
# Preview the frame that feeds the per-type plots.
coordinate_df.head(n=5)

In [None]:
# Spread of `dist` within each coupling type.
sns.boxplot(x='type', y='dist', data=coordinate_df)

In [None]:
# Bivariate histogram of dist against the coupling constant, with one panel
# (and one colour) per coupling type.
joint_plot_options = dict(
    x='dist',
    y='scalar_coupling_constant',
    col='type',
    hue='type',
    facet_kws={},
)
sns.displot(data=coordinate_df, **joint_plot_options)

1. Distance, together with the coupling type, has an impact on the scalar coupling constant.
2. 1JHC, 1JHN -> have smaller distances and higher coupling constants.
3. 2JHH, 2JHN, 2JHC -> have distances in the range (2-8) and negative couplings.
4. 3JHC, 3JHN, 3JHH -> have very little variation in the coupling constant and higher variation in the distance between the atoms.