# Importer les bibliothèques

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib widget

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, learning_curve, train_test_split
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, classification_report

# Analyser les données

In [None]:
# Read the four CSV inputs, in the same order as before, into their
# module-level frames.
train, test, sub, structures = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/train.csv",
        "./data/test.csv",
        "./data/sample_submission.csv",
        "./data/structures.csv",
    )
)

In [None]:
# Number of distinct molecules in each split.
print("Nombre des molécules (train): ", train.molecule_name.nunique())
print("************************\n")

print("Nombre des molécules (test): ", test.molecule_name.nunique())
print("************************\n")

# Distinct element symbols and how often each occurs in `structures`.
print("Atome : ", structures.atom.nunique())
print(structures['atom'].value_counts(), "\n************************\n")

# Distinct coupling types and how often each occurs in the training pairs.
print("Couplage : ", train.type.nunique())
print(train['type'].value_counts(), "\n************************")

# Calculer le nombre des atomes dans une molécule

In [None]:
# Count the rows of `structures` per molecule, then expose that count as a
# regular `num_atom` column next to `molecule_name`.
atoms_per_molecule = structures.groupby(['molecule_name']).size()
structures1 = atoms_per_molecule.reset_index(name="num_atom")

In [None]:
# One indicator column per element symbol, joined back to the molecule name on
# the row index and summed per molecule: the result is the number of atoms of
# each element in every molecule.
atom_dummies = pd.get_dummies(structures["atom"])
structures2 = (
    pd.merge(structures["molecule_name"], atom_dummies, left_index=True, right_index=True)
    .groupby("molecule_name")
    .sum()
    .reset_index()
    .rename(columns={'C': 'num_C', 'F': 'num_F', 'H': 'num_H', 'N': 'num_N', 'O': 'num_O'})
)

# Calculer la distance entre les atomes

In [None]:
def map_atom_info(df, atom_idx):
    """Attach the element and coordinates of one atom of each pair to `df`.

    Left-joins the module-level `structures` frame on
    (`molecule_name`, `atom_index_<atom_idx>`), drops the redundant
    `atom_index` join key and suffixes the joined `atom`, `x`, `y`, `z`
    columns with `_<atom_idx>`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `molecule_name` and `atom_index_<atom_idx>`.
    atom_idx : int
        Which atom of the pair to describe (the notebook calls it with 0 and 1).

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    joined = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    suffixed = {name: f'{name}_{atom_idx}' for name in ('atom', 'x', 'y', 'z')}
    return joined.drop(columns='atom_index').rename(columns=suffixed)

# Enrich both splits the same way: per-molecule atom counts, per-element
# counts, then element/coordinates of atom 0 and atom 1; finally drop `id`.
# NOTE: this cell rebinds `train`/`test` from themselves, so it is meant to be
# run once per kernel session.
train = (
    train
    .merge(structures1, left_on='molecule_name', right_on='molecule_name')
    .merge(structures2, left_on='molecule_name', right_on='molecule_name')
    .pipe(map_atom_info, 0)
    .pipe(map_atom_info, 1)
    .drop(columns="id")
)

test = (
    test
    .merge(structures1, left_on='molecule_name', right_on='molecule_name')
    .merge(structures2, left_on='molecule_name', right_on='molecule_name')
    .pipe(map_atom_info, 0)
    .pipe(map_atom_info, 1)
    .drop(columns="id")
)

In [None]:
# Coordinates of atom 0 and atom 1 of every pair as three-column NumPy arrays.
coords_0 = ['x_0', 'y_0', 'z_0']
coords_1 = ['x_1', 'y_1', 'z_1']
train_p_0, train_p_1 = train[coords_0].to_numpy(), train[coords_1].to_numpy()
test_p_0, test_p_1 = test[coords_0].to_numpy(), test[coords_1].to_numpy()


In [None]:
# Add the same four distance features to both splits.
for frame, p_0, p_1 in ((train, train_p_0, train_p_1), (test, test_p_0, test_p_1)):
    # Euclidean distance between the two atoms of each pair.
    frame['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    # Absolute separation along each axis.
    frame['dist_x'] = (frame['x_0'] - frame['x_1']).abs()
    frame['dist_y'] = (frame['y_0'] - frame['y_1']).abs()
    frame['dist_z'] = (frame['z_0'] - frame['z_1']).abs()

# Label Encoder

In [None]:
# Occurrences of each coupling type among the training pairs.
train["type"].value_counts()

In [None]:
# Element of the first atom of each pair: distinct values and their counts.
train["atom_0"].value_counts()

In [None]:
# Element of the second atom of each pair: distinct values and their counts.
train["atom_1"].value_counts()

In [None]:
# Coupling-type names, indexed by their encoded integer value.
# LabelEncoder numbers its classes in sorted order, so this list must be sorted
# too: later cells pair position `i` of this list with rows whose encoded
# `type == i` (plot titles, legends, tick labels). The previous hand-written
# order put '2JHH' at index 0 while the encoder assigns 0 to '1JHC'.
# NOTE(review): the name shadows the builtin `type`; it is kept because other
# cells reference it.
type = sorted(['2JHH','3JHH','1JHC','2JHC','3JHC','1JHN','2JHN','3JHN'])

def labelencoder(df):
    """Integer-encode the `atom_0`, `atom_1` and `type` columns of `df`.

    `atom_0` is encoded with classes learned from `df` itself. `atom_1` and
    `type` are encoded against fixed vocabularies, so their codes do not depend
    on which labels happen to occur in `df`. LabelEncoder assigns codes in
    sorted class order.

    The columns are overwritten on the frame that is passed in, and that same
    frame is returned.
    """
    # atom_0: vocabulary comes from the data being encoded.
    df["atom_0"] = LabelEncoder().fit_transform(df["atom_0"])

    # atom_1, then type: fixed vocabularies, same order as before.
    fixed_classes = {
        "atom_1": ['H','C','N'],
        "type": ['2JHH','3JHH','1JHC','2JHC','3JHC','1JHN','2JHN','3JHN'],
    }
    for column, classes in fixed_classes.items():
        df[column] = LabelEncoder().fit(classes).transform(df[column])

    return df

In [None]:
# Encode the categorical columns of both splits (train first, then test).
# NOTE: meant to be run once; the columns are integers afterwards.
train, test = labelencoder(train), labelencoder(test)

# Analyser la corrélation

In [None]:
# Pairwise correlation of the numeric columns. `train` still holds the string
# column `molecule_name`; pandas >= 2.0 raises on it unless non-numeric columns
# are excluded explicitly.
tabcorr = train.corr(numeric_only=True)

In [None]:
# Absolute correlation of every other column with the target, strongest first.
correlations = tabcorr["scalar_coupling_constant"]
ranked_correlations = (
    correlations
    .drop(['scalar_coupling_constant'], axis=0)
    .abs()
    .sort_values(ascending=False)
)
print(ranked_correlations)

In [None]:
# Heatmap of the absolute correlation matrix.
plt.figure(figsize=(15, 15))
sns.heatmap(tabcorr.abs(), cmap="coolwarm")

In [None]:
plt.figure(figsize=(15,10))
# `train['type']` holds integer codes here. LabelEncoder numbers classes in
# sorted order, so sorted(type)[code] is the name behind each code. Plotting
# the decoded names lets seaborn build a correct legend, instead of relabelling
# the legend by position with plt.legend(labels=...).
type_names = sorted(type)
type_labels = train["type"].map(dict(enumerate(type_names)))
# `fill=` replaces the deprecated `shade=` argument.
ax = sns.kdeplot(x="scalar_coupling_constant", data=train, hue=type_labels, hue_order=type_names,
                 fill=True, palette=sns.color_palette("Paired", 8))
plt.xlabel("Coupling constant",fontdict={'size': 15})
plt.ylabel("Density",fontdict={'size': 15})
# Restyle the legend seaborn created; entries and colours are left untouched.
legend = ax.get_legend()
legend.set_title("Type")
legend.get_title().set_fontsize(15)
plt.setp(legend.get_texts(), fontsize=12)

In [None]:
# Density of the target over all coupling types.
plt.figure(figsize=(15,10))
# `fill=` replaces the deprecated `shade=` argument.
sns.kdeplot(x="scalar_coupling_constant",data=train,fill=True)
plt.xlabel("Coupling constant",fontdict={'size': 15})
plt.ylabel("Density",fontdict={'size': 15})

In [None]:
plt.figure(figsize=(15,10))
# Decode the integer type codes (LabelEncoder uses sorted class order) so the
# legend seaborn builds names the right coupling type for each curve.
type_names = sorted(type)
type_labels = train["type"].map(dict(enumerate(type_names)))
# `fill=` replaces the deprecated `shade=` argument.
ax = sns.kdeplot(x="dist", data=train, hue=type_labels, hue_order=type_names,
                 fill=True, palette=sns.color_palette("Paired", 8))
plt.xlabel("Distance",fontdict={'size': 15})
plt.ylabel("Density",fontdict={'size': 15})
# Restyle the legend seaborn created; entries and colours are left untouched.
legend = ax.get_legend()
legend.set_title("Type")
legend.get_title().set_fontsize(15)
plt.setp(legend.get_texts(), fontsize=12)

In [None]:
plt.figure(figsize=(15,10))
# Decode the integer type codes (LabelEncoder uses sorted class order) so the
# legend seaborn builds names the right coupling type for each colour.
type_names = sorted(type)
type_labels = train["type"].map(dict(enumerate(type_names)))
ax = sns.scatterplot(x="dist", y="scalar_coupling_constant", data=train, hue=type_labels,
                     hue_order=type_names, palette=sns.color_palette("Paired", 8))
plt.xlabel("Distance",fontdict={'size': 15})
plt.ylabel("Coupling constant",fontdict={'size': 15})
# Restyle the legend seaborn created; entries and colours are left untouched.
legend = ax.get_legend()
legend.set_title("Type")
legend.get_title().set_fontsize(15)
plt.setp(legend.get_texts(), fontsize=12)

In [None]:
plt.figure(figsize=(15,10))
ax=sns.boxplot(x="type", y="scalar_coupling_constant", data=train, palette=sns.color_palette("Paired", 8))
# The x axis carries the integer type codes. LabelEncoder numbers classes in
# sorted order, so the tick names must be the sorted type names.
# NOTE(review): assumes seaborn lays the boxes out in ascending code order.
ax.set_xticklabels(labels=sorted(type))
plt.xlabel("Coupling type",fontdict={'size': 15})
plt.ylabel("Coupling constant",fontdict={'size': 15})

# Machine learning

## Fonctions

In [None]:
def plot_learning_curve(est, X_train, y_train) :
    """Plot training and cross-validated scores of `est` against training size.

    Runs scikit-learn's `learning_curve` with 5-fold cross-validation on ten
    training-set fractions from 10% to 100%, then draws the mean score of each
    curve with a band of one standard deviation on either side.

    Parameters
    ----------
    est : estimator
        Unfitted scikit-learn estimator passed to `learning_curve`.
    X_train, y_train : array-like
        Features and targets the curve is computed on.

    Notes
    -----
    The axis labels say "accuracy", but the plotted values are whatever score
    `learning_curve` returns for `est`. The y axis is clipped to [0.2, 1.0].
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator=est, X=X_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10),
                                                        cv=5,
                                                        n_jobs=-1)
    # One row per training size, one column per CV fold: aggregate over folds.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.figure(figsize=(8,10))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
    plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
    # `visible` expects a bool; the former 'on' string only worked by being truthy.
    plt.grid(visible=True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.2, 1.0])
    plt.show()

In [None]:
def plot_compare2y(y_name,y_test,y_pred):
    """Scatter predictions against true values with a red identity line.

    Parameters
    ----------
    y_name : str
        Figure title.
    y_test : array-like
        True values (x axis); must provide `.min()` and `.max()`.
    y_pred : array-like
        Predicted values (y axis).
    """
    fig_size = (8, 8)
    label_font = {'size': 12}

    plt.figure(figsize=fig_size)
    plt.scatter(y_test, y_pred)
    # Identity line spanning the range of the true values.
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, [y_test.min(), y_test.max()], color='red', linewidth=3)
    plt.xlabel("Real value", fontdict=label_font)
    plt.ylabel("Prediction", fontdict=label_font)
    plt.title(y_name, fontdict=label_font)
    plt.show()

In [None]:
def plot_scores(train) :
    """Plot per-epoch training and validation accuracy.

    Parameters
    ----------
    train : object
        Must expose a `history` mapping with 'accuracy' and 'val_accuracy'
        sequences (the notebook passes the return value of `model.fit`).
    """
    curves = (
        ('accuracy', 'b', 'Score apprentissage'),
        ('val_accuracy', 'r', 'Score validation'),
    )
    # The x axis is the epoch index, sized from the training-accuracy sequence.
    epochs = range(len(train.history['accuracy']))
    for key, line_style, legend_label in curves:
        plt.plot(epochs, train.history[key], line_style, label=legend_label)
    plt.xlabel("epoch")
    plt.title('Scores')
    plt.legend()
    plt.show()

## X, y

In [None]:
# Features: every column except the target, the constant-purpose `atom_0`
# code and the molecule identifier. Target: the scalar coupling constant.
target_column = 'scalar_coupling_constant'
X = train.drop(columns=[target_column, 'atom_0', 'molecule_name'])
y = train[target_column]

# Hold out 10% of the pairs for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [None]:
# Show the feature columns that go into the models.
X.columns

## Random Forest

In [None]:
# Random forest on all features and all coupling types. Seeded like the
# train/test split so a re-run reproduces the same model and scores.
rf = RandomForestRegressor(n_jobs=-1, random_state=1)
rf.fit(X_train, y_train)

In [None]:
# Feature importances of the fitted forest, sorted ascending so the most
# important feature ends up at the top of the horizontal bar chart.
importances_rf = rf.feature_importances_
indices_rf = np.argsort(importances_rf)
bar_positions = range(len(indices_rf))

plt.figure(figsize=(12, 9))
plt.barh(bar_positions, importances_rf[indices_rf])
plt.yticks(bar_positions, X_train.columns[indices_rf])
plt.title("Importance des caracteristiques (Random Forest)")
plt.show()

In [None]:
# Predict the held-out targets with the forest fitted on X_train.
y_pred = rf.predict(X_test)

In [None]:
# Predicted vs. true values on the hold-out set, with the R² score in the title.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred)
title_rf = "Coupling Constant [Random Forest] : " + str(r2_rf)
plot_compare2y(y_name=title_rf, y_test=y_test, y_pred=y_pred)

In [None]:
# One predicted-vs-true plot per coupling type.
# LabelEncoder numbers classes in sorted order, so the name of code `i` is
# sorted(type)[i]; iterating the sorted names keeps each title on its own rows.
for i,t in enumerate(sorted(type)):
    is_type_i = X_test['type'] == i
    y_test_tt = y_test[is_type_i]
    # y_pred is a plain NumPy array: index it with a NumPy boolean mask.
    y_pred_tt = y_pred[is_type_i.to_numpy()]
    plot_compare2y(y_name=t+" & Coupling Constant [Random Forest] : "+str(r2_score(y_true=y_test_tt, y_pred=y_pred_tt)),y_test=y_test_tt,y_pred=y_pred_tt)

## Random Forest (selon type)

In [None]:
# Per-type containers, indexed by the encoded type value.
train_t, X_t, y_t, X_train_t, X_test_t, y_train_t, y_test_t, y_pred_t = ([] for _ in range(8))
for i in range(len(type)):
    # Rows of one coupling type, then the same feature/target selection and
    # 90/10 seeded split that is used for the all-types model.
    subset = train[train['type'] == i]
    features = subset.drop(columns=['scalar_coupling_constant', 'atom_0', 'molecule_name'])
    target = subset['scalar_coupling_constant']
    split_parts = train_test_split(features, target, test_size=0.1, random_state=1)

    train_t.append(subset)
    X_t.append(features)
    y_t.append(target)
    for bucket, part in zip((X_train_t, X_test_t, y_train_t, y_test_t), split_parts):
        bucket.append(part)

In [None]:
# One independent, seeded forest per coupling type. `[estimator] * 8` repeated
# a single object eight times, so every fit overwrote the previous one and all
# entries of rf_t ended up being the model of the last type.
# LabelEncoder numbers classes in sorted order, hence the sorted names.
type_names = sorted(type)
rf_t = [RandomForestRegressor(n_jobs=-1, random_state=1) for _ in type_names]
# Rebuilt here so re-running this cell does not append duplicate predictions.
y_pred_t = []
for i,t in enumerate(type_names):
    print(t)
    rf_t[i].fit(X_train_t[i], y_train_t[i])
    y_pred_t.append(rf_t[i].predict(X_test_t[i]))
    plot_compare2y(y_name="Coupling Constant [Random Forest & type] : "+str(r2_score(y_true=y_test_t[i], y_pred=y_pred_t[i])),y_test=y_test_t[i],y_pred=y_pred_t[i])
    print()

## Random Forest (moins X)

In [None]:
# Reduced feature set: coupling type, inter-atomic distance, second atom's element.
reduced_columns = ['type','dist','atom_1']
X_m = train.loc[:, reduced_columns]

# Same 90/10 seeded split as for the full feature set.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y, test_size=0.1, random_state=1)

In [None]:
# Random forest on the reduced feature set. Seeded like the train/test split
# so a re-run reproduces the same model and scores.
rf_m = RandomForestRegressor(n_jobs=-1, random_state=1)
rf_m.fit(X_train_m, y_train_m)

In [None]:
# Predict the held-out targets with the reduced-feature forest.
y_pred_m = rf_m.predict(X_test_m)

In [None]:
# Predicted vs. true values for the reduced-feature forest, R² in the title.
r2_rf_m = r2_score(y_true=y_test_m, y_pred=y_pred_m)
title_rf_m = "Coupling Constant [Random Forest & moins X] : " + str(r2_rf_m)
plot_compare2y(y_name=title_rf_m, y_test=y_test_m, y_pred=y_pred_m)

In [None]:
# One predicted-vs-true plot per coupling type for the reduced-feature forest.
# LabelEncoder numbers classes in sorted order, so the name of code `i` is
# sorted(type)[i]; iterating the sorted names keeps each title on its own rows.
for i,t in enumerate(sorted(type)):
    is_type_i = X_test_m['type'] == i
    y_test_mt = y_test_m[is_type_i]
    # y_pred_m is a plain NumPy array: index it with a NumPy boolean mask.
    y_pred_mt = y_pred_m[is_type_i.to_numpy()]
    plot_compare2y(y_name=t+" & Coupling Constant [Random Forest & moins X] : "+str(r2_score(y_true=y_test_mt, y_pred=y_pred_mt)),y_test=y_test_mt,y_pred=y_pred_mt)

## Couche dense

In [None]:
# TensorFlow/Keras are imported here, at first use, rather than in the import
# cells at the top of the notebook.
import tensorflow as tf 
from tensorflow import keras

# Make Keras use float64 as its default float type.
keras.backend.set_floatx('float64')

# Used further down to write the training history to disk.
import json

In [None]:
# Shapes of the four parts of the train/test split, one per line.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Hyper-parameters of the dense regression network.
learning_rate = 0.002
coef1 = 1e-5          # L1 penalty applied to the kernel of every hidden layer
activation = 'tanh'
loss = "mse"
path = "./model/model2"
optimizer = keras.optimizers.Adam(learning_rate)
EPOCHS = 100

# Input width follows the feature matrix instead of a hard-coded 20, so the
# model keeps matching X_train when features are added or removed.
n_features = X_train.shape[1]

# Three hidden layers of 14 units and a single linear output unit.
model_reg = keras.Sequential([
    keras.Input(shape=(n_features,), name='layer_in'),
    keras.layers.Dense(14, activation=activation, name='layer_1', kernel_regularizer=keras.regularizers.l1(coef1)),
    keras.layers.Dense(14, activation=activation, name='layer_2', kernel_regularizer=keras.regularizers.l1(coef1)),
    keras.layers.Dense(14, activation=activation, name='layer_3', kernel_regularizer=keras.regularizers.l1(coef1)),
    keras.layers.Dense(1, name='layer_out')
    ], name='model_reg')
model_reg.compile(loss=loss, optimizer=optimizer, metrics=['mae', 'mse'])

model_reg.summary()

In [None]:
# Where the ModelCheckpoint callback below writes the weights.
checkpoint_path = path + "/checkpoint/model_reg.ckpt"
# The triple-quoted block that follows is an inert string literal, not executed
# code: a disabled "resume from checkpoint" snippet.
# NOTE(review): it references `os`, which this notebook does not import;
# import it before re-enabling the snippet.
''' ### if exists saved model load it
    if os.path.exists(checkpoint_path + '.index'):
        print('----------------load the model-------------------')
        model_reg.load_weights(checkpoint_path)
'''

class PrintDot(keras.callbacks.Callback):
    """Progress trace: prints one dot per finished epoch.

    A line break is emitted before the dot on every 20th epoch and another one
    on every 100th epoch, so epoch 100, 200, ... produce two line breaks.
    Epoch 0 never triggers a line break.
    """

    def on_epoch_end(self, epoch, logs):
        if epoch != 0:
            # Checked in the same order as before: 100 first, then 20.
            for period in (100, 20):
                if epoch % period == 0:
                    print()
        print('.', end=' ')

# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)
# Save the weights (not the full model) to `checkpoint_path` during training.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
)

# GPU devices TensorFlow can see; the two lists are not used elsewhere in the
# cells shown here.
physical_devices = tf.config.experimental.list_physical_devices("GPU")
logical_devices = tf.config.experimental.list_logical_devices("GPU")

In [None]:
# Train the regression network silently (verbose=0); PrintDot supplies the
# progress trace. 10% of X_train is held back by Keras for validation.
fit_callbacks = [early_stop, cp_callback, PrintDot()]
history_reg = model_reg.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=0,
    callbacks=fit_callbacks,
)

In [None]:
# Persist the per-epoch metrics as JSON, then the trained model as HDF5,
# both under `path`.
history_file = path + "/history.json"
with open(history_file, "w") as json_file:
    json.dump(history_reg.history, json_file)

model_file = path + "/model_reg.h5"
model_reg.save(model_file)

In [None]:
# Reload the regression network from disk, replacing the in-memory model.
# The path is spelled out here rather than built from `path`.
model_reg = tf.keras.models.load_model("./model/model2/model_reg.h5")

In [None]:
# Loss and compiled metrics of the regression network on the hold-out set.
model_reg.evaluate(X_test, y_test)

In [None]:
# Hold-out predictions of the dense network, plotted against the true values
# with the R² score in the title.
y_pred_tf = model_reg.predict(X_test)
r2_tf = r2_score(y_true=y_test, y_pred=y_pred_tf)
title_tf = "Coupling Constant [Dense layer] : " + str(r2_tf)
plot_compare2y(y_name=title_tf, y_test=y_test, y_pred=y_pred_tf)

In [None]:
# One predicted-vs-true plot per coupling type for the dense network.
# LabelEncoder numbers classes in sorted order, so the name of code `i` is
# sorted(type)[i]; iterating the sorted names keeps each title on its own rows.
for i,t in enumerate(sorted(type)):
    is_type_i = X_test['type'] == i
    y_test_tt = y_test[is_type_i]
    # y_pred_tf is a plain NumPy array: index it with a NumPy boolean mask.
    y_pred_tt = y_pred_tf[is_type_i.to_numpy()]
    plot_compare2y(y_name=t+" & Coupling Constant [Dense layer] : "+str(r2_score(y_true=y_test_tt, y_pred=y_pred_tt)),y_test=y_test_tt,y_pred=y_pred_tt)

## Random Forest Classifier

In [None]:
# Discretise the continuous target into 100 equal-width intervals so it can
# be treated as a classification label.
y_c1 = pd.cut(x=y, bins=100)

In [None]:
# Turn each interval into an integer class id; `lec` keeps the mapping.
lec = LabelEncoder().fit(y_c1)
y_c = lec.transform(y_c1)

In [None]:
# Split for the classification variant of the task.
# NOTE(review): test_size=0.9 trains on only 10% of the rows, while every other
# split in this notebook holds out 10%. Confirm this is a deliberate
# sub-sampling and not a swapped ratio.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_c, test_size=0.9, random_state=1)

In [None]:
# Random forest classifier on the binned target. Seeded like the train/test
# split so a re-run reproduces the same model and scores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=1)
rfc.fit(X_train_c, y_train_c)

In [None]:
# Predicted bin id for every hold-out row.
y_pred_c = rfc.predict(X_test_c)

In [None]:
# Share of hold-out rows whose predicted bin equals the true bin.
accuracy_score(y_test_c, y_pred_c)

In [None]:
# Predicted vs. true bin ids; both axes are class indices, not coupling
# constants, and no score is appended to the title.
plot_compare2y(y_name="Coupling Constant [Random Forest Classifier] : ",y_test=y_test_c,y_pred=y_pred_c)

## Couche dense (classifier)

In [None]:
# Dense classifier: two ReLU hidden layers of 100 units and a 100-way softmax
# output. The input width is inferred by Keras on the first call.
model5 = keras.Sequential([
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='softmax'),
])

In [None]:
# Integer class ids as targets, hence the sparse categorical cross-entropy.
model5.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the dense classifier for 100 epochs; Keras holds back 10% of the
# training rows for validation.
train5 = model5.fit(
    X_train_c,
    y_train_c,
    validation_split=0.1,
    epochs=100,
    verbose=1,
)

In [None]:
# Loss and accuracy of the dense classifier on the hold-out set.
model5.evaluate(X_test_c,y_test_c)

In [None]:
# Open a 12x8 figure, then draw the per-epoch training/validation accuracy
# of the dense classifier into it.
plt.figure(figsize=(12,8))
plot_scores(train5)

In [None]:
# model5 ends in a 100-way softmax, so predict() returns one row of class
# probabilities per sample. Take the most probable class so the predictions
# are bin ids, comparable one-to-one with y_test_c.
y_pred_c1 = model5.predict(X_test_c).argmax(axis=1)
# The title names the dense classifier (it previously said "Random Forest").
plot_compare2y(y_name="Coupling Constant [Dense layer classifier] : ",y_test=y_test_c,y_pred=y_pred_c1)

In [None]:
# NOTE: this rebinds the module-level `path` used by the regression cells.
path = "./model/model5"

# Nothing in the notebook creates this directory beforehand (no checkpoint
# callback is used for model5), so make sure it exists before writing into it.
os.makedirs(path, exist_ok=True)

# Persist the per-epoch metrics as JSON, then the trained classifier as HDF5.
with open(path + "/history.json", "w") as json_file:
    json.dump(train5.history, json_file)

model5.save(path + "/model_reg.h5")

# Test

In [None]:
# Smallest scalar coupling constant in the training set.
train['scalar_coupling_constant'].to_numpy().min()