<a href="https://colab.research.google.com/github/sandeep1847e/Phy654-data/blob/main/ml_important_code_snippet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Read feature and label arrays from an HDF5 file (opened with PyTables)
def get_features_labels(file_name):
    """Load the feature and label arrays stored in an HDF5 file.

    Relies on the module-level names ``features``, ``labels``, ``nfeatures``
    and ``nlabels``. Every entry of ``features`` names one node under the
    file root. Every entry of ``labels`` is either a single node name or
    several node names joined by ``'*'``; the label column is the
    element-wise product of the named nodes.

    Only rows whose label columns sum to exactly 1 are kept.

    Parameters
    ----------
    file_name : path of the HDF5 file to read.

    Returns
    -------
    (feature_array, label_array) : two 2-D float arrays with one row per
        kept entry.
    """
    # The context manager closes the file even when a node lookup raises.
    with tables.open_file(file_name, 'r') as h5file:
        njets = getattr(h5file.root, features[0]).shape[0]

        # allocate arrays
        feature_array = np.zeros((njets, nfeatures))
        label_array = np.zeros((njets, nlabels))

        # load feature arrays
        for (i, feat) in enumerate(features):
            feature_array[:, i] = getattr(h5file.root, feat)[:]

        # load label arrays: multiply together however many factors are listed
        for (i, label) in enumerate(labels):
            column = np.ones(njets)
            for factor in label.split('*'):
                column = np.multiply(column, getattr(h5file.root, factor.strip())[:])
            label_array[:, i] = column

    # Build the row mask once so both arrays are filtered identically.
    keep = np.sum(label_array, axis=1) == 1
    feature_array = feature_array[keep]
    label_array = label_array[keep]

    print (feature_array)
    print ('\n\n')
    print (label_array)
    return feature_array, label_array


**FROM ASSIGNMENT**

In [None]:
# all imports
# 1 for creating a multi class classification model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler ,LabelEncoder
from sklearn.metrics import accuracy_score ,confusion_matrix,classification_report,recall_score
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 2


In [None]:
# Download the CSV behind `url` to a local file, then load it into a DataFrame.
import gdown

url = "https://swmukher.web.cern.ch/swmukher/dataset_star.csv"
output = 'star_data.csv'  # local file name the download is written to
gdown.download(url=url, output=output, quiet=True)

df = pd.read_csv(output)
df.head()

In [None]:
# With several classes in one frame, a boolean mask on the class column
# pulls the rows of a single class into their own DataFrame (here Type == 0).
type_0 = df.loc[df['Type'] == 0]

In [None]:
# Histogram of the Temperature column of the Type-0 rows.
# alpha controls transparency; rwidth is the bar width relative to the bin width
# (density=True can be added to normalise the histogram).
# NOTE(review): rwidth > 1 makes neighbouring bars overlap - confirm this is intended.
plt.hist(
    type_0['Temperature'],
    bins=10,
    rwidth=1.5,
    color='blue',
    edgecolor='black',
    alpha=0.5,
)
plt.yscale('log')  # logarithmic y axis

In [None]:
# Distinct values present in one feature column
df.loc[:, 'Spectral_Class'].unique()

In [None]:
# Turn the non-numerical columns into integer codes so they can be scaled
# together with the numerical ones. 'Color' is lower-cased first so that
# spellings differing only in case share one code; without encoding it,
# StandardScaler would be handed strings and fail.
# NOTE: this cell rewrites columns of df in place, so it must be run exactly
# once on a freshly loaded frame.
df['Spectral_Class'] = LabelEncoder().fit_transform(df['Spectral_Class'])
df['Color'] = LabelEncoder().fit_transform(df['Color'].str.lower())

# Standardize the input features.
Ifeatures=['Temperature','L','R','A_M','Color','Spectral_Class']
df[Ifeatures]=StandardScaler().fit_transform(df[Ifeatures])

# Feature matrix and one-hot encoded target used by the cells below.
# NOTE(review): the target is assumed to be the 'Type' column with 6 classes
# (matching num_classes=6 and the Type == 0 selection above) - confirm.
X = df[Ifeatures].to_numpy()
Y = to_categorical(df['Type'].to_numpy(), num_classes=6)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=32,
)

In [None]:
# Fully connected classifier: explicit input, two hidden ReLU layers and a
# 6-way softmax output (one unit per class).
model = Sequential([
    tf.keras.Input(shape=(Xtrain.shape[1],)),  # one input per feature column
    Dense(32, activation='relu'),              # hidden layer 1
    Dense(16, activation='relu'),              # hidden layer 2
    Dense(6, activation='softmax'),            # output layer
])

# Recall is passed as a metric object: the bare string 'recall' is not
# resolved by every Keras version, while the object form always is.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Recall(name='recall')],
)

# 20% of the training rows are set aside for validation during the fit.
history = model.fit(Xtrain, Ytrain, epochs=100, batch_size=64, validation_split=0.20, verbose=2)

In [None]:
 # predicting on test data
import seaborn as sns

# Class probabilities -> predicted / true class indices
Ypred = model.predict(Xtest)
Ypred_classes = np.argmax(Ypred, axis=1)
Ytrue_classes = np.argmax(Ytest, axis=1)
correct = Ypred_classes == Ytrue_classes  # True where the prediction matches the truth

# Accuracy score on the test set
accuracy = accuracy_score(Ytrue_classes, Ypred_classes)
print("Test accuracy: {:.3f}".format(accuracy))

# Confusion matrix on its own axes (rows: true class, columns: predicted class)
ConfusionMatrix = confusion_matrix(Ytrue_classes, Ypred_classes)
fig_cm, ax_cm = plt.subplots()
sns.heatmap(ConfusionMatrix, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_xlabel('Predicted class')
ax_cm.set_ylabel('True class')

# Training loss on a separate figure, so it is not drawn over the heatmap
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('loss')
ax_loss.legend()  # without this the line label is never displayed
plt.show()

**FROM HBB**

In [None]:
import keras
import numpy as np
import tables
from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Split the feature rows by class flag: rows where y == 1 go to f_sig
# (signal) and rows where y == 0 go to f_bkg (background).
f_sig, f_bkg = (feature_array[y == flag] for flag in (1, 0))

In [None]:
# FULLY CONNECTED MODEL
# Functional-API network: nfeatures inputs -> 64 -> 32 -> 32 -> nlabels softmax outputs.
inputs = Input(shape=(nfeatures,), name = 'input')
x = Dense(64, name = 'dense_1', activation='relu')(inputs)
x = Dense(32, name = 'dense_2', activation='relu')(x)
x = Dense(32, name = 'dense_3', activation='relu')(x)
outputs = Dense(nlabels, name = 'output', activation='softmax')(x)
keras_model = Model(inputs=inputs, outputs=outputs)
keras_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
keras_model.summary()

In [None]:
# Callbacks, both watching the validation loss: one stops training after
# 10 epochs without improvement, the other writes the best model seen so far
# to 'keras_model_best.keras'.
model_checkpoint = ModelCheckpoint(
    'keras_model_best.keras',
    monitor='val_loss',
    save_best_only=True,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
callbacks = [early_stopping, model_checkpoint]

# Train, holding back 20% of the rows for validation.
keras_model.fit(
    feature_array,
    label_array,
    epochs=100,
    batch_size=1024,
    validation_split=0.2,
    shuffle=False,
    callbacks=callbacks,
)

In [None]:
# reload best weights (written by the ModelCheckpoint callback)
keras_model.load_weights('keras_model_best.keras')

# run model inference on test data set
predict_array_test = keras_model.predict(feature_array_test)

# create ROC curve from column 1 of the labels and of the predictions
fpr, tpr, threshold = roc_curve(label_array_test[:,1], predict_array_test[:,1])

# plot ROC curve
plt.figure()
plt.plot(fpr, tpr, lw=2.5, label="AUC = {:.1f}%".format(auc(fpr,tpr)*100))
plt.ylabel(r'True positive rate')
plt.xlabel(r'False positive rate')
#plt.semilogy()  # optional log y axis; the non-zero lower y limit below suits it
plt.ylim(0.001,1)
plt.xlim(0,1)
plt.grid(True)
plt.legend(loc='lower right')  # needed for the AUC label to appear on the plot
plt.show()



**FITTING PARAMETERS**

In [None]:
import numpy as np
import matplotlib.pyplot as plt # for plotting
from IPython.display import clear_output
from time import sleep


In [None]:
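# define true_f(x)        -> the target function that is being fitted
# define f(theta,x)       -> the parametrized model evaluated at the points x
# define f_grad(theta,x)  -> gradient of f with respect to theta (one row per parameter, one column per point in x)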
# create sample points of x
# Get randomly sampled x values
def samples(nsamples,width):
    """Return ``nsamples`` random x values.

    The values are standard-normal draws from ``np.random.randn`` multiplied
    by ``width``.
    """
    draws = np.random.randn(nsamples)
    return draws * width

# cost function
def get_avg_cost(theta0s,theta1s,nsamples, width):
    """Evaluate the sampled cost on a grid of (theta0, theta1) values.

    For every grid point a fresh batch of ``nsamples`` x values is drawn with
    ``samples(nsamples, width)`` and the cost is half the average squared
    difference between ``f(theta, x)`` and ``true_f(x)``.

    Returns an array of shape (len(theta0s), len(theta1s)); entry [j0, j1]
    belongs to theta0s[j0] and theta1s[j1].
    """
    cost = np.zeros([len(theta0s), len(theta1s)])
    for row, first in enumerate(theta0s):
        for col, second in enumerate(theta1s):
            params = np.array([first, second])
            points = samples(nsamples, width)
            residual = f(params, points) - true_f(points)
            cost[row, col] = 0.5 * np.average(residual ** 2)
    return cost


# Cost-function landscape on a 40 x 40 grid of (theta_0, theta_1)
theta0s = np.linspace(-3, 6, num=40)
theta1s = np.linspace(-2, 3, num=40)
print(len(theta0s), len(theta1s), sep='\n')

# 10000 sample points of width 2 per grid point
C = get_avg_cost(theta0s, theta1s, nsamples=10000, width=2.)

nlevels = 20
# indexing='ij' keeps X[j0, j1] / Y[j0, j1] aligned with C[j0, j1]
X, Y = np.meshgrid(theta0s, theta1s, indexing='ij')
plt.contourf(X, Y, C, nlevels)
# optional overlay of contour lines: plt.contour(X,Y,C,nlevels,colors="white")
plt.xlabel("theta_0")
plt.ylabel("theta_1")
plt.colorbar()
plt.show()

# Gradient descent on the two parameters theta = (theta_0, theta_1).
# The start point is random: np.random.rand(2) is scaled component-wise by
# (theta0_range, theta1_range).
theta0_range = 1.
theta1_range = 2.
theta=np.array([theta0_range,theta1_range])*np.random.rand(2)
print (theta)
alpha=0.2 # "learning rate" (gradient descent step size)
nsamples=50 # number of random x values drawn per step
nsteps=90 # number of gradient descent steps

x_sweep=np.linspace(-4,4,300) # fixed x grid used only for drawing the curves
xrange = 2. # width passed to samples() below

for n in range(nsteps):

    x=samples(nsamples, xrange) # fresh random samples for this step

    # deviation of the parametrized function from the true function (vector,
    # one entry per sample):
    deviation=f(theta,x)-true_f(x)

    # do one gradient descent step: average deviation * gradient over the
    # samples (axis=1) and move theta against it, in place
    theta -= alpha*np.average(deviation[None,:]*f_grad(theta,x),axis=1)

    # Now: Plotting
    # left panel: cost landscape (X, Y, C from the landscape cell) with the
    # current theta marked in orange
    # right panel: true function (blue) against
    # parametrized function (orange)
    # blue dots indicate random points where
    # the true function was sampled in this step

    # replace the previous step's output instead of stacking a new figure
    clear_output(wait=True)
    fig,ax=plt.subplots(ncols=2,nrows=1,figsize=(8,2))

    nlevels=20
    ax[0].contourf(X,Y,C,nlevels)
    ax[0].contour(X,Y,C,nlevels,colors="white")
    ax[0].scatter([theta[0]],[theta[1]],color="orange")
    ax[0].set_xlim(theta0s[0],theta0s[-1])
    ax[0].set_ylim(theta1s[0],theta1s[-1])
    ax[0].set_xlabel("theta_0")
    ax[0].set_ylabel("theta_1")

    ax[1].plot(x_sweep,true_f(x_sweep),color="blue")
    ax[1].scatter(x,true_f(x),color="blue")
    ax[1].plot(x_sweep,f(theta,x_sweep),color="orange")
    ax[1].set_xlim(-4,4)
    ax[1].set_ylim(0.0,4.0)
    ax[1].set_xlabel("x")
    ax[1].set_ylabel("f")

    plt.show()
    sleep(0.3) # short pause so each frame is visible

    # printed on every step (this line is inside the loop); each print is
    # cleared by the next clear_output, so the one left at the end shows the
    # final fitted values
    print(theta)

**HZZ**

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
# Training curves and ROC curve, drawn in three panels of a 2 x 2 grid.
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(15, 10))

# panel 1: loss vs epoch (training and validation)
ax = plt.subplot(2, 2, 1)
ax.plot(history.history["loss"], label="loss")
ax.plot(history.history["val_loss"], label="val_loss")
ax.set(xlabel="epoch", ylabel="loss")
ax.legend(loc="upper right")

# panel 2: accuracy vs epoch (training and validation)
ax = plt.subplot(2, 2, 2)
ax.plot(history.history["accuracy"], label="acc")
ax.plot(history.history["val_accuracy"], label="val_acc")
ax.set(xlabel="epoch", ylabel="acc")
ax.legend(loc="lower right")

# panel 3: ROC curve from the model scores on the test set
Y_predict = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, Y_predict)
roc_auc = auc(fpr, tpr)

ax = plt.subplot(2, 2, 3)
ax.plot(fpr, tpr, color="cyan", lw=2, label="auc = {:.3f}".format(roc_auc))
# diagonal reference line
ax.plot([0, 1], [0, 1], color="k", linestyle="--", lw=2, label="random chance")
ax.set(
    xlim=[0, 1.0],
    ylim=[0, 1.1],
    xlabel="false positive rate",
    ylabel="true positive rate",
    title="receiver operating curve",
)
ax.legend(loc="lower right")
plt.show()