<a href="https://colab.research.google.com/github/rachidkarakhi/Arduino-RFID-RC522-VB-Net-Interface-MySQL-Database/blob/main/generative1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

import the libraries

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention


import the data

In [3]:
import xml.etree.ElementTree as ET
import pandas as pd

def get_data_from_xml_child(child):
    Row = {}
    Row["StudentID"] = child.find("MetaInfo").get('StudentID')
    Row["TaskID"] = child.find("MetaInfo").get('TaskID')
    Row["DataSource"] = child.find("MetaInfo").get('DataSource')
    Row["ProblemDescription"] = child.find("ProblemDescription").text
    Row["Question"] = child.find("Question").text
    Row["Answer"] = child.find("Answer").text
    Row["Annotation_label"] = child.find("Annotation").get("Label")
    Row["Annotation_ContextRequired"] = child.find("Annotation").find("AdditionalAnnotation").get('ContextRequired')
    Row["Annotation_ExtraInfoInAnswer"] = child.find("Annotation").find("AdditionalAnnotation").get('ExtraInfoInAnswer')
    Row["Annotation_comments"] = child.find("Annotation").find("Comments").text
    Row["Annotation_comments_watch"] = child.find("Annotation").find("Comments").get("Watch")
    Row["ReferenceAnswers"] = child.find("ReferenceAnswers").text
    return Row

# xml_data = open('/kaggle/input/asag-dt-grad-xml/grade_data.xml', 'r').read()  # Read file
# Read the file from your Google Drive
with open('/content/drive/MyDrive/Colab Notebooks/grade_data.xml', 'r') as f:
    xml_data = f.read()
root = ET.XML(xml_data)  # Parse XML

data = []
cols = []
for i, child in enumerate(root):
    data.append(get_data_from_xml_child(child))


# New Section

In [4]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,StudentID,TaskID,DataSource,ProblemDescription,Question,Answer,Annotation_label,Annotation_ContextRequired,Annotation_ExtraInfoInAnswer,Annotation_comments,Annotation_comments_watch,ReferenceAnswers
0,DTSU040,LP03_PR09.bLK.sh,DeepTutorSummer2014,"A car windshield collides with a mosquito, squ...",How does Newton's third law apply to this situ...,the windshield will apply a force to the mosqu...,correct(0)|correct_but_incomplete(1)|contradic...,0,0,The student forgot to tell the opposite force...,1,\n1: Since the windshield exerts a force on t...
1,DTSU035,FM_LV04_PR05.sh,DeepTutorSummer2014,Two hockey players pass a puck between them on...,What forces are acting on the puck while the p...,The normal force coming from the ice and the g...,correct(1)|correct_but_incomplete(0)|contradic...,0,1,Indirectly mentioned that the puck moves in a...,1,\n1: The forces acting on the puck while it i...
2,DTSU021,FM_LVxx_PR01,DeepTutorSummer2014,A rocket pushes a meteor with constant force. ...,Can you articulate Newton's second law?,"if there is a zero net force on the object, th...",correct(0)|correct_but_incomplete(0)|contradic...,0,0,"Is this the correct definition, not listed in ...",1,\n1: Newton's 2nd Law says that the net force...
3,DTSU033,LP03_PR09.bLK.sh,DeepTutorSummer2014,"A car windshield collides with a mosquito, squ...",Can you articulate a principle or definition w...,An equal force always balancing it out regardl...,correct(0)|correct_but_incomplete(0)|contradic...,0,0,So difficult to understand. Based on the refe...,1,"\n1: For every action, there is an equal and ..."
4,DTSU015,FM_LV04_PR05,DeepTutorSummer2014,Two hockey players pass a puck between them on...,"Based on Newton's first law, what can you say ...",The speed of the puck will equal to the net fo...,correct(0)|correct_but_incomplete(0)|contradic...,0,0,Related but not correct.,0,\n1: The puck will move in a straight line wi...


In [5]:
df.iloc[0]

StudentID                                                                 DTSU040
TaskID                                                           LP03_PR09.bLK.sh
DataSource                                                    DeepTutorSummer2014
ProblemDescription              A car windshield collides with a mosquito, squ...
Question                        How does Newton's third law apply to this situ...
Answer                          the windshield will apply a force to the mosqu...
Annotation_label                correct(0)|correct_but_incomplete(1)|contradic...
Annotation_ContextRequired                                                      0
Annotation_ExtraInfoInAnswer                                                    0
Annotation_comments              The student forgot to tell the opposite force...
Annotation_comments_watch                                                       1
ReferenceAnswers                \n1:  Since the windshield exerts a force on t...
Name: 0, dtype: 

In [6]:
df.columns

Index(['StudentID', 'TaskID', 'DataSource', 'ProblemDescription', 'Question',
       'Answer', 'Annotation_label', 'Annotation_ContextRequired',
       'Annotation_ExtraInfoInAnswer', 'Annotation_comments',
       'Annotation_comments_watch', 'ReferenceAnswers'],
      dtype='object')

# Combine context, question, and student answer for each sample


In [7]:
input_texts = [item['ProblemDescription'] + " " + item['Question'] + " " + item['Answer'] for item in data]
target_texts = [item['ReferenceAnswers'] for item in data]

In [8]:
size = len(target_texts)
size

898

In [9]:
sizee = len(input_texts)
size

898

# Tokenize the text data

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)
vocab_size = len(tokenizer.word_index) + 1


In [11]:
vocab_size

720

# Convert text sequences to integer sequences and pad sequences

In [12]:
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences_padded = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
target_sequences_padded = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

# Create training and testing sets

In [13]:
train_size = int(0.8 * len(input_sequences_padded))
x_train, x_test = input_sequences_padded[:train_size], input_sequences_padded[train_size:]
y_train, y_test = target_sequences_padded[:train_size], target_sequences_padded[train_size:]

# Find the maximum sequence length in both input and target sequences

In [14]:
max_input_sequence_length = max(len(seq) for seq in x_train)
max_target_sequence_length = max(len(seq) for seq in y_train)
max_input_sequence_length



142

# Adjust x_train and x_test by removing the last time step


In [15]:
x_train_adjusted = x_train[:, :-1]
x_test_adjusted = x_test[:, :-1]

# Adjust y_train and y_test by removing the last time step and pad the target sequences


In [16]:
y_train_adjusted = pad_sequences(y_train, maxlen=max_target_sequence_length, padding='post')
y_train_adjusted = y_train_adjusted[:, :-1]

y_test_adjusted = pad_sequences(y_test, maxlen=max_target_sequence_length, padding='post')
y_test_adjusted = y_test_adjusted[:, :-1]

# Build the model using functional API


In [17]:
latent_dim = 256  # Number of units in the LSTM layer
embedding_dim = 100  # Dimension of word embeddings

encoder_input = Input(shape=(max_input_sequence_length - 1,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_input)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

decoder_input = Input(shape=(max_target_sequence_length - 1,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_input)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

attention = Attention()([decoder_outputs, encoder_outputs])
decoder_outputs_attention = Dense(vocab_size, activation='softmax')(attention)

model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_outputs_attention)


# Compile the model

In [18]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


# Train the model with adjusted x_train, x_test, y_train, and y_test

---



In [19]:
model.fit([x_train_adjusted, y_train_adjusted], y_train[:, 1:], batch_size=32, epochs=50, validation_data=([x_test_adjusted, y_test_adjusted], y_test[:, 1:]))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7abdc666db10>

In [23]:
# Preprocess input: context and student answer
context = "While speeding up, a large truck pushes a small compact car."
student_answer = "The magnitudes of the forces are equal and opposite to each other due to Newton’s third law of motion."

# Tokenize and pad input sequences for both context and student answer
input_seq_context = tokenizer.texts_to_sequences([context])
input_seq_student_answer = tokenizer.texts_to_sequences([student_answer])

input_seq_context_padded = pad_sequences(input_seq_context, maxlen=max_input_sequence_length - 1, padding='post')
input_seq_student_answer_padded = pad_sequences(input_seq_student_answer, maxlen=max_input_sequence_length - 1, padding='post')

# Generate response using the trained model
generated_token_ids = model.predict([input_seq_context_padded, input_seq_student_answer_padded]).argmax(axis=-1)
generated_response = tokenizer.sequences_to_texts(generated_token_ids)[0]

# Post-process generated response if needed

# Return the generated response as feedback
print("Generated Feedback:", generated_response)


Generated Feedback: equal are force from equal and equal equal equal third are equal equal force from equal equal


In [24]:

pip install nltk




In [25]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

# Extract reference and candidate sequences from your dataset
reference_sequences = input_texts  # List of reference sequences
candidate_sequences = target_texts  # List of candidate sequences


# Prepare reference sequences as list of lists
references = [ref_seq.split() for ref_seq in reference_sequences]

# Prepare candidate sequences as list of strings
candidates = candidate_sequences

# Calculate BLEU score
smoothie = SmoothingFunction().method4  # Choose a smoothing method
bleu_score = corpus_bleu(references, candidates, smoothing_function=smoothie)

print("BLEU Score:", bleu_score)


BLEU Score: 0.054065656969080875


In [26]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference_sequences = [['reference_1_seq_1', 'reference_1_seq_2', ...], ['reference_2_seq_1', 'reference_2_seq_2', ...], ...]
candidate_sequence = generated_response

smoothie = SmoothingFunction().method4
bleu_score = sentence_bleu(reference_sequences, candidate_sequence, smoothing_function=smoothie)
print("BLEU Score:", bleu_score)


TypeError: ignored