In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Data Loading

In [None]:

# Read the four competition CSV files: train/test summaries and train/test prompts
summaries_train_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
summaries_test_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
prompts_train_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
prompts_test_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')

# Map each frame's name to its (rows, columns) shape for a quick size overview
data_shapes = {
    frame_name: frame.shape
    for frame_name, frame in (
        ("summaries_train_df", summaries_train_df),
        ("summaries_test_df", summaries_test_df),
        ("prompts_train_df", prompts_train_df),
        ("prompts_test_df", prompts_test_df),
    )
}

data_shapes


In [None]:
# Per-column count of missing entries, keyed by the name of each dataset
missing_values = {
    frame_name: frame.isnull().sum()
    for frame_name, frame in (
        ("summaries_train_df", summaries_train_df),
        ("summaries_test_df", summaries_test_df),
        ("prompts_train_df", prompts_train_df),
        ("prompts_test_df", prompts_test_df),
    )
}

missing_values


Summaries Statistics:

- Check for missing values.
- Distribution of 'content' and 'wording' scores.
- Average summary length.

Prompts Statistics:

- Number of unique prompts.
- Average length of prompt questions and texts.

### Exploratory Data Analysis

In [None]:
# Summaries statistics (training set)

# Number of missing entries in every column
missing_values = summaries_train_df.isnull().sum()

# Descriptive statistics of the two target scores
content_desc, wording_desc = (
    summaries_train_df[score_column].describe() for score_column in ('content', 'wording')
)

# Character length of each summary, stored as a new column, and its mean
summaries_train_df['summary_length'] = summaries_train_df['text'].map(len)
avg_summary_length = summaries_train_df['summary_length'].mean()

missing_values, content_desc, wording_desc, avg_summary_length


In [None]:
# Prompts statistics (training set)

# How many distinct prompt ids the training prompts contain
num_unique_prompts = prompts_train_df['prompt_id'].nunique()

# Character length of each prompt question and prompt text, stored as new columns
prompts_train_df['question_length'] = prompts_train_df['prompt_question'].map(len)
prompts_train_df['text_length'] = prompts_train_df['prompt_text'].map(len)

# Mean of both length columns
avg_question_length, avg_text_length = (
    prompts_train_df[length_column].mean() for length_column in ('question_length', 'text_length')
)

num_unique_prompts, avg_question_length, avg_text_length


In [None]:
import matplotlib.pyplot as plt

# One row, two panels: content scores on the left, wording scores on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# (axes, score column, bar colour, panel title, x-axis label) for each panel
score_panels = (
    (ax[0], 'content', 'blue', 'Distribution of Content Scores', 'Content Score'),
    (ax[1], 'wording', 'green', 'Distribution of Wording Scores', 'Wording Score'),
)
for panel, score_column, colour, panel_title, x_label in score_panels:
    panel.hist(summaries_train_df[score_column], bins=50, color=colour, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of Summaries')

plt.tight_layout()
plt.show()


In [None]:
# Whitespace-delimited word count of every summary (values are cast to str first)
summaries_train_df['word_count'] = summaries_train_df['text'].astype(str).str.split().str.len()

# Histogram of the summary word counts
plt.figure(figsize=(10, 6))
plt.hist(summaries_train_df['word_count'], color='purple', alpha=0.7, bins=50)
plt.xlabel('Word Count')
plt.ylabel('Number of Summaries')
plt.title('Word Count Distribution in Summaries')
plt.grid(axis='y')
plt.show()


In [None]:
# How many training summaries belong to each prompt id
summaries_per_prompt = summaries_train_df['prompt_id'].value_counts()

# Bar chart with one bar per prompt id
plt.figure(figsize=(10, 6))
summaries_per_prompt.plot.bar(color='orange', alpha=0.7)
plt.xlabel('Prompt ID')
plt.ylabel('Number of Summaries')
plt.title('Number of Summaries per Prompt')
plt.grid(axis='y')
plt.show()


In [None]:
# Whitespace-delimited word count of every prompt text (values are cast to str first)
prompts_train_df['prompt_word_count'] = prompts_train_df['prompt_text'].astype(str).str.split().str.len()

# Bar chart of the word count of each prompt text, one bar per prompt id
plt.figure(figsize=(10, 6))
plt.bar(
    prompts_train_df['prompt_id'],
    prompts_train_df['prompt_word_count'],
    color='teal',
    alpha=0.7,
)
plt.xlabel('Prompt ID')
plt.ylabel('Word Count')
plt.title('Word Count Distribution in Prompt Texts')
plt.grid(axis='y')
plt.show()



Summary of the Exploratory Data Analysis (EDA) conducted above:

Distribution of Content and Wording Scores:

Both distributions are roughly normal and centered around a score of approximately 2.5.
The dataset contains summaries of diverse quality, as indicated by the range of both low and high scores.
Word Count Distribution in Summaries:

Most summaries are concise, with word counts primarily ranging from 0 to 100.
A significant number of summaries have around 20-30 words, while longer summaries (beyond 100 words) are less common.
Number of Summaries per Prompt:

The dataset is balanced in terms of prompt representation, with each prompt having a comparable number of associated summaries.
Word Count Distribution in Prompt Texts:

Prompts vary in length, with word counts ranging from approximately 100 to over 300 words. This indicates variability in the complexity and length of the prompts provided to students.
In summary, the dataset is well-balanced in terms of prompt representation and contains summaries of varied quality and length. The prompts themselves also vary in complexity and length. This diversity is beneficial for building a robust model to predict content and wording scores for student summaries.







Based on the dataset overview and our EDA, here are some suggested data cleaning steps:

1. **Handling Missing Values**:
    - Check for missing values in the dataset.
    - Depending on the number and nature of missing values, decide whether to impute them, drop them, or replace them with placeholder values.

2. **Text Cleaning**:
    - **Lowercasing**: Convert all the text to lowercase to maintain consistency.
    - **Punctuation Removal**: Remove punctuation marks, as they might not add significant value for our modeling purposes.
    - **Stopwords Removal**: Eliminate common words that don't carry significant meaning, like "and", "the", "is", etc. (though this step might be optional based on the model we're using).
    - **Tokenization**: Break down the text into individual words or tokens.
    - **Lemmatization/Stemming**: Convert words to their base or root form. For example, "running" -> "run". This can help in reducing the dataset's dimensionality.

3. **Outliers Handling**:
    - Based on the distribution of content and wording scores, identify if there are any extreme outliers.
    - Decide on a strategy to handle these outliers – whether to cap them, transform them, or remove them entirely.

4. **Duplicate Removal**:
    - Ensure there aren't any duplicate entries in the dataset, especially in the summaries. If found, they should be removed to prevent over-representation.

5. **Standardizing Scores**:
    - If the scores (content and wording) have vastly different scales, consider standardizing or normalizing them.

6. **Encoding Categorical Data**:
    - If there are any categorical variables that need to be included in the model, consider encoding them (e.g., using one-hot encoding or label encoding).

7. **Handling Imbalanced Data**:
    - If the dataset is imbalanced with respect to some categories (e.g., a specific prompt having significantly fewer summaries), consider strategies to balance it, like oversampling, undersampling, or using synthetic data generation techniques.

8. **Text Length Consistency**:
    - Since the word count of summaries varies, consider setting a consistent length for model input (e.g., using padding for shorter texts or truncating longer ones).

9. **Spell Check and Correction**:
    - Since the summaries are written by students, there may be spelling mistakes. Depending on the objective, consider running a spell-check and correction.

Once the data is cleaned and preprocessed, it will be in a more suitable format for modeling and further analysis. Remember, the choice of specific cleaning steps can also depend on the model being used. For instance, models like BERT can handle stopwords and punctuation efficiently, so removing them might not be necessary.

2. Text Cleaning

For this step, we'll focus on the following tasks:

- Convert all text to lowercase for consistency.
- Remove punctuation.
- Tokenize the text (split it into individual words).
- Apply lemmatization to reduce words to their base form.

Let's start by cleaning the text column in the summaries_train_df dataframe.

In [None]:
'''import nltk
import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet'''


In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import spacy

# spaCy English pipeline; clean_text uses it to tokenise and lemmatise text
nlp = spacy.load("en_core_web_sm")

# Stop words come from the attached dataset file; every line of the file becomes one set entry
with open("/kaggle/input/nltk-english-stopwords/nltk_eng_stopwords.csv", "r") as stopword_file:
    stop_words = set(stopword_file.read().splitlines())

# NLTK WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Normalise raw text: lowercase, strip punctuation, drop stop words, lemmatise.

    Parameters:
    - text (str): The raw text to clean.

    Returns:
    - str: The spaCy lemmas of all tokens whose text is not in ``stop_words``,
      joined with single spaces.
    """
    # Lowercase, then delete every character listed in string.punctuation in one pass
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    doc = nlp(text)
    # Membership must be tested on the token's text: stop_words holds plain
    # strings, and a spaCy Token object is never equal to a str, so testing the
    # Token itself would let every stop word through.
    # NOTE(review): punctuation is stripped before this check, so stop-word
    # entries containing an apostrophe (if the file has any) can no longer match.
    tokens = [word.lemma_ for word in doc if word.text not in stop_words]
    return ' '.join(tokens)


In [None]:

# Clean the summary text of the train and test summaries
for summaries_frame in (summaries_train_df, summaries_test_df):
    summaries_frame['cleaned_text'] = summaries_frame['text'].apply(clean_text)

# Clean the question and the text of the train and test prompts
for prompts_frame in (prompts_train_df, prompts_test_df):
    prompts_frame['cleaned_prompt_question'] = prompts_frame['prompt_question'].apply(clean_text)
    prompts_frame['cleaned_prompt_text'] = prompts_frame['prompt_text'].apply(clean_text)




Given that the values don't show extreme deviations, we may not have significant outliers. 

No duplicate records were found. From here on, the remaining data cleaning steps are skipped, except for the spell check.

# Feature Engineering

In [None]:
from transformers import BertTokenizer

In [None]:
#!pip install transformers

from transformers import BertTokenizer

# Tokenizers already loaded by truncate_string, keyed by model path, so the
# vocabulary is read from disk once instead of once per call.
_TRUNCATE_TOKENIZER_CACHE = {}


def truncate_string(input_string, max_length=510, model_name='/kaggle/input/huggingface-bert/bert-base-uncased/', tokenizer=None):
    """
    Truncate the input string to at most ``max_length`` tokenizer tokens.

    The string is tokenized, cut to the first ``max_length`` tokens and converted
    back to a string. The tokenize/detokenize round trip is applied even when no
    truncation is needed.

    Parameters:
    - input_string (str): The input string to be truncated.
    - max_length (int): Maximum number of tokens to keep. Default is 510
      (presumably BERT's 512-token limit minus the two special tokens added
      when the text is encoded later -- confirm against the model in use).
    - model_name (str): Path or name passed to ``BertTokenizer.from_pretrained``.
      Default is the local copy at '/kaggle/input/huggingface-bert/bert-base-uncased/'.
    - tokenizer: Optional, already-loaded tokenizer exposing ``tokenize`` and
      ``convert_tokens_to_string``. When given, ``model_name`` is ignored.

    Returns:
    - truncated_string (str): The truncated string.
    """
    if tokenizer is None:
        # Load each tokenizer only once; this function is called once per row
        tokenizer = _TRUNCATE_TOKENIZER_CACHE.get(model_name)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained(model_name)
            _TRUNCATE_TOKENIZER_CACHE[model_name] = tokenizer

    # Slicing is a no-op when the sequence is already short enough
    tokens = tokenizer.tokenize(input_string)[:max_length]

    # Convert the (possibly truncated) token sequence back to a string
    truncated_string = tokenizer.convert_tokens_to_string(tokens)

    return truncated_string


In [None]:
import torch
from transformers import BertTokenizer, BertModel
# Local (offline) directory holding the bert-base-uncased checkpoint
BERT_MODEL_DIR = "/kaggle/input/huggingface-bert/bert-base-uncased/"

# The encoder and its matching tokenizer are loaded from the same directory
model = BertModel.from_pretrained(BERT_MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_DIR)


def get_distilbert_embedding(text):
    """
    Return a mean-pooled embedding of ``text`` as a NumPy array.

    Despite the function name, the module-level ``tokenizer`` and ``model`` used
    here are loaded from the bert-base-uncased directory (BertTokenizer /
    BertModel), not from a DistilBERT checkpoint.

    Parameters:
    - text (str): The text to embed; it is first shortened with ``truncate_string``.

    Returns:
    - embeddings (numpy.ndarray): Mean over the token axis of the model's last
      hidden state, with size-1 dimensions squeezed away.
    """
    text = truncate_string(text)
    # truncation=True enforces the 512-token limit at encoding time instead of
    # relying on the detokenize/retokenize round trip in truncate_string to
    # reproduce exactly the same number of tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
    # Average the last hidden state over dim 1 (the token axis), then drop the batch axis
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings



In [None]:
import numpy as np
import torch

In [None]:
# Embed every cleaned summary (one model forward pass per row)
summaries_train_df['summary_embeddings'] = summaries_train_df['cleaned_text'].apply(get_distilbert_embedding)
summaries_test_df['summary_embeddings'] = summaries_test_df['cleaned_text'].apply(get_distilbert_embedding)


# Prompt = cleaned question followed by cleaned text. The explicit ' ' separator
# keeps the last word of the question from being fused with the first word of
# the text, which would distort both the tokenization and the word count.
prompts_train_df['prompt'] = prompts_train_df['cleaned_prompt_question'] + ' ' + prompts_train_df['cleaned_prompt_text']
prompts_train_df['prompt_embeddings'] = prompts_train_df['prompt'].apply(get_distilbert_embedding)
# Attach the prompt columns to every summary row.
# NOTE: this rebinds summaries_train_df, so re-running the cell merges again.
summaries_train_df = pd.merge(summaries_train_df, prompts_train_df, how="inner", on="prompt_id")


# Same steps for the test prompts and summaries
prompts_test_df['prompt'] = prompts_test_df['cleaned_prompt_question'] + ' ' + prompts_test_df['cleaned_prompt_text']
prompts_test_df['prompt_embeddings'] = prompts_test_df['prompt'].apply(get_distilbert_embedding)
# NOTE(review): with how="inner", a test summary whose prompt_id is missing from
# prompts_test_df would be dropped and therefore absent from the submission.
summaries_test_df = pd.merge(summaries_test_df, prompts_test_df, how="inner", on="prompt_id")


In [None]:
def compute_cosine_similarity(embedding1, embedding2):
    """
    Compute the cosine similarity between two embeddings.

    Each embedding is reshaped into a single-row 2-D array, the
    (samples, features) layout that ``cosine_similarity`` expects, and the only
    entry of the resulting 1x1 matrix is returned.
    """
    row_a = np.asarray(embedding1).reshape(1, -1)
    row_b = np.asarray(embedding2).reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

In [None]:
# Whitespace-delimited word counts of the cleaned summary and of its prompt (training rows)
summaries_train_df['summary_size'] = summaries_train_df['cleaned_text'].astype(str).str.split().str.len()
summaries_train_df['prompt_size'] = summaries_train_df['prompt'].astype(str).str.split().str.len()


In [None]:
# Whitespace-delimited word counts of the cleaned summary and of its prompt (test rows)
summaries_test_df['summary_size'] = summaries_test_df['cleaned_text'].astype(str).str.split().str.len()
summaries_test_df['prompt_size'] = summaries_test_df['prompt'].astype(str).str.split().str.len()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
def _row_similarity(row):
    """Cosine similarity between the summary and prompt embeddings of one row."""
    return compute_cosine_similarity(row['summary_embeddings'], row['prompt_embeddings'])


# Row-wise similarity feature for the train and test frames
summaries_train_df['cosine_similarity'] = summaries_train_df.apply(_row_similarity, axis=1)

summaries_test_df['cosine_similarity'] = summaries_test_df.apply(_row_similarity, axis=1)


In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the merged training rows for validation; the fixed
# random_state makes the split repeatable
train_data, validation_data = train_test_split(
    summaries_train_df,
    test_size=0.2,
    random_state=42,
)

train_data.shape, validation_data.shape


In [None]:
# Renumber both splits from 0; reset_index() keeps the old labels in an 'index' column
train_data, validation_data = (
    split.reset_index() for split in (train_data, validation_data)
)

### Model Development

In [None]:
# Inspect the summary embedding stored under index label 0
train_data.loc[0, "summary_embeddings"]

In [None]:
# Stack the per-row summary embeddings into a single array
np.stack(train_data["summary_embeddings"].to_numpy())

In [None]:
# Inspect the prompt embedding stored under index label 0
train_data.loc[0, 'prompt_embeddings']

In [None]:
# Stacked summary embeddings with any size-1 axes removed
np.stack(train_data["summary_embeddings"].to_numpy()).squeeze()

In [None]:
# Stack the per-row summary embeddings into a single array (repeats the earlier check)
np.stack(train_data["summary_embeddings"].to_numpy())

In [None]:
# Raw array of per-row embedding arrays, before stacking
train_data["summary_embeddings"].to_numpy()

In [None]:
# np.stack applied to the scalar prompt_size column
np.stack(train_data["prompt_size"].to_numpy())

In [None]:
# Model inputs in the order: summary embedding, prompt embedding, summary size,
# prompt size, cosine similarity; every column goes through np.stack
X_train = [
    np.stack(train_data[feature].values)
    for feature in (
        "summary_embeddings",
        "prompt_embeddings",
        "summary_size",
        "prompt_size",
        "cosine_similarity",
    )
]

In [None]:
# Number of training targets
train_data["content"].values.shape[0]

In [None]:
'''import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate

# Compute the cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    # Reshape the embeddings to 2D (samples, features) for cosine_similarity function
    embedding1 = np.reshape(embedding1, (1, -1))
    embedding2 = np.reshape(embedding2, (1, -1))
    return cosine_similarity(embedding1, embedding2)[0][0]

# Modified Model definition
def build_model(embedding_size):
    # Input layers for summary and prompt embeddings
    summary_input = Input(shape=(embedding_size,), name="summary_embedding")
    prompt_input = Input(shape=(embedding_size,), name="prompt_embedding")
    summary_size_input = Input(shape=(1,), name="summary_size")
    prompt_size_input = Input(shape=(1,), name="prompt_size")
    cosine_similarity_input = Input(shape=(1,), name="cosine_similarity")

    # Concatenate the embeddings and the new inputs
    merged_input = Concatenate()([summary_input, prompt_input, summary_size_input, prompt_size_input, cosine_similarity_input])

    x = Dense(512, activation='relu')(merged_input)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)

    # Output layers for content and wording scores
    content_output = Dense(1, name="content")(x)
    wording_output = Dense(1, name="wording")(x)

    # Compile the model
    model = Model(inputs=[summary_input, prompt_input, summary_size_input, prompt_size_input, cosine_similarity_input], 
                  outputs=[content_output, wording_output])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return model
embedding_size = train_data["summary_embeddings"].iloc[0].shape[0]

model = build_model(embedding_size)

# Prepare Training Data
X_train = [
    np.stack(train_data["summary_embeddings"].values),
    np.stack(train_data["prompt_embeddings"].values),
    np.stack(train_data["summary_size"].values),
    np.stack(train_data["prompt_size"].values),
    np.stack(train_data["cosine_similarity"].values)
]
Y_train = [train_data["content"].values, train_data["wording"].values]

# Prepare Validation Data
X_val = [
    np.stack(validation_data["summary_embeddings"].values),
    np.stack(validation_data["prompt_embeddings"].values),
    np.stack(validation_data["summary_size"].values),
   np.stack(validation_data["prompt_size"].values),
   np.stack(validation_data["cosine_similarity"].values)
]
Y_val = [validation_data["content"].values, validation_data["wording"].values]

# Train the model
history = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=50, batch_size=32, verbose=1)'''


In [None]:
from tensorflow.keras.layers import BatchNormalization


In [None]:
from tensorflow.keras.layers import Activation


In [None]:
# Embedding columns are stacked into 2-D arrays; scalar columns are passed as 1-D arrays
X_train = [
    np.stack(train_data[embedding_column].values)
    for embedding_column in ("summary_embeddings", "prompt_embeddings")
] + [
    train_data[scalar_column].values
    for scalar_column in ("summary_size", "prompt_size", "cosine_similarity")
]

In [None]:
# Length of the summary embedding stored under index label 0
len(train_data.loc[0, "summary_embeddings"])

In [None]:
# Raw array of per-row summary embedding arrays
train_data["summary_embeddings"].to_numpy()

In [None]:
# Summary word counts as a NumPy array
train_data["summary_size"].to_numpy()

In [None]:
# The summary embedding column as a pandas Series
train_data.loc[:, "summary_embeddings"]

In [None]:

def _model_inputs(frame):
    """Return the five model inputs for ``frame``: stacked embeddings, then the scalar features."""
    stacked = [
        np.stack(frame[embedding_column].values)
        for embedding_column in ("summary_embeddings", "prompt_embeddings")
    ]
    scalars = [
        frame[scalar_column].values
        for scalar_column in ("summary_size", "prompt_size", "cosine_similarity")
    ]
    return stacked + scalars


def _model_targets(frame):
    """Return the two regression targets for ``frame``: content first, then wording."""
    return [frame["content"].values, frame["wording"].values]


# Inputs and targets for the training and validation splits
X_train, Y_train = _model_inputs(train_data), _model_targets(train_data)

X_val, Y_val = _model_inputs(validation_data), _model_targets(validation_data)

#### Hyperparameter Tuning

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate

In [None]:
from kerastuner import RandomSearch

def build_model(hp):
    """
    Build and compile the two-output regression network for one tuner trial.

    Parameters:
    - hp: Hyperparameter container supplied by the tuner. This function samples
      'units_1' (width of the first Dense layer), 'dropout_rate' (shared by every
      Dropout layer) and 'optimizer' from it.

    Returns:
    - model: Compiled Keras Model with inputs [summary_embedding,
      prompt_embedding, summary_size, prompt_size, cosine_similarity] and
      outputs [content, wording], using MSE as both loss and metric.
    """
    # Embedding inputs (768-dimensional vectors), concatenated into one tensor
    summary_input = Input(shape=(768,), name="summary_embedding")
    prompt_input = Input(shape=(768,), name="prompt_embedding")
    merged_embeddings = Concatenate()([summary_input, prompt_input])

    # Search space: width of the first Dense layer and the shared dropout rate
    units_1 = hp.Int('units_1', min_value=512, max_value=2560, step=256)
    dropout_rate = hp.Float('dropout_rate', min_value=0.0, max_value=0.5, step=0.1)

    # Embedding branch: two Dense+Dropout stages, the second with half the units.
    # The output of the SECOND stage is what gets concatenated below. Taking the
    # first stage's output instead would leave the second stage disconnected
    # from the model outputs, i.e. built but never used or trained.
    x = Dense(units_1, activation='relu')(merged_embeddings)
    x = Dropout(dropout_rate)(x)
    units_2 = units_1 // 2
    x = Dense(units_2, activation='relu')(x)
    embeddings_processed = Dropout(dropout_rate)(x)

    # Scalar inputs: BatchNormalization followed by a sigmoid squashes the
    # size features into the (0, 1) range
    summary_size_input = Input(shape=(1,), name="summary_size")
    normalized_summary_size = BatchNormalization()(summary_size_input)
    scaled_summary_size = Activation('sigmoid')(normalized_summary_size)

    prompt_size_input = Input(shape=(1,), name="prompt_size")
    normalized_prompt_size = BatchNormalization()(prompt_size_input)
    scaled_prompt_size = Activation('sigmoid')(normalized_prompt_size)

    # The cosine similarity is passed through unscaled
    cosine_similarity_input = Input(shape=(1,), name="cosine_similarity")

    # Concatenate the processed embeddings and the scalar features
    merged_input = Concatenate()([embeddings_processed, scaled_summary_size, scaled_prompt_size, cosine_similarity_input])

    # Shared head: two more Dense+Dropout stages, halving the width each time
    units_3 = units_2 // 2
    x = Dense(units_3, activation='relu')(merged_input)
    x = Dropout(dropout_rate)(x)

    units_4 = units_3 // 2
    x = Dense(units_4, activation='relu')(x)
    x = Dropout(dropout_rate)(x)

    # One linear output unit per target score
    content_output = Dense(1, name="content")(x)
    wording_output = Dense(1, name="wording")(x)

    model = Model(inputs=[summary_input, prompt_input, summary_size_input, prompt_size_input, cosine_similarity_input],
                  outputs=[content_output, wording_output])

    # Optimizer hyperparameter
    optimizer = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])

    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    return model

# Random search over the space declared in build_model, ranked by validation loss
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=20,  # number of hyperparameter combinations to try
    directory='keras_tuner_directory',
    project_name='keras_tuner_demo',
)

# Show which hyperparameters will be searched
tuner.search_space_summary()

# Run the search; every trial trains with the settings below
search_settings = dict(validation_data=(X_val, Y_val), epochs=50, batch_size=32)
tuner.search(X_train, Y_train, **search_settings)

# Show the outcome of the trials
tuner.results_summary()

# Keep the top-ranked model of the search
best_model = tuner.get_best_models(num_models=1)[0]


### Get the best hyperparameters

In [None]:
# Top-ranked hyperparameter set found by the search
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the header and the values on separate lines
print("Best Hyperparameters:", best_hyperparameters.values, sep="\n")

In [None]:
# Pull the three tuned values out of the best hyperparameter set
lv_units_1, lv_dropout_rate, lv_optimizer = (
    best_hyperparameters[hp_name] for hp_name in ('units_1', 'dropout_rate', 'optimizer')
)


In [None]:
from kerastuner import RandomSearch

def model_instance(lv_units_1,lv_dropout_rate,lv_optimizer):
    """
    Build and compile the two-output regression network for fixed hyperparameters.

    Parameters:
    - lv_units_1 (int): Width of the first Dense layer; each later Dense stage
      uses half the width of the previous one.
    - lv_dropout_rate (float): Rate used by every Dropout layer.
    - lv_optimizer: Optimizer passed to ``model.compile``.

    Returns:
    - model: Compiled Keras Model with inputs [summary_embedding,
      prompt_embedding, summary_size, prompt_size, cosine_similarity] and
      outputs [content, wording], using MSE as both loss and metric.
    """
    # Embedding inputs (768-dimensional vectors), concatenated into one tensor
    summary_input = Input(shape=(768,), name="summary_embedding")
    prompt_input = Input(shape=(768,), name="prompt_embedding")
    merged_embeddings = Concatenate()([summary_input, prompt_input])

    # Embedding branch: two Dense+Dropout stages, the second with half the units.
    # The output of the SECOND stage is what gets concatenated below. Taking the
    # first stage's output instead would leave the second stage disconnected
    # from the model outputs, i.e. built but never used or trained.
    x = Dense(lv_units_1, activation='relu')(merged_embeddings)
    x = Dropout(lv_dropout_rate)(x)
    units_2 = lv_units_1 // 2
    x = Dense(units_2, activation='relu')(x)
    embeddings_processed = Dropout(lv_dropout_rate)(x)

    # Scalar inputs: BatchNormalization followed by a sigmoid squashes the
    # size features into the (0, 1) range
    summary_size_input = Input(shape=(1,), name="summary_size")
    normalized_summary_size = BatchNormalization()(summary_size_input)
    scaled_summary_size = Activation('sigmoid')(normalized_summary_size)

    prompt_size_input = Input(shape=(1,), name="prompt_size")
    normalized_prompt_size = BatchNormalization()(prompt_size_input)
    scaled_prompt_size = Activation('sigmoid')(normalized_prompt_size)

    # The cosine similarity is passed through unscaled
    cosine_similarity_input = Input(shape=(1,), name="cosine_similarity")

    # Concatenate the processed embeddings and the scalar features
    merged_input = Concatenate()([embeddings_processed, scaled_summary_size, scaled_prompt_size, cosine_similarity_input])

    # Shared head: two more Dense+Dropout stages, halving the width each time
    units_3 = units_2 // 2
    x = Dense(units_3, activation='relu')(merged_input)
    x = Dropout(lv_dropout_rate)(x)

    units_4 = units_3 // 2
    x = Dense(units_4, activation='relu')(x)
    x = Dropout(lv_dropout_rate)(x)

    # One linear output unit per target score
    content_output = Dense(1, name="content")(x)
    wording_output = Dense(1, name="wording")(x)

    model = Model(inputs=[summary_input, prompt_input, summary_size_input, prompt_size_input, cosine_similarity_input],
                  outputs=[content_output, wording_output])

    model.compile(optimizer=lv_optimizer, loss='mse', metrics=['mse'])
    return model


In [None]:
# Rebuild the network with the tuned hyperparameters
lv_model_instance = model_instance(lv_units_1, lv_dropout_rate, lv_optimizer)

# Fit on the training split and track the validation split after every epoch
history = lv_model_instance.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=50,
    batch_size=32,
    verbose=1,
)


#### Neural Network 

In [None]:
# Last-epoch validation MSE of each output head, read from the training history
# under the keys 'val_content_mse' and 'val_wording_mse'.
# NOTE(review): the variable names say "mae", but the values read here are mean
# squared errors, not mean absolute errors.
final_mae_content = history.history['val_content_mse'][-1]
final_mae_wording = history.history['val_wording_mse'][-1]

final_mae_content, final_mae_wording

In [None]:
# Identifier of every test summary (the 'student_id' column of summaries_test_df)
student_ids = summaries_test_df["student_id"].values

# Test inputs in the same order as the training inputs
test_feature_columns = (
    "summary_embeddings",
    "prompt_embeddings",
    "summary_size",
    "prompt_size",
    "cosine_similarity",
)
X_test = [np.stack(summaries_test_df[column].values) for column in test_feature_columns]

# The model returns one prediction array per output head
content_preds, wording_preds = lv_model_instance.predict(X_test)

# One row per test summary: id plus both flattened predictions, written as CSV
submission_columns = {
    'student_id': student_ids,
    'content': content_preds.flatten(),
    'wording': wording_preds.flatten(),
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission.csv", index=False)
