<a href="https://colab.research.google.com/github/satbit007/Machine-Learning-Driven-Approach-to-Automated-Essay-Scoring-AES-/blob/main/IML_FinalProject_SatyakiChoudhury.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Necessary Libraries

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


Load and Preprocess the Data

In [13]:
# Load the labelled training essays (tab-separated, ISO-8859-1 encoded)
train_df = pd.read_csv('/content/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')

# Load validation data (same file format)
valid_df = pd.read_csv('/content/valid_set.tsv', sep='\t', encoding='ISO-8859-1')

# After loading the training data
print("Column names in training data:", train_df.columns)

# After loading the validation data
print("Column names in validation data:", valid_df.columns)

# Display the first few rows of the training data
print(train_df.head())

# Display the first few rows of the validation data
print(valid_df.head())

# Basic preprocessing
import re
from nltk.stem import WordNetLemmatizer

# WordNet backs the lemmatizer used below. The stopword corpus is fetched here
# as well because the TF-IDF cell calls stopwords.words('english') before any
# other cell downloads it, which would fail on a fresh kernel.
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess_text(text):
    """Lowercase an essay, strip its punctuation and lemmatize each word.

    Args:
        text: Raw essay text. Non-string values (for example the NaN that
            pandas uses for a missing essay) are returned unchanged so a
            later ``dropna`` can discard them instead of this function
            raising ``AttributeError``.

    Returns:
        The cleaned essay as a single space-separated string, or ``text``
        itself when it is not a string.
    """
    # This function is applied before rows with missing essays are dropped.
    if not isinstance(text, str):
        return text

    # Lowercasing
    text = text.lower()

    # Remove punctuations (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Lemmatization, word by word on whitespace-separated tokens
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# Clean the essay column of both data sets with the same routine
for _frame in (train_df, valid_df):
    _frame['essay'] = _frame['essay'].apply(preprocess_text)

Column names in training data: Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6'],
      dtype='object')
Column names in validation data: Index(['essay_id', 'essay_set', 'essay', 'domain1_predictionid',
       'domain2_predictionid'],
      dtype='object')
   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Feature Extraction

In [19]:
# Drop rows lacking an essay or a domain-1 score, then renumber rows from 0
# before TF-IDF vectorization
train_df = train_df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)

# TF-IDF Vectorization: fit the vocabulary and IDF weights once on the
# training essays, then map the validation essays into the same feature space.
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000)
X_train_tfidf = tfidf.fit_transform(train_df['essay'])
X_valid_tfidf = tfidf.transform(valid_df['essay'])

# Prepare target variable for training data
y_train = train_df['domain1_score']

# Sanity check: features and targets must have the same number of rows
print("Number of rows in the feature set:", X_train_tfidf.shape[0])
print("Number of rows in the target variable:", y_train.shape[0])


Number of rows in the feature set: 12976
Number of rows in the target variable: 12976


Model Training - RandomForestRegressor

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Assuming 'domain1_score' as the target for training data
y_train = train_df['domain1_score']

# Split the training data into new training and validation subsets.
# NOTE(review): X_train_tfidf was fitted on every row before this split, so the
# held-out rows already influenced the vocabulary and IDF weights; consider
# fitting the vectorizer on the training subset only.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42
)

# Train the RandomForestRegressor model on the new training subset.
# The forest is seeded (same seed as the split) so the reported metrics are
# reproducible across re-runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

Model Validation - RandomForestRegressor

In [25]:
# Score the held-out subset with the fitted model
y_pred_val = model.predict(X_val_split)

# Compare the predictions against the true scores
mse, r2 = (
    mean_squared_error(y_val_split, y_pred_val),
    r2_score(y_val_split, y_pred_val),
)

# Print the evaluation metrics for the validation subset
print(f'Mean Squared Error on validation set: {mse}')
print(f'R^2 Score on validation set: {r2}')


Mean Squared Error on validation set: 6.4674604455572675
R^2 Score on validation set: 0.9169324892912425


Updated feature extraction with Bag-of-words and Syntactic parsing.

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter
from scipy.sparse import hstack
import re

# Fetch the NLTK resources this section relies on. The stopword download is
# kept as the final expression so the cell still displays its return value.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Read the training essays from the tab-separated file (ISO-8859-1 encoded)
train_df = pd.read_csv(
    '/content/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Preprocess text function
def preprocess_text(text):
    """Lowercase an essay and strip every punctuation character.

    Args:
        text: Raw essay text. Non-string values (for example the NaN that
            pandas uses for a missing essay) are returned unchanged so a
            later ``dropna`` can discard them instead of this function
            raising ``AttributeError``.

    Returns:
        The lowercased text with everything except word characters and
        whitespace removed, or ``text`` itself when it is not a string.
    """
    # This function is applied before rows with missing essays are dropped.
    if not isinstance(text, str):
        return text
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every essay with preprocess_text
train_df['essay'] = train_df['essay'].apply(preprocess_text)

# Discard rows lacking an essay or a domain-1 score, then renumber rows from 0
train_df = train_df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)


In [31]:
# TF-IDF features: English stopwords removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['essay'])

# Bag-of-Words (raw term count) features with the same settings
bow_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_bow = bow_vectorizer.fit_transform(train_df['essay'])

# Function to extract syntactic features (POS tag frequencies)
def syntactic_features(text):
    """Count how often each part-of-speech tag occurs in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the result is a ``Counter`` keyed by tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return Counter(tag for _token, tag in tagged_tokens)

# Apply syntactic feature extraction
train_df['syntactic_features'] = train_df['essay'].apply(syntactic_features)

# Convert syntactic feature counts to DataFrame (one column per POS tag;
# tags absent from an essay count as 0)
syntactic_df = pd.DataFrame.from_records(train_df['syntactic_features']).fillna(0)

# Normalize counts to per-essay proportions. An essay with no tokens has a row
# total of 0; dividing by 1 instead keeps that row all-zero rather than
# producing NaN (0/0) features.
row_totals = syntactic_df.sum(axis=1).replace(0, 1)
syntactic_df = syntactic_df.div(row_totals, axis=0)

# Combine Bag-of-Words, TF-IDF, and Syntactic features
X_train_combined = hstack([X_train_bow, X_train_tfidf, syntactic_df])


In [32]:
# Prepare target variable for training data
y_train = train_df['domain1_score']

# Split the combined feature set into training and validation subsets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_combined, y_train, test_size=0.2, random_state=42
)

# Train the RandomForestRegressor model. The forest is seeded (same seed as
# the split) so the reported metrics are reproducible across re-runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)


In [33]:
# Score the held-out subset with the fitted model
y_pred_val = model.predict(X_val_split)

# Compare the predictions against the true scores
mse, r2 = (
    mean_squared_error(y_val_split, y_pred_val),
    r2_score(y_val_split, y_pred_val),
)

# Print the evaluation metrics for the validation subset
print(f'Mean Squared Error on validation set: {mse}')
print(f'R^2 Score on validation set: {r2}')


Mean Squared Error on validation set: 5.833537182470896
R^2 Score on validation set: 0.9250745456498757


Model changed to Ridge Regression

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter
from scipy.sparse import hstack
import re

# Fetch the NLTK resources this section relies on. The stopword download is
# kept as the final expression so the cell still displays its return value.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
# Read the training essays from the tab-separated file (ISO-8859-1 encoded)
train_df = pd.read_csv(
    '/content/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Preprocess text function
def preprocess_text(text):
    """Lowercase an essay and strip every punctuation character.

    Args:
        text: Raw essay text. Non-string values (for example the NaN that
            pandas uses for a missing essay) are returned unchanged so a
            later ``dropna`` can discard them instead of this function
            raising ``AttributeError``.

    Returns:
        The lowercased text with everything except word characters and
        whitespace removed, or ``text`` itself when it is not a string.
    """
    # This function is applied before rows with missing essays are dropped.
    if not isinstance(text, str):
        return text
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every essay with preprocess_text
train_df['essay'] = train_df['essay'].apply(preprocess_text)

# Discard rows lacking an essay or a domain-1 score, then renumber rows from 0
train_df = train_df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)


In [36]:
# TF-IDF features: English stopwords removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['essay'])

# Bag-of-Words (raw term count) features with the same settings
bow_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_bow = bow_vectorizer.fit_transform(train_df['essay'])

# Function to extract syntactic features (POS tag frequencies)
def syntactic_features(text):
    """Count how often each part-of-speech tag occurs in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the result is a ``Counter`` keyed by tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return Counter(tag for _token, tag in tagged_tokens)

# Apply syntactic feature extraction
train_df['syntactic_features'] = train_df['essay'].apply(syntactic_features)

# Convert syntactic feature counts to DataFrame (one column per POS tag;
# tags absent from an essay count as 0)
syntactic_df = pd.DataFrame.from_records(train_df['syntactic_features']).fillna(0)

# Normalize counts to per-essay proportions. An essay with no tokens has a row
# total of 0; dividing by 1 instead keeps that row all-zero rather than
# producing NaN (0/0) features.
row_totals = syntactic_df.sum(axis=1).replace(0, 1)
syntactic_df = syntactic_df.div(row_totals, axis=0)

# Combine TF-IDF, Bag-of-Words, and Syntactic features
X_train_combined = hstack([X_train_tfidf, X_train_bow, syntactic_df])


In [37]:
# Target column: the domain-1 score of each essay
y_train = train_df['domain1_score']

# Hold out 20% of the rows (fixed seed) for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_combined,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a Ridge regressor (alpha=1.0) on the training subset
model = Ridge(alpha=1.0)
model.fit(X_train_split, y_train_split)


In [38]:
# Score the held-out subset with the fitted model
y_pred_val = model.predict(X_val_split)

# Compare the predictions against the true scores
mse, r2 = (
    mean_squared_error(y_val_split, y_pred_val),
    r2_score(y_val_split, y_pred_val),
)

# Print the evaluation metrics for the validation subset
print(f'Mean Squared Error on validation set: {mse}')
print(f'R^2 Score on validation set: {r2}')


Mean Squared Error on validation set: 15.455384644464596
R^2 Score on validation set: 0.8014923569661805


Model changed to SVM

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter
from scipy.sparse import hstack
import re

# Fetch the NLTK resources this section relies on. The stopword download is
# kept as the final expression so the cell still displays its return value.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
# Read the training essays from the tab-separated file (ISO-8859-1 encoded)
train_df = pd.read_csv(
    '/content/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Preprocess text function
def preprocess_text(text):
    """Lowercase an essay and strip every punctuation character.

    Args:
        text: Raw essay text. Non-string values (for example the NaN that
            pandas uses for a missing essay) are returned unchanged so a
            later ``dropna`` can discard them instead of this function
            raising ``AttributeError``.

    Returns:
        The lowercased text with everything except word characters and
        whitespace removed, or ``text`` itself when it is not a string.
    """
    # This function is applied before rows with missing essays are dropped.
    if not isinstance(text, str):
        return text
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every essay with preprocess_text
train_df['essay'] = train_df['essay'].apply(preprocess_text)

# Discard rows lacking an essay or a domain-1 score, then renumber rows from 0
train_df = train_df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)


In [42]:
# TF-IDF features: English stopwords removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['essay'])

# Bag-of-Words (raw term count) features with the same settings
bow_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_bow = bow_vectorizer.fit_transform(train_df['essay'])

# Function to extract syntactic features (POS tag frequencies)
def syntactic_features(text):
    """Count how often each part-of-speech tag occurs in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the result is a ``Counter`` keyed by tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return Counter(tag for _token, tag in tagged_tokens)

# Apply syntactic feature extraction
train_df['syntactic_features'] = train_df['essay'].apply(syntactic_features)

# Convert syntactic feature counts to DataFrame (one column per POS tag;
# tags absent from an essay count as 0)
syntactic_df = pd.DataFrame.from_records(train_df['syntactic_features']).fillna(0)

# Normalize counts to per-essay proportions. An essay with no tokens has a row
# total of 0; dividing by 1 instead keeps that row all-zero rather than
# producing NaN (0/0) features.
row_totals = syntactic_df.sum(axis=1).replace(0, 1)
syntactic_df = syntactic_df.div(row_totals, axis=0)

# Combine TF-IDF, Bag-of-Words, and Syntactic features
X_train_combined = hstack([X_train_tfidf, X_train_bow, syntactic_df])


In [43]:
# Target column: the domain-1 score of each essay
y_train = train_df['domain1_score']

# Hold out 20% of the rows (fixed seed) for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_combined,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a support-vector regressor (C=1.0, epsilon=0.2) on the training subset
model = SVR(C=1.0, epsilon=0.2)
model.fit(X_train_split, y_train_split)


In [44]:
# Score the held-out subset with the fitted model
y_pred_val = model.predict(X_val_split)

# Compare the predictions against the true scores
mse, r2 = (
    mean_squared_error(y_val_split, y_pred_val),
    r2_score(y_val_split, y_pred_val),
)

# Print the evaluation metrics for the validation subset
print(f'Mean Squared Error on validation set: {mse}')
print(f'R^2 Score on validation set: {r2}')


Mean Squared Error on validation set: 36.3152770811998
R^2 Score on validation set: 0.5335696765016499


Ensemble technique - averaging the predictions of the three models (SVM, Ridge Regression, and RandomForestRegressor) to combine their strengths.

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter
from scipy.sparse import hstack
import re

# Fetch the NLTK resources this section relies on. The stopword download is
# kept as the final expression so the cell still displays its return value.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# Read the training essays from the tab-separated file (ISO-8859-1 encoded)
train_df = pd.read_csv(
    '/content/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Preprocess text function
def preprocess_text(text):
    """Lowercase an essay and strip every punctuation character.

    Args:
        text: Raw essay text. Non-string values (for example the NaN that
            pandas uses for a missing essay) are returned unchanged so a
            later ``dropna`` can discard them instead of this function
            raising ``AttributeError``.

    Returns:
        The lowercased text with everything except word characters and
        whitespace removed, or ``text`` itself when it is not a string.
    """
    # This function is applied before rows with missing essays are dropped.
    if not isinstance(text, str):
        return text
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every essay with preprocess_text
train_df['essay'] = train_df['essay'].apply(preprocess_text)

# Discard rows lacking an essay or a domain-1 score, then renumber rows from 0
train_df = train_df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)


In [48]:
# TF-IDF features: English stopwords removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['essay'])

# Bag-of-Words (raw term count) features with the same settings
bow_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_bow = bow_vectorizer.fit_transform(train_df['essay'])

# Function to extract syntactic features (POS tag frequencies)
def syntactic_features(text):
    """Count how often each part-of-speech tag occurs in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the result is a ``Counter`` keyed by tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return Counter(tag for _token, tag in tagged_tokens)

# Apply syntactic feature extraction
train_df['syntactic_features'] = train_df['essay'].apply(syntactic_features)

# Convert syntactic feature counts to DataFrame (one column per POS tag;
# tags absent from an essay count as 0)
syntactic_df = pd.DataFrame.from_records(train_df['syntactic_features']).fillna(0)

# Normalize counts to per-essay proportions. An essay with no tokens has a row
# total of 0; dividing by 1 instead keeps that row all-zero rather than
# producing NaN (0/0) features.
row_totals = syntactic_df.sum(axis=1).replace(0, 1)
syntactic_df = syntactic_df.div(row_totals, axis=0)

# Combine TF-IDF, Bag-of-Words, and Syntactic features
X_train_combined = hstack([X_train_tfidf, X_train_bow, syntactic_df])


In [49]:
# Prepare target variable for training data
y_train = train_df['domain1_score']

# Split data into training and validation subsets.
# NOTE(review): this split uses the TF-IDF matrix only, although the previous
# cell also builds X_train_combined; confirm which feature set is intended.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42
)

# Initialize models. The forest is seeded (same seed as the split) so the
# ensemble's reported metrics are reproducible across re-runs.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_ridge = Ridge(alpha=1.0)
model_svr = SVR(C=1.0, epsilon=0.2)

# Train models on the same training subset
model_rf.fit(X_train_split, y_train_split)
model_ridge.fit(X_train_split, y_train_split)
model_svr.fit(X_train_split, y_train_split)


In [50]:
# Collect each model's predictions for the validation subset
preds_rf = model_rf.predict(X_val_split)
preds_ridge = model_ridge.predict(X_val_split)
preds_svr = model_svr.predict(X_val_split)

# Unweighted mean of the three prediction vectors
ensemble_preds = (preds_rf + preds_ridge + preds_svr) / 3

# Score the averaged predictions against the true scores
mse, r2 = (
    mean_squared_error(y_val_split, ensemble_preds),
    r2_score(y_val_split, ensemble_preds),
)

# Print evaluation metrics
print(f'Mean Squared Error (Ensemble) on validation set: {mse}')
print(f'R^2 Score (Ensemble) on validation set: {r2}')


Mean Squared Error (Ensemble) on validation set: 7.328811946019666
R^2 Score (Ensemble) on validation set: 0.9058693640365971


spaCy has been used for advanced NLP processing.

In [5]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.sparse import hstack
import re

# Fetch the NLTK resources this section relies on. The stopword download is
# kept as the final expression so the cell still displays its return value.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Read the training essays from the tab-separated file (ISO-8859-1 encoded)
train_df = pd.read_csv(
    '/content/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Define the preprocess_text function
def preprocess_text(text):
    """Lowercase an essay and strip every punctuation character.

    Args:
        text: Raw essay text. Non-string values (for example the NaN that
            pandas uses for a missing essay) are returned unchanged so a
            later ``dropna`` can discard them instead of this function
            raising ``AttributeError``.

    Returns:
        The lowercased text with everything except word characters and
        whitespace removed, or ``text`` itself when it is not a string.
    """
    # This function is applied before rows with missing essays are dropped.
    if not isinstance(text, str):
        return text
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every essay with preprocess_text
train_df['essay'] = train_df['essay'].apply(preprocess_text)

# Discard rows lacking an essay or a domain-1 score, then renumber rows from 0
train_df = train_df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)


In [7]:
# TF-IDF features: English stopwords removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['essay'])

# Bag-of-Words (raw term count) features with the same settings
bow_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)
X_train_bow = bow_vectorizer.fit_transform(train_df['essay'])

def syntactic_features(text):
    """Count how often each part-of-speech tag occurs in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the result is a ``Counter`` keyed by tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return Counter(tag for _token, tag in tagged_tokens)

# POS-tag counts per essay, expanded to one column per tag (missing tags -> 0)
train_df['syntactic_features'] = train_df['essay'].apply(syntactic_features)
syntactic_df = pd.DataFrame.from_records(train_df['syntactic_features']).fillna(0)

# Normalize counts to per-essay proportions. An essay with no tokens has a row
# total of 0; dividing by 1 instead keeps that row all-zero rather than
# producing NaN (0/0) features.
row_totals = syntactic_df.sum(axis=1).replace(0, 1)
syntactic_df = syntactic_df.div(row_totals, axis=0)

# Load spacy model for advanced NLP processing
nlp = spacy.load('en_core_web_sm')

def advanced_nlp_features(text, error_tags=('WRONG_TAG1', 'WRONG_TAG2')):
    """Count tokens whose spaCy fine-grained tag is listed in ``error_tags``.

    Args:
        text: Essay text, processed with the module-level ``nlp`` pipeline.
        error_tags: Tag names to count as grammatical errors. The defaults
            are the original hard-coded placeholder names.
            NOTE(review): these placeholders do not look like tags an
            English spaCy model emits, so with the defaults the count is
            presumably always 0. Pass real tag names or replace this
            heuristic before relying on the feature.

    Returns:
        A dict with the single key ``'grammatical_errors'`` holding the count.
    """
    flagged_tags = frozenset(error_tags)
    doc = nlp(text)
    grammatical_errors = sum(1 for token in doc if token.tag_ in flagged_tags)
    # Additional features based on insights can be added here
    return {'grammatical_errors': grammatical_errors}

# Run the spaCy-based extractor on every essay and expand the resulting dicts
# into a frame with one column per feature
train_df['advanced_nlp_features'] = train_df['essay'].apply(advanced_nlp_features)
advanced_nlp_df = pd.DataFrame.from_records(train_df['advanced_nlp_features'])

# Stack all feature groups side by side:
# TF-IDF | Bag-of-Words | POS proportions | spaCy features
X_train_combined = hstack(
    [X_train_tfidf, X_train_bow, syntactic_df, advanced_nlp_df]
)

In [8]:
# Target column: the domain-1 score of each essay
y_train = train_df['domain1_score']

# Hold out 20% of the rows (fixed seed) for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_combined, y_train, test_size=0.2, random_state=42
)

# Train the RandomForestRegressor model. The forest is seeded (same seed as
# the split) so the reported metrics are reproducible across re-runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)


In [9]:
# Score the held-out subset with the fitted model
y_pred_val = model.predict(X_val_split)

# Compare the predictions against the true scores
mse, r2 = (
    mean_squared_error(y_val_split, y_pred_val),
    r2_score(y_val_split, y_pred_val),
)

# Report both metrics
print(f'Mean Squared Error on validation set: {mse}')
print(f'R^2 Score on validation set: {r2}')


Mean Squared Error on validation set: 5.836116231852422
R^2 Score on validation set: 0.9250414205594446
