In [14]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset into a Pandas DataFrame
df = pd.read_csv('Q1.csv')

# Preprocess the data
df['Answer'] = df['Answer'].str.lower()
df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')
df = df.dropna()

# Tokenize and encode the text data
max_words = 10000
max_length = 128

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Answer'])
X = tokenizer.texts_to_sequences(df['Answer'])
X = pad_sequences(X, maxlen=max_length)
y = df['Ratings'].values.reshape(-1, 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network model (CNN)
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Train the neural network model
model.fit(X_train, y_train, validation_split=0.2, epochs=10)

Question = df['Question'].iloc[0]
print(f"Associated Question: {Question}\n")

input_text = "Taj Mahal is constructed in Agra. It is made up of white marble. It was built by Shah Jahan in memory of Mumtaz Mahal"
input_text = input_text.lower()
input_text = re.sub('[^\w\s]', '', input_text)
input_encoded = tokenizer.texts_to_sequences([input_text])
input_padded = pad_sequences(input_encoded, maxlen=max_length)

# Calculate the predicted rating
predicted_rating_ = model.predict(input_padded.reshape(1, max_length))[0][0]

# Calculate cosine similarity
def calculate_cosine_similarity(user_input, dataset_answers):
    similarities = cosine_similarity(user_input, dataset_answers)
    return similarities

# Find similar answers in the dataset
user_input_embedding = model.predict(input_padded)
dataset_embeddings = model.predict(X)
similarities = calculate_cosine_similarity(user_input_embedding, dataset_embeddings)

# Number of similar answers to consider
top_n = 5

# Get the indices of the top-N similar answers
similar_indices = similarities.argsort(axis=1)[:, -top_n:][:, ::-1]

# Extract ratings of similar answers
similar_ratings = df.iloc[similar_indices.ravel()]['Ratings'].values.reshape(-1, top_n)

# Calculate the weighted average of similar ratings based on cosine similarity
weighted_ratings = np.sum(similar_ratings * similarities[:, -top_n:], axis=1) / np.sum(similarities[:, -top_n:], axis=1)
predicted_rating = weighted_ratings

print(f"Predicted Rating (Weighted Average): {predicted_rating_:.2f}")

# Evaluate the model
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Calculate R-squared (R^2)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R^2): {r2:.4f}")

  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: Where was Taj Mahal built and what is it made of? And who built the monument in the memory of whom?

Predicted Rating (Weighted Average): 9.20
Mean Squared Error (MSE): 0.0021
Mean Absolute Error (MAE): 0.0376
R-squared (R^2): 0.9997


In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Calculate R-squared (R^2)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R^2): {r2:.4f}")

Mean Squared Error (MSE): 0.0021
Mean Absolute Error (MAE): 0.0376
R-squared (R^2): 0.9997


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Flatten
from keras.callbacks import EarlyStopping
import numpy as np
import re
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset into a Pandas DataFrame
csv_files = ['Q1.csv', 'Q2.csv', 'Q3.csv', 'Q4.csv', 'Q5.csv', 'Q6.csv']

# Create an empty list to store predicted ratings
predicted_ratings = []

for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    # Preprocess the data
    df['Answer'] = df['Answer'].str.lower()
    df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')
    df = df.dropna()

    # Tokenize and encode the text data
    max_words = 10000
    max_length = 128

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(df['Answer'])
    X = tokenizer.texts_to_sequences(df['Answer'])
    X = pad_sequences(X, maxlen=max_length)

    # Load the trained model
    model = Sequential()
    model.add(Embedding(max_words, 128, input_length=max_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    # Train the neural network model
    X_train, X_test, y_train, y_test = train_test_split(X, df['Ratings'].values.reshape(-1, 1), test_size=0.2, random_state=42)
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    model.fit(X_train, y_train, validation_split=0.2, epochs=10, callbacks=[early_stopping])

    # Print the associated question
    associated_question = df['Question'].iloc[0]
    print(f"Associated Question: {associated_question}\n")

    # Ask for user input
    input_text = input(f"Enter answer for {csv_file}: ")
    input_text = input_text.lower()
    input_text = re.sub('[^\w\s]', '', input_text)
    input_encoded = tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_encoded, maxlen=max_length)

    # Use the trained model to predict the rating
    predicted_rating_ = model.predict(input_padded)[0][0]

    # Calculate cosine similarity
    def calculate_cosine_similarity(user_input, dataset_answers):
        similarities = cosine_similarity(user_input, dataset_answers)
        return similarities

    # Find similar answers in the dataset
    user_input_embedding = model.predict(input_padded)
    dataset_embeddings = model.predict(X)
    similarities = calculate_cosine_similarity(user_input_embedding, dataset_embeddings)

    # Number of similar answers to consider
    top_n = 5

    # Get the indices of the top-N similar answers
    similar_indices = similarities.argsort(axis=1)[:, -top_n:][:, ::-1]

    # Extract ratings of similar answers
    similar_ratings = df.iloc[similar_indices.ravel()]['Ratings'].values.reshape(-1, top_n)

    # Calculate the weighted average of similar ratings based on cosine similarity
    weighted_ratings = np.sum(similar_ratings * similarities[:, -top_n:], axis=1) / np.sum(similarities[:, -top_n:], axis=1)
    predicted_rating = weighted_ratings

    predicted_ratings.append(predicted_rating_)

    print(f"Predicted Rating for {csv_file}: {predicted_rating_:.2f}")

    # Evaluate the model
    y_pred = model.predict(X_test)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.4f}")

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error (MAE): {mae:.4f}")

    # Calculate R-squared (R^2)
    r2 = r2_score(y_test, y_pred)
    print(f"R-squared (R^2): {r2:.4f}")

# Calculate the combined rating
combined_rating = sum(predicted_ratings)
print(f"Combined Rating for all files: {combined_rating:.2f}")

Epoch 1/10


  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: Where was Taj Mahal built and what is it made of? And who built the monument in the memory of whom?

Enter answer for Q1.csv: Taj Mahal was built in Delhi. It is made up of Black marble, It was built by Shah Jahan in the memory of Mumtaz Mahal
Predicted Rating for Q1.csv: 6.40
Mean Squared Error (MSE): 0.0015
Mean Absolute Error (MAE): 0.0275
R-squared (R^2): 0.9998


  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: What are some basic facts about France (with regard to its location, capital, currency and national language) ?

Enter answer for Q2.csv: France is located in Europe, its capital is Bern. Its currency is Rupee and French is its national language
Predicted Rating for Q2.csv: 4.28
Mean Squared Error (MSE): 0.0008
Mean Absolute Error (MAE): 0.0219
R-squared (R^2): 0.9999
Epoch 1/10


  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: What is the full form of AI, ML and DL ?

Enter answer for Q3.csv: AI is artificial intelligence, ML is money learning and DL is learning in the deep
Predicted Rating for Q3.csv: 6.46
Mean Squared Error (MSE): 0.0289
Mean Absolute Error (MAE): 0.1208
R-squared (R^2): 0.9962
Epoch 1/10


  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: Name first four planets in the solar system

Enter answer for Q4.csv: First four planets in the solar system are Mercury, Mars, Venus and Earth
Predicted Rating for Q4.csv: 6.49
Mean Squared Error (MSE): 0.0083
Mean Absolute Error (MAE): 0.0741
R-squared (R^2): 0.9989
Epoch 1/10


  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: What does CEO stand for?

Enter answer for Q5.csv: CEO stands for executive chief officer
Predicted Rating for Q5.csv: 6.01
Mean Squared Error (MSE): 0.4958
Mean Absolute Error (MAE): 0.2929
R-squared (R^2): 0.9318
Epoch 1/10


  df['Answer'] = df['Answer'].str.replace('[^\w\s]', '')


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Associated Question: Who is the father of the nation? And where was he born (both city and state)? 

Enter answer for Q6.csv: Mahatma Gandhi is the father of nation and hes born in Gujarat Porbandar
Predicted Rating for Q6.csv: 5.31
Mean Squared Error (MSE): 1.0206
Mean Absolute Error (MAE): 0.8181
R-squared (R^2): 0.8368
Combined Rating for all files: 34.96
