In [36]:
pip install pandas numpy tensorflow




In [37]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [38]:
# Load the CSV file
data = pd.read_csv('html data (2).csv')

# Concatenate expertise columns from 1 to 10
expertise_columns = [f'Expertise{i}' for i in range(1, 11)]
expertise = data[expertise_columns].apply(lambda row: ' '.join(row.dropna()), axis=1).values

# Separate scientist name column
scientist_names = data['Name'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(expertise)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(expertise)

# Pad sequences to have the same length
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)

# Convert scientist names to numerical labels
label_to_id = {name: i for i, name in enumerate(set(scientist_names))}
id_to_label = {i: name for name, i in label_to_id.items()}
labels = [label_to_id[name] for name in scientist_names]

# Convert labels to one-hot encoding
one_hot_labels = tf.keras.utils.to_categorical(labels)

# Split the data into training and testing sets
split_ratio = 0.8
split_index = int(split_ratio * len(padded_sequences))
train_sequences = padded_sequences[:split_index]
train_labels = one_hot_labels[:split_index]
test_sequences = padded_sequences[split_index:]
test_labels = one_hot_labels[split_index:]

# Define the CNN model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_seq_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_to_id), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_sequences, train_labels, epochs=50, batch_size=32)

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(test_sequences, test_labels)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

# Predict the scientist names for new expertise data
new_expertise = input("Enter Expertise: ")

# Iterate over each expertise column and print only the names for the given expertise
all_predicted_names = set()

for col in expertise_columns:
    data_copy = data.copy()
    data_copy.loc[data_copy[col] != new_expertise, col] = np.nan
    new_expertise_concatenated = data_copy[expertise_columns].apply(lambda row: ' '.join(row.dropna()), axis=1).values
    new_sequences = tokenizer.texts_to_sequences(new_expertise_concatenated)
    new_padded_sequences = pad_sequences(new_sequences, maxlen=max_seq_length)
    predictions = model.predict(new_padded_sequences)

    # Filter out scientists who do not have the given expertise
    relevant_indices = np.where(data_copy[col] == new_expertise)[0]
    
    if len(relevant_indices) > 0:
        relevant_predictions = predictions[relevant_indices]
        # Append the predicted scientist names for the given expertise
        predicted_labels = np.argmax(relevant_predictions, axis=1)
        predicted_names = [id_to_label[label] for label in predicted_labels]
        all_predicted_names.update(predicted_names)

# Print only the names for the given expertise
print(f'Expertise: {new_expertise}  -->  Scientist Names: {list(all_predicted_names)}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 15.7453
Test accuracy: 0.0000
Enter Expertise: Air Pollution
Expertise: Air Pollution  -->  Scientist Names: ['S. Balaji', 'K. Krishnamurthi', 'N. K. Labhasetwar', 'T.V.B.P.S. RamaKrishna', 'B. P. S. Rao', 'P. V. Nidhesh', 'S. Kaur', 'R. Sivacoumar', 'D. D. Majumdar', 'Papiya Mandal', 'A. Middey', 'H. Bherwani', 'S. D. Sontakke', 'S. Kamble', 'S. K. Goyal', 'A. Lalwani', 'A. Gupta', 'S. Pramanik