In [145]:
# Import necessary libraries
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import nltk
from nltk import pos_tag 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import joblib
# Download NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def load_data(file_path):
    # Load data from CSV file
    df = pd.read_csv(file_path)
    return df

def split_data(feature,labels):
    # Split the data into training and testing sets
    sentences = feature
    label = labels
    cleaned_text = preprocess_text(sentences)
    
    clean_converted = convert_input(cleaned_text)
    
    x_train, x_test, y_train, y_test = train_test_split(clean_converted, label, test_size=0.2, random_state=42)
    return x_train, x_test, y_train, y_test, tokenizer

def preprocess_text(sentences):
    # Perform necessary preprocessing steps like tokenization, stopword removal, stemming, lemmatization, etc.
    cleaned_text = []
    for text in sentences:
        cleaned_text.append(preprocess_single_text(text))
    return cleaned_text

def preprocess_single_text(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove all tokens which are non-alphabetic
    words = [word for word in tokens if word.isalpha()]
    # Lowercase all words
    words = [word.lower() for word in words]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize all words into a new list
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Return a list of words
    return ' '.join(words)

def build_model(tokenizer):
    # Build the model
    embedding_dim = 16

    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=20))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_model(model, x_train, y_train, x_test, y_test):
    # Train the model
    model.fit(x_train, y_train, epochs=10, batch_size=2, validation_data=(x_test, y_test))

def evaluate_model(model, x_test, y_test):
    # Evaluate the model
    loss, accuracy = model.evaluate(x_test, y_test)
    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

def predict_and_print_labels(model, some_input, tokenizer):
    # Predict labels for test sentences and print the results
    converted_input = convert_input(some_input)
    predictions = model.predict(converted_input)

    for i in range(len(test_sentences)):
        label = "Positive" if predictions[i] > 0.5 else "Negative"
        print(f"Text: {test_sentences[i]}, Predicted label: {label}")
    
    return predictions

    
def convert_input(test_sentences):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(test_sentences)
    sequences = tokenizer.texts_to_sequences(test_sentences)
    padded_sequences = pad_sequences(sequences, maxlen=20, padding='post', truncating='post')

    return padded_sequences


def save_model(model, filename):
    # Save the model to a file
    model.save(filename)
    print(f"Model saved to {filename}")

def load_saved_model(filename):
    # Load a saved model from a file
    loaded_model = tf.keras.models.load_model(filename)
    print(f"Model loaded from {filename}")
    return loaded_model

def create_binary_model(file_path,label_col):
    df = load_data(file_path)
    features = df['Obs']
    labels = df[label_col]
    x_train, x_test, y_train, y_test,tokenizer = split_data(features,labels)
    model = build_model(tokenizer)
    train_model(model, x_train, y_train, x_test, y_test)
    evaluate_model(model, x_test, y_test)
    save_model(model, f'trained_model_{label_col}.h5')
# loaded_model = load_saved_model('trained_model1.h5')

def create_testers(df,label_colummn,x,y):
    test_sentences = df['Obs'][x:y].tolist()
    test_labels = df[label_colummn][x:y]
    all_labels = df[['A1','A2','A3','B1','B2','B3','B4']][x:y]

    return test_sentences,test_labels,all_labels



# Calculate precision, recall, and F1-score
def calculate_metrics(y_true, y_pred):
    precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred)
    return precision, recall, f1_score

def save_results(predictions, precision, recall, f1_score, filename):
    with open(filename, "wb") as f:
        joblib.dump({"predictions": predictions, "precision": precision, "recall": recall, "f1_score": f1_score}, f)

def predict_ensemble(models, X):
    predictions = []
    for model in models:
   
        predictions.append(model.predict(X))

  # Take the average of the predictions from each model.
    

    return predictions

def convert_predictions_to_binary(predictions):
    # Convert predictions to binary
    binary_predictions = []
    for prediction in predictions:
        binary_prediction = []
        for value in prediction:
            if value > 0.5:
                binary_prediction.append(1)
            else:
                binary_prediction.append(0)
        binary_predictions.append(binary_prediction)
    return binary_predictions

    
create_binary_model('Problem_Dataset.csv','A1')



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Manoj Patil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Manoj
[nltk_data]     Patil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.3635207712650299, Test Accuracy: 0.9095237851142883
Model saved to trained_model_A1.h5


  saving_api.save_model(


In [146]:

create_binary_model('Problem_Dataset.csv','A2')
create_binary_model('Problem_Dataset.csv','A3')
create_binary_model('Problem_Dataset.csv','B1')
create_binary_model('Problem_Dataset.csv','B2')
create_binary_model('Problem_Dataset.csv','B3')
create_binary_model('Problem_Dataset.csv','B4')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.2593272030353546, Test Accuracy: 0.9238095283508301
Model saved to trained_model_A2.h5


  saving_api.save_model(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.3005546033382416, Test Accuracy: 0.9333333373069763
Model saved to trained_model_A3.h5


  saving_api.save_model(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.17752084136009216, Test Accuracy: 0.9523809552192688
Model saved to trained_model_B1.h5


  saving_api.save_model(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.09930545091629028, Test Accuracy: 0.9666666388511658
Model saved to trained_model_B2.h5


  saving_api.save_model(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.12668044865131378, Test Accuracy: 0.9523809552192688
Model saved to trained_model_B3.h5


  saving_api.save_model(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.06525935977697372, Test Accuracy: 0.9714285731315613
Model saved to trained_model_B4.h5


  saving_api.save_model(


In [147]:
# Example usage for prediction
test_sentences,test_labels,all_labelss = create_testers(df,'A1',0,10)

loaded_model = load_saved_model('trained_model_A1.h5')
predeictionsss = predict_and_print_labels(loaded_model, test_sentences, tokenizer)
print(predeictionsss)
binary_predeictionsss = convert_predictions_to_binary(predeictionsss)

print(binary_predeictionsss)
print(test_labels)
print(all_labelss)
precision, recall, f1_score = calculate_metrics(test_labels, binary_predeictionsss)

print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1_score}")

save_results(binary_predeictionsss, precision, recall, f1_score, "results.pkl")




Model loaded from trained_model_A1.h5
Text: Observed child fixated on a particular texture, rubbing a piece of sandpaper continuously throughout the appointment., Predicted label: Negative
Text: Patient's focus centers on vacuum cleaners, studying different models and their components., Predicted label: Negative
Text: Displays a strong interest in smelling various objects, often lingering on scents for prolonged periods., Predicted label: Negative
Text: Patient's attachment to a specific book is evident, quoting passages frequently., Predicted label: Negative
Text: Limited awareness of personal boundaries, invades others' personal space., Predicted label: Positive
Text: Repeatedly opens and closes doors, seemingly fascinated by the motion of hinges., Predicted label: Negative
Text: Engages in repetitive smelling and touching of objects in her environment., Predicted label: Negative
Text: Has difficulties in understanding social cues and often misinterprets facial expressions and body l

  _warn_prf(average, modifier, msg_start, len(result))


In [148]:

test1,test2,test3 = create_testers(df,'A1',1,2)
converted_test1 = convert_input(test1)
print(test3)
print(converted_test1)
ensemble_predictions = predict_ensemble(models, converted_test1)
binary_ensemble_predictions = convert_predictions_to_binary(ensemble_predictions)

print(ensemble_predictions)
print(binary_ensemble_predictions)

ensemble_precision, ensemble_recall, ensemble_f1_score = calculate_metrics(test3, ensemble_predictions)
print(f"Precision: {ensemble_precision}, Recall: {ensemble_recall}, F1-score: {ensemble_f1_score}")


   A1  A2  A3  B1  B2  B3  B4
1   0   0   0   0   0   1   0
[[ 1  2  3  4  5  6  7  8  9 10 11 12  0  0  0  0  0  0  0  0]]
[array([[0.32294694]], dtype=float32), array([[2.513604e-05]], dtype=float32), array([[0.5854795]], dtype=float32), array([[0.00030022]], dtype=float32), array([[1.231672e-05]], dtype=float32), array([[0.09366304]], dtype=float32), array([[0.00017045]], dtype=float32)]
[[0], [0], [1], [0], [0], [0], [0]]


ValueError: Found input variables with inconsistent numbers of samples: [1, 7]

In [None]:
test1,test2,test3 = create_testers(df,'A3',0,1)
converted_test1 = convert_input(test1)

In [None]:
print(converted_test1)

[[ 2  3  4  5  1  6  7  8  1  9 10 11 12 13 14 15  0  0  0  0]]


In [None]:
ensemble_predictions = predict_ensemble(models, converted_test1)

print(ensemble_predictions)

[array([[0.07126191]], dtype=float32), array([[0.00241495]], dtype=float32), array([[0.9489908]], dtype=float32), array([[0.00231021]], dtype=float32), array([[3.33789e-05]], dtype=float32), array([[0.39199013]], dtype=float32), array([[0.04412718]], dtype=float32)]
