In [23]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import cv2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skimage.feature import hog
import joblib
import os

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Debug: Print column names to ensure they are correct
print("Columns in the DataFrame:", df.columns)

# Filter the dataset to include only rows where "Key_Frame" is 'Y' and 'General Percentage' is between 0 and 90.0
if 'Key_Frame' not in df.columns or 'Frame Path' not in df.columns or 'Senti' not in df.columns or 'General Percentage' not in df.columns:
    raise ValueError("Required columns are missing from the DataFrame")

df = df[(df['Key_Frame'] == 'Y')]

# Function to preprocess images
def preprocess_image(filepath):
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return None
    img = cv2.imread(filepath)
    if img is None:
        print(f"Failed to read image: {filepath}")
        return None
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (48, 48))
    img = cv2.equalizeHist(img)
    return img

# Apply preprocessing to each image and prepare labels
images = []
labels = []
for idx, fp in enumerate(df['Frame Path']):
    processed_img = preprocess_image(fp)
    if processed_img is not None:
        images.append(processed_img)
        labels.append(df['Senti'].iloc[idx])

images = np.array(images)
labels = np.array(labels)

# Extract HOG features from the images
hog_features = [hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys') for image in images]
X = np.array(hog_features)

# Load the saved model
joblib_file = "/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_2_random_forest_model.pkl"
loaded_rf_model = joblib.load(joblib_file)

# Evaluate the model
y_pred = loaded_rf_model.predict(X)

# Calculate evaluation metrics
accuracy = accuracy_score(labels, y_pred)
precision = precision_score(labels, y_pred, average='weighted')
recall = recall_score(labels, y_pred, average='weighted')
f1 = f1_score(labels, y_pred, average='weighted')

# Print evaluation metrics
print(f"Validation accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['Random Forest'],
    'Model Path': [joblib_file],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Save the DataFrame to a CSV file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Columns in the DataFrame: Index(['Video Name', 'Frame Path', 'Time in Seconds', 'Key_Frame',
       'Positive Sentiment', 'Negative Sentiment', 'key_frame', 'Senti',
       'General Percentage'],
      dtype='object')
Validation accuracy: 25.55%
Precision: 0.78
Recall: 0.26
F1 Score: 0.35
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv


In [None]:
import pandas as pd

# Read the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/work2/Final/validation_results.csv')

# Show the first 5 rows
results_df.head()


Unnamed: 0,Model Name,Model Path,Accuracy,Precision,Recall,F1 Score
0,Random Forest,/content/drive/MyDrive/work2/Final/Image_Model...,0.255528,0.783576,0.255528,0.350567
1,MobileNetV2,/content/drive/MyDrive/work2/Final/Image_Model...,0.366914,0.346832,0.366914,0.329704


In [None]:
import pandas as pd
import numpy as np
import cv2
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.utils import to_categorical
import os

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Function to preprocess images
def preprocess_image(filepath):
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return None
    img = cv2.imread(filepath)
    if img is None:
        print(f"Failed to read image: {filepath}")
        return None
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))  # Resize to match MobileNetV2 input
    img = preprocess_input(img)  # Preprocess as per MobileNetV2 requirements
    return img

# Apply preprocessing to each image and prepare labels
images = []
labels = []
for idx, fp in enumerate(df['Frame Path']):
    processed_img = preprocess_image(fp)
    if processed_img is not None:
        images.append(processed_img)
        labels.append(df['Senti'].iloc[idx])

images = np.array(images)
labels = np.array(labels)

# One-hot encode labels for categorical crossentropy
labels = to_categorical(labels, num_classes=3)

# Load the saved TensorFlow model
model_path = '/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_1_CNN_with_MobileNetV2.h5'
loaded_model = tf.keras.models.load_model(model_path)

# Evaluate the loaded model on the entire dataset
test_loss, test_accuracy = loaded_model.evaluate(images, labels)
print(f"Validation Loss: {test_loss}, Validation Accuracy: {test_accuracy}")

# Make predictions
y_pred_prob = loaded_model.predict(images)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(labels, axis=1)

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Print evaluation metrics
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['MobileNetV2'],
    'Model Path': [model_path],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Validation Loss: 1.775099754333496, Validation Accuracy: 0.3669135868549347
Accuracy: 36.69%
Precision: 0.35
Recall: 0.37
F1 Score: 0.33
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv


In [None]:
import librosa
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import joblib

# Function to extract audio features
def extract_features(file_path):
    y, sr = librosa.load(file_path)
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
    mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
    return np.concatenate((mfccs, chroma, mel, spectral_contrast, tonnetz))

# Folder containing audio files
audio_folder = '/content/drive/MyDrive/work2/Validation_Data/Audios'

# List to store audio file paths and sentiment labels
file_paths = []
sentiments = []

# Iterate through files in the folder and extract sentiment labels
for file_name in os.listdir(audio_folder):
    if os.path.isfile(os.path.join(audio_folder, file_name)):
        file_paths.append(os.path.join(audio_folder, file_name))

# Read sentiment labels from a CSV file
csv_path = '/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv'
df = pd.read_csv(csv_path)
sentiments = df['Senti'].tolist()

# Create a dataset (X, y) with features and labels
X = [extract_features(path) for path in file_paths]
y = sentiments

# Load the scaler
scaler_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest_Scaler.joblib'
scaler = joblib.load(scaler_path)

# Standardize the features
X_scaled = scaler.transform(X)

# Load the saved model
model_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest.joblib'
loaded_model = joblib.load(model_path)

# Evaluate the loaded model
y_pred = loaded_model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['Audio Sentiment RF'],
    'Model Path': [model_path],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Accuracy: 34.48%
Precision: 0.25
Recall: 0.34
F1 Score: 0.26
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import os
import librosa

# Function to extract features from audio files
def extract_features(file_path):
    y, sr = librosa.load(file_path, duration=3)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    features = np.mean(np.concatenate((mfccs, chroma, mel, contrast), axis=0), axis=1)
    return features

# Folder containing audio files
audio_folder = '/content/drive/MyDrive/work2/Validation_Data/Audios'

# List to store audio file paths and sentiment labels
file_names = []
senti_values = []

# Iterate through files in the folder
for file_name in os.listdir(audio_folder):
    if os.path.isfile(os.path.join(audio_folder, file_name)):
        file_names.append(os.path.join(audio_folder, file_name))

# Load sentiment labels from a CSV file
csv_path = '/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv'
df = pd.read_csv(csv_path)
senti_values = df['Senti'].tolist()

# Create a dataset (X, y) with features and labels
X = np.array([extract_features(path) for path in file_names])
y = np.array(senti_values)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Load the scaler
scaler_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_1_LSTM_Scaler.pkl'
scaler = joblib.load(scaler_path)

# Standardize the features
X_scaled = scaler.transform(X)

# Reshape for LSTM input
X_scaled = np.expand_dims(X_scaled, -1)

# Load the saved best model
model_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_1_LSTM.h5'
best_model = tf.keras.models.load_model(model_path)

# Evaluate the loaded model on the entire dataset
loss, accuracy = best_model.evaluate(X_scaled, y)
print(f"Validation Loss: {loss}, Validation Accuracy: {accuracy}")

# Make predictions
y_pred_prob = best_model.predict(X_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)

# Calculate evaluation metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['LSTM'],
    'Model Path': [model_path],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Validation Loss: 1.107116460800171, Validation Accuracy: 0.3103448152542114
Accuracy: 31.03%
Precision: 0.25
Recall: 0.31
F1 Score: 0.26
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd

# Read the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/work2/Final/updated_validation_results.csv')

# Show the first 5 rows
results_df.head(10)


Unnamed: 0,Model Name,Model Path,Accuracy,Precision,Recall,F1 Score
0,Random Forest,/content/drive/MyDrive/work2/Final/Image_Model...,0.255528,0.783576,0.255528,0.350567
1,MobileNetV2,/content/drive/MyDrive/work2/Final/Image_Model...,0.366914,0.346832,0.366914,0.329704
2,Audio Sentiment RF,/content/drive/MyDrive/work2/Final/Audio_Model...,0.344828,0.247126,0.344828,0.257313
3,LSTM,/content/drive/MyDrive/work2/Final/Audio_Model...,0.310345,0.24939,0.310345,0.264791


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv", encoding='utf-8')


# Tokenize the reviews using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model')

# Tokenize and encode the reviews
input_ids = []
attention_masks = []

for review in df['Sentence']:
    encoded_dict = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=64,
        truncation=True,  # Explicitly activate truncation
        padding='max_length',  # Pad to the max_length
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Do the same for df1
input_ids1 = []
attention_masks1 = []

for review in df1['Sentence']:
    encoded_dict = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=64,
        truncation=True,  # Explicitly activate truncation
        padding='max_length',  # Pad to the max_length
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids1.append(encoded_dict['input_ids'])
    attention_masks1.append(encoded_dict['attention_mask'])

input_ids1 = torch.cat(input_ids1, dim=0)
attention_masks1 = torch.cat(attention_masks1, dim=0)

# Load the labels
labels = torch.tensor(df['Label'])
labels1 = torch.tensor(df1['Label'])

# Concatenate inputs and labels
input_ids = torch.cat((input_ids, input_ids1), dim=0)
attention_masks = torch.cat((attention_masks, attention_masks1), dim=0)
labels = torch.cat((labels, labels1), dim=0)

# Split the dataset into training and validation sets
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Load pre-trained BERT model for sentiment classification
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model',
    num_labels=3,  # 3 classes: neutral, positive, negative
    output_attentions=False,
    output_hidden_states=False
)

# Set up optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Evaluation loop
model.eval()
all_labels = []
all_preds = []

for batch in val_dataloader:
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2]
    }
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    preds = np.argmax(logits.cpu().numpy(), axis=1)

    all_labels.extend(inputs['labels'].cpu().numpy())
    all_preds.extend(preds)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Generate confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print('Confusion Matrix:')
print(conf_matrix)

# Print classification report
class_report = classification_report(all_labels, all_preds, target_names=['neutral', 'positive', 'negative'])
print('Classification Report:')
print(class_report)

# Visualize the results
# Calculate precision, recall, and F1-score
class_report_dict = classification_report(all_labels, all_preds, target_names=['neutral', 'positive', 'negative'], output_dict=True)
precision = [class_report_dict[label]['precision'] for label in ['neutral', 'positive', 'negative']]
recall = [class_report_dict[label]['recall'] for label in ['neutral', 'positive', 'negative']]
f1_score = [class_report_dict[label]['f1-score'] for label in ['neutral', 'positive', 'negative']]

# Create an interactive confusion matrix
fig = make_subplots(rows=2, cols=2, subplot_titles=['Confusion Matrix', 'Accuracy Bar Chart', 'Precision Bar Chart', 'Recall Bar Chart'])

# Confusion Matrix
trace_heatmap = go.Heatmap(z=conf_matrix, x=['neutral', 'positive', 'negative'], y=['neutral', 'positive', 'negative'], colorscale='Viridis')
fig.add_trace(trace_heatmap, row=1, col=1)

# Accuracy Bar Chart
trace_bar_accuracy = go.Bar(x=['Accuracy'], y=[accuracy * 100], marker=dict(color='blue'))
fig.add_trace(trace_bar_accuracy, row=1, col=2)

# Precision Bar Chart
trace_bar_precision = go.Bar(x=['neutral', 'positive', 'negative'], y=np.round(precision, 2) * 100, marker=dict(color='green'))
fig.add_trace(trace_bar_precision, row=2, col=1)

# Recall Bar Chart
trace_bar_recall = go.Bar(x=['neutral', 'positive', 'negative'], y=np.round(recall, 2) * 100, marker=dict(color='orange'))
fig.add_trace(trace_bar_recall, row=2, col=2)

fig.update_layout(title_text='Confusion Matrix, Accuracy, Precision, and Recall', height=600, width=800)
fig.show()

# Save the results to a CSV file
results_df = pd.DataFrame({
    'Model Name': ['BERT Multilingual'],
    'Model Path': ['/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model'],
    'Accuracy': [accuracy],
    'Precision': [np.mean(precision)],
    'Recall': [np.mean(recall)],
    'F1 Score': [np.mean(f1_score)]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv", encoding='utf-8')

# Print the column names to check for 'Sentence'
print("Columns in the DataFrame:", df.columns)

# Assuming the correct column names are 'Text' and 'Label' (you may need to adjust these based on the actual column names)
sentence_column = 'Transcription'
label_column = 'Senti'

# Tokenize the reviews using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model')

# Tokenize and encode the reviews
input_ids = []
attention_masks = []

for review in df[sentence_column]:
    encoded_dict = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=64,
        truncation=True,  # Explicitly activate truncation
        padding='max_length',  # Pad to the max_length
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Load the labels
labels = torch.tensor(df[label_column].values)

# Create a dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Load pre-trained BERT model for sentiment classification
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model',
    num_labels=3,  # 3 classes: neutral, positive, negative
    output_attentions=False,
    output_hidden_states=False
)

# Evaluation loop
model.eval()
all_labels = []
all_preds = []

for batch in dataloader:
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2]
    }
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    preds = np.argmax(logits.cpu().numpy(), axis=1)

    all_labels.extend(inputs['labels'].cpu().numpy())
    all_preds.extend(preds)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Generate confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print('Confusion Matrix:')
print(conf_matrix)

# Print classification report
class_report = classification_report(all_labels, all_preds, target_names=['neutral', 'positive', 'negative'])
print('Classification Report:')
print(class_report)

# Visualize the results
# Calculate precision, recall, and F1-score
class_report_dict = classification_report(all_labels, all_preds, target_names=['neutral', 'positive', 'negative'], output_dict=True)
precision = [class_report_dict[label]['precision'] for label in ['neutral', 'positive', 'negative']]
recall = [class_report_dict[label]['recall'] for label in ['neutral', 'positive', 'negative']]
f1 = [class_report_dict[label]['f1-score'] for label in ['neutral', 'positive', 'negative']]

# Create an interactive confusion matrix
fig = make_subplots(rows=2, cols=2, subplot_titles=['Confusion Matrix', 'Accuracy Bar Chart', 'Precision Bar Chart', 'Recall Bar Chart'])

# Confusion Matrix
trace_heatmap = go.Heatmap(z=conf_matrix, x=['neutral', 'positive', 'negative'], y=['neutral', 'positive', 'negative'], colorscale='Viridis')
fig.add_trace(trace_heatmap, row=1, col=1)

# Accuracy Bar Chart
trace_bar_accuracy = go.Bar(x=['Accuracy'], y=[accuracy * 100], marker=dict(color='blue'))
fig.add_trace(trace_bar_accuracy, row=1, col=2)

# Precision Bar Chart
trace_bar_precision = go.Bar(x=['neutral', 'positive', 'negative'], y=np.round(precision, 2) * 100, marker=dict(color='green'))
fig.add_trace(trace_bar_precision, row=2, col=1)

# Recall Bar Chart
trace_bar_recall = go.Bar(x=['neutral', 'positive', 'negative'], y=np.round(recall, 2) * 100, marker=dict(color='orange'))
fig.add_trace(trace_bar_recall, row=2, col=2)

fig.update_layout(title_text='Confusion Matrix, Accuracy, Precision, and Recall', height=600, width=800)
fig.show()

# Save the results to a CSV file
results_df = pd.DataFrame({
    'Model Name': ['BERT Multilingual'],
    'Model Path': ['/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model'],
    'Accuracy': [accuracy],
    'Precision': [np.mean(precision)],
    'Recall': [np.mean(recall)],
    'F1 Score': [np.mean(f1)]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Columns in the DataFrame: Index(['File', 'Transcription', 'Status', 'Length', 'WordCount',
       'negative_count', 'positive_count', 'negative_positions',
       'positive_positions', 'Senti', 'Negative_Frame_Sentiments',
       'Positive_Frame_Sentiments'],
      dtype='object')
Accuracy: 41.38%
Confusion Matrix:
[[1 5 3]
 [0 4 4]
 [1 4 7]]
Classification Report:
              precision    recall  f1-score   support

     neutral       0.50      0.11      0.18         9
    positive       0.31      0.50      0.38         8
    negative       0.50      0.58      0.54        12

    accuracy                           0.41        29
   macro avg       0.44      0.40      0.37        29
weighted avg       0.45      0.41      0.38        29



Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv


In [None]:
import pandas as pd

# Read the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/work2/Final/validation_results.csv')

# Show the first 5 rows
results_df.head(10)


Unnamed: 0,Model Name,Model Path,Accuracy,Precision,Recall,F1 Score
0,Random Forest,/content/drive/MyDrive/work2/Final/Image_Model...,0.255528,0.783576,0.255528,0.350567
1,MobileNetV2,/content/drive/MyDrive/work2/Final/Image_Model...,0.366914,0.346832,0.366914,0.329704
2,Audio Sentiment RF,/content/drive/MyDrive/work2/Final/Audio_Model...,0.344828,0.247126,0.344828,0.257313
3,LSTM,/content/drive/MyDrive/work2/Final/Audio_Model...,0.310345,0.24939,0.310345,0.264791
4,BERT Multilingual,/content/drive/MyDrive/work2/Final/Text_Work/T...,0.413793,0.435897,0.398148,0.367077


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive

# Function to remove unwanted characters
def removing_unwanted_data(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

# Custom tokenizer function
def custom_tokenizer(doc):
    return doc

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv", encoding='utf-8')

# Apply preprocessing
df['text_cleaned'] = df['Transcription'].map(removing_unwanted_data)

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_bow_vectorizer.pkl'
model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_logistic_regression_bow.pkl'

vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the data using the loaded vectorizer
X = vectorizer.transform(df['Transcription'])
y = df['Senti']

# Evaluate the loaded model on the complete dataset
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['BOW Logistic Regression'],
    'Model Path': [model_path],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'

# Check if the results file exists and handle the append operation correctly
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    updated_df = pd.concat([existing_df, results_df], ignore_index=True)
    updated_df.to_csv(results_file, index=False)
else:
    results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Accuracy: 41.38%
Precision: 0.17
Recall: 0.41
F1 Score: 0.24
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
import pandas as pd

# Read the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/work2/Final/validation_results.csv')

# Show the first 5 rows
results_df.head(10)

Unnamed: 0,Model Name,Model Path,Accuracy,Precision,Recall,F1 Score
0,Random Forest,/content/drive/MyDrive/work2/Final/Image_Model...,0.255528,0.783576,0.255528,0.350567
1,MobileNetV2,/content/drive/MyDrive/work2/Final/Image_Model...,0.366914,0.346832,0.366914,0.329704
2,Audio Sentiment RF,/content/drive/MyDrive/work2/Final/Audio_Model...,0.344828,0.247126,0.344828,0.257313
3,LSTM,/content/drive/MyDrive/work2/Final/Audio_Model...,0.310345,0.24939,0.310345,0.264791
4,BERT Multilingual,/content/drive/MyDrive/work2/Final/Text_Work/T...,0.413793,0.435897,0.398148,0.367077
5,BOW Logistic Regression,/content/drive/MyDrive/work2/Final/Text_Work/T...,0.413793,0.171225,0.413793,0.24222


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# Function to remove unwanted characters
def removing_unwanted_data(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

# Custom tokenizer function
def custom_tokenizer(doc):
    return doc

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv", encoding='utf-8')

# Apply preprocessing
df['text_cleaned'] = df['Transcription'].map(removing_unwanted_data)

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_bow_vectorizer.pkl'
model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_logistic_regression_bow.pkl'

vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the data using the loaded vectorizer
X = vectorizer.transform(df['Transcription'])
y = df['Senti']

# Evaluate the loaded model on the complete dataset
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['BOW Logistic Regression'],
    'Model Path': [model_path],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'
max_accuracy_file = '/content/drive/MyDrive/work2/Final/max_accuracy.txt'

# Check if the results file exists and handle the append operation correctly
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    updated_df = pd.concat([existing_df, results_df], ignore_index=True)
    updated_df.to_csv(results_file, index=False)
else:
    results_df.to_csv(results_file, index=False)

# Update max accuracy if necessary
max_accuracy = accuracy
if os.path.exists(max_accuracy_file):
    with open(max_accuracy_file, 'r') as file:
        max_accuracy = max(max_accuracy, float(file.read()))

with open(max_accuracy_file, 'w') as file:
    file.write(str(max_accuracy))

print(f"Results saved to {results_file}")
print(f"Maximum accuracy: {max_accuracy * 100:.2f}%")


Accuracy: 41.38%
Precision: 0.17
Recall: 0.41
F1 Score: 0.24
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv
Maximum accuracy: 41.38%



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# Function to remove unwanted characters
def removing_unwanted_data(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

# Custom tokenizer function
def custom_tokenizer(doc):
    return doc

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv", encoding='utf-8')

# Apply preprocessing
df['text_cleaned'] = df['Transcription'].map(removing_unwanted_data)

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_bow_vectorizer.pkl'
model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_logistic_regression_bow.pkl'

vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the data using the loaded vectorizer
X = vectorizer.transform(df['Transcription'])
y = df['Senti']

# Evaluate the loaded model on the complete dataset
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted', zero_division=0)
recall = recall_score(y, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['BOW Logistic Regression'],
    'Model Path': [model_path],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Final/validation_results.csv'
max_accuracy_file = '/content/drive/MyDrive/work2/Final/max_accuracy.txt'

# Check if the results file exists and handle the append operation correctly
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    updated_df = pd.concat([existing_df, results_df], ignore_index=True)
    updated_df.to_csv(results_file, index=False)
else:
    results_df.to_csv(results_file, index=False)

# Update max accuracy if necessary
max_accuracy = accuracy
if os.path.exists(max_accuracy_file):
    with open(max_accuracy_file, 'r') as file:
        max_accuracy = max(max_accuracy, float(file.read()))

with open(max_accuracy_file, 'w') as file:
    file.write(str(max_accuracy))

print(f"Results saved to {results_file}")
print(f"Maximum accuracy: {max_accuracy * 100:.2f}%")


Accuracy: 41.38%
Precision: 0.17
Recall: 0.41
F1 Score: 0.24
Results saved to /content/drive/MyDrive/work2/Final/validation_results.csv
Maximum accuracy: 41.38%


In [None]:
import pandas as pd

# Read the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/work2/Final/validation_results.csv')

# Show the first 5 rows
results_df.head(10)

Unnamed: 0,Model Name,Model Path,Accuracy,Precision,Recall,F1 Score
0,Random Forest,/content/drive/MyDrive/work2/Final/Image_Model...,0.255528,0.783576,0.255528,0.350567
1,MobileNetV2,/content/drive/MyDrive/work2/Final/Image_Model...,0.366914,0.346832,0.366914,0.329704
2,Audio Sentiment RF,/content/drive/MyDrive/work2/Final/Audio_Model...,0.344828,0.247126,0.344828,0.257313
3,LSTM,/content/drive/MyDrive/work2/Final/Audio_Model...,0.310345,0.24939,0.310345,0.264791
4,BERT Multilingual,/content/drive/MyDrive/work2/Final/Text_Work/T...,0.413793,0.435897,0.398148,0.367077
5,BOW Logistic Regression,/content/drive/MyDrive/work2/Final/Text_Work/T...,0.413793,0.171225,0.413793,0.24222


In [None]:
# prompt: delete rows at index 6 and 7 . Then save results in same CSV file

import pandas as pd

# Load the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/work2/Final/validation_results.csv')

# Delete rows at index 6 and 7
results_df = results_df.drop([6, 7])

# Save results in the same CSV file
results_df.to_csv('/content/drive/MyDrive/work2/Final/validation_results.csv', index=False)

# Print the updated DataFrame
print(results_df)


                Model Name                                         Model Path  \
0            Random Forest  /content/drive/MyDrive/work2/Final/Image_Model...   
1              MobileNetV2  /content/drive/MyDrive/work2/Final/Image_Model...   
2       Audio Sentiment RF  /content/drive/MyDrive/work2/Final/Audio_Model...   
3                     LSTM  /content/drive/MyDrive/work2/Final/Audio_Model...   
4        BERT Multilingual  /content/drive/MyDrive/work2/Final/Text_Work/T...   
5  BOW Logistic Regression  /content/drive/MyDrive/work2/Final/Text_Work/T...   

   Accuracy  Precision    Recall  F1 Score  
0  0.255528   0.783576  0.255528  0.350567  
1  0.366914   0.346832  0.366914  0.329704  
2  0.344828   0.247126  0.344828  0.257313  
3  0.310345   0.249390  0.310345  0.264791  
4  0.413793   0.435897  0.398148  0.367077  
5  0.413793   0.171225  0.413793  0.242220  


In [None]:
import pandas as pd
import numpy as np
import joblib
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# Load the CSV files
audio_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv')
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')

# Select only the first record from Videos.csv
video_df = video_df.head(1)

# Load the pre-trained Random Forest audio model
audio_rf_model_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest.joblib'
audio_rf_scaler_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest_Scaler.joblib'

audio_rf_model = joblib.load(audio_rf_model_path)
audio_rf_scaler = joblib.load(audio_rf_scaler_path)

# Folder containing audio files
audio_folder = '/content/drive/MyDrive/work2/Validation_Data/Audios'

# Helper function for audio feature extraction
def preprocess_audio(file_path):
    try:
        y, sr = librosa.load(file_path)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
        mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
        return np.concatenate((mfccs, chroma, mel, spectral_contrast, tonnetz))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Prediction function for Random Forest audio model
def predict_audio_rf(file_path):
    features = preprocess_audio(file_path)
    if features is not None:
        features_scaled = audio_rf_scaler.transform([features])
        return audio_rf_model.predict(features_scaled)[0]
    else:
        return None

# Perform predictions and evaluate
predictions = []
for index, row in video_df.iterrows():
    try:
        # Correctly locate the audio file in the specified folder
        audio_path = os.path.join(audio_folder, row['File'])
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"{audio_path} does not exist")

        print(f"Audio File: {audio_path}")
        prediction = predict_audio_rf(audio_path)
        predictions.append(prediction)
    except (IndexError, FileNotFoundError) as e:
        print(f"Error: {e}")
        predictions.append(None)

video_df['Predicted_Senti'] = predictions

# Drop rows where prediction is None
video_df = video_df.dropna(subset=['Predicted_Senti'])

# Ensure sentiment values are integers
video_df['Predicted_Senti'] = video_df['Predicted_Senti'].astype(int)
video_df['Senti'] = video_df['Senti'].astype(int)

# Evaluate the predictions
accuracy = accuracy_score(video_df['Senti'], video_df['Predicted_Senti'])
precision = precision_score(video_df['Senti'], video_df['Predicted_Senti'], average='weighted', zero_division=0)
recall = recall_score(video_df['Senti'], video_df['Predicted_Senti'], average='weighted', zero_division=0)
f1 = f1_score(video_df['Senti'], video_df['Predicted_Senti'], average='weighted')

# Save results in a pandas DataFrame
results_df = pd.DataFrame({
    'Model Name': ['Audio Random Forest'],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

results_df.to_csv('/content/drive/MyDrive/work2/Validation_Data/results_audio_rf.csv', index=False)
print(results_df)


Audio File: /content/drive/MyDrive/work2/Validation_Data/Audios/1. Mobile_Neutral.wav
            Model Name  Accuracy  Precision  Recall  F1 Score
0  Audio Random Forest       0.0        0.0     0.0       0.0


In [None]:
import pandas as pd
import numpy as np
import joblib
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# Load the CSV files
audio_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv')
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')

# Load the pre-trained Random Forest audio model
audio_rf_model_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest.joblib'
audio_rf_scaler_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest_Scaler.joblib'

audio_rf_model = joblib.load(audio_rf_model_path)
audio_rf_scaler = joblib.load(audio_rf_scaler_path)

# Folder containing audio files
audio_folder = '/content/drive/MyDrive/work2/Validation_Data/Audios'

# Helper function for audio feature extraction
def preprocess_audio(file_path):
    try:
        y, sr = librosa.load(file_path)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
        mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
        return np.concatenate((mfccs, chroma, mel, spectral_contrast, tonnetz))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Prediction and evaluation function for Random Forest audio model
results = []
for index, row in video_df.iterrows():
    audio_path = os.path.join(audio_folder, row['File'])
    if os.path.exists(audio_path):
        label = audio_df.loc[audio_df['File'] == row['File'], 'Senti'].values[0]
        prediction = predict_audio_rf(audio_path)
        results.append({
            'Audio File': row['File'],
            'Actual Label': label,
            'Predicted Label': prediction
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display final results including the video labels
#print(results_df)

# Evaluate model performance
#actual_labels = results_df['Actual Label'].tolist()
#predicted_labels = results_df['Predicted Label'].tolist()

accuracy = accuracy_score(actual_labels, predicted_labels)
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

#1. Audio Model Validation Random Forest

In [69]:
import pandas as pd
import numpy as np
import joblib
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# Load the CSV files
audio_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv')
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')

# Load the pre-trained Random Forest audio model
audio_rf_model_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest.joblib'
audio_rf_scaler_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest_Scaler.joblib'

audio_rf_model = joblib.load(audio_rf_model_path)
audio_rf_scaler = joblib.load(audio_rf_scaler_path)

# Folder containing audio files
audio_folder = '/content/drive/MyDrive/work2/Validation_Data/Audios'

# Helper function for audio feature extraction
def preprocess_audio(file_path):
    try:
        y, sr = librosa.load(file_path)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
        mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
        return np.concatenate((mfccs, chroma, mel, spectral_contrast, tonnetz))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None
# Prediction function for Random Forest audio model
def predict_audio_rf(file_path):
    features = preprocess_audio(file_path)
    if features is not None:
        features_scaled = audio_rf_scaler.transform([features])
        return audio_rf_model.predict(features_scaled)[0]
    else:
        return None

# Prediction and evaluation function for Random Forest audio model
results = []
for index, row in video_df.iterrows():
    audio_path = os.path.join(audio_folder, row['File'])
    if os.path.exists(audio_path):
        label = audio_df.loc[audio_df['File'] == row['File'], 'Senti'].values[0]
        prediction = predict_audio_rf(audio_path)
        results.append({
            'Audio File': row['File'],
            'Actual Label': label,
            'Predicted Label': prediction
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display final results including the video labels
print(results_df)

# Evaluate model performance
actual_labels = results_df['Actual Label'].tolist()
predicted_labels = results_df['Predicted Label'].tolist()

accuracy = accuracy_score(actual_labels, predicted_labels)
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Metrics dictionary
metrics = {
    'Model': 'Random Forest',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
}

# Convert dictionary to DataFrame
metrics_df = pd.DataFrame([metrics])

# Save to CSV
metrics_df.to_csv('/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv', index=False)

print("Model evaluation metrics saved successfully.")

                                           Audio File  Actual Label  \
0                               1. Mobile_Neutral.wav             0   
1              1. Mobile__Positive_Product_Review.wav             0   
2                  1. Tea_Negative_Product_Review.wav             2   
3      10. Blasphemy_Negative_Perception_Building.wav             1   
4   10. Political Extremisim_Negative_Perception_B...             2   
5      11. Blasphemy_Negative_Perception_Building.wav             2   
6   11. Political Extremisim_Negative_Perception_B...             2   
7    2. Communication_Neutral_Perception Building.wav             0   
8               2. Mobile_Positive_Product_Review.wav             1   
9                  2. Tea_Negative_Product_Review.wav             0   
10   3. Communication_Neutral_Perception Building.wav             0   
11                 3. ICE_Negative_Product_Review.wav             2   
12              3. Mobile_Positive_Product_Review.wav             1   
13    

  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
metrics_df.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.5,0.375,0.5,0.412186


In [None]:
# prompt: results_df save this csv

results_df.to_csv('/content/drive/MyDrive/work2/Validation_Data/results_audio_rf_temp.csv')


In [None]:
results_df.head(30)

Unnamed: 0,Audio File,Actual Label,Predicted Label
0,1. Mobile_Neutral.wav,0,2
1,1. Mobile__Positive_Product_Review.wav,0,1
2,1. Tea_Negative_Product_Review.wav,2,1
3,10. Blasphemy_Negative_Perception_Building.wav,1,1
4,10. Political Extremisim_Negative_Perception_B...,2,2
5,11. Blasphemy_Negative_Perception_Building.wav,2,1
6,11. Political Extremisim_Negative_Perception_B...,2,2
7,2. Communication_Neutral_Perception Building.wav,0,1
8,2. Mobile_Positive_Product_Review.wav,1,2
9,2. Tea_Negative_Product_Review.wav,0,1


#2. Audio Model Validatation LSTM

In [70]:
import pandas as pd
import numpy as np
import joblib
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
import tensorflow as tf

# Load the CSV files
audio_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv')
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')

# Load the pre-trained LSTM audio model and scaler
lstm_model_path = '/content/drive/MyDrive/work2/Final/Old_audio_models/Audio_Model_1_LSTM.h5'
lstm_scaler_path = '/content/drive/MyDrive/work2/Final/Old_audio_models/scaler_lstm.pkl'

lstm_model = tf.keras.models.load_model(lstm_model_path)
lstm_scaler = joblib.load(lstm_scaler_path)

# Folder containing audio files
audio_folder = '/content/drive/MyDrive/work2/Validation_Data/Audios'

# Helper function for audio feature extraction
def preprocess_audio_lstm(file_path):
    try:
        y, sr = librosa.load(file_path, duration=3)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mel = librosa.feature.melspectrogram(y=y, sr=sr)
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        energy_contour = librosa.feature.rms(y=y)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

        features = np.concatenate((
            np.mean(mfccs, axis=1),
            np.mean(mel, axis=1),
            np.mean(contrast, axis=1),
            np.mean(bandwidth, axis=1),
            np.mean(energy_contour, axis=1),
            np.mean(spectral_rolloff, axis=1)
        ))
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def predict_audio_lstm(file_path):
    features = preprocess_audio_lstm(file_path)
    if features is not None:
        features_scaled = lstm_scaler.transform([features])
        features_scaled = np.expand_dims(features_scaled, -1)
        prediction = lstm_model.predict(features_scaled)
        return np.argmax(prediction, axis=1)[0]
    else:
        return None

# Prediction and evaluation function for the LSTM model
results_lstm = []
for index, row in video_df.iterrows():
    audio_path = os.path.join(audio_folder, row['File'])
    if os.path.exists(audio_path):
        label = audio_df.loc[audio_df['File'] == row['File'], 'Senti'].values[0]
        prediction_lstm = predict_audio_lstm(audio_path)
        results_lstm.append({
            'Audio File': row['File'],
            'Actual Label': label,
            'Predicted Label': prediction_lstm
        })

# Convert results to DataFrame
results_lstm_df = pd.DataFrame(results_lstm)

# Display final results including the video labels
print("LSTM Results:\n", results_lstm_df)

# Evaluate model performance for LSTM
actual_labels_lstm = results_lstm_df['Actual Label'].tolist()
predicted_labels_lstm = results_lstm_df['Predicted Label'].tolist()

accuracy_lstm = accuracy_score(actual_labels_lstm, predicted_labels_lstm)
precision_lstm = precision_score(actual_labels_lstm, predicted_labels_lstm, average='weighted', zero_division=1)
recall_lstm = recall_score(actual_labels_lstm, predicted_labels_lstm, average='weighted')
f1_lstm = f1_score(actual_labels_lstm, predicted_labels_lstm, average='weighted')

print(f"LSTM - Accuracy: {accuracy_lstm*100:.2f}%")
print(f"LSTM - Precision: {precision_lstm:.2f}")
print(f"LSTM - Recall: {recall_lstm:.2f}")
print(f"LSTM - F1 Score: {f1_lstm:.2f}")

# Metrics dictionary for LSTM model
metrics_lstm = {
    'Model': 'LSTM',
    'Accuracy': accuracy_lstm,
    'Precision': precision_lstm,
    'Recall': recall_lstm,
    'F1 Score': f1_lstm
}

# Convert dictionary to DataFrame
metrics_df_lstm = pd.DataFrame([metrics_lstm])

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    metrics_df_lstm = pd.concat([existing_df, metrics_df_lstm], ignore_index=True)

# Save the DataFrame to the CSV file
metrics_df_lstm.to_csv(results_file, index=False)

print("Model evaluation metrics saved successfully.")


LSTM Results:
                                            Audio File  Actual Label  \
0                               1. Mobile_Neutral.wav             0   
1              1. Mobile__Positive_Product_Review.wav             0   
2                  1. Tea_Negative_Product_Review.wav             2   
3      10. Blasphemy_Negative_Perception_Building.wav             1   
4   10. Political Extremisim_Negative_Perception_B...             2   
5      11. Blasphemy_Negative_Perception_Building.wav             2   
6   11. Political Extremisim_Negative_Perception_B...             2   
7    2. Communication_Neutral_Perception Building.wav             0   
8               2. Mobile_Positive_Product_Review.wav             1   
9                  2. Tea_Negative_Product_Review.wav             0   
10   3. Communication_Neutral_Perception Building.wav             0   
11                 3. ICE_Negative_Product_Review.wav             2   
12              3. Mobile_Positive_Product_Review.wav         

In [49]:
metrics_df_lstm.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.5,0.375,0.5,0.412186
1,LSTM,0.392857,0.545635,0.392857,0.332512


#3. Frame Model Validatation Random Forest

In [71]:
import numpy as np
import pandas as pd
import cv2
from skimage.feature import hog
import joblib
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CSV files
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')
frames_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Function to preprocess images
def preprocess_image(filepath):
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return None
    img = cv2.imread(filepath)
    if img is None:
        print(f"Failed to read image: {filepath}")
        return None
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (128, 128))
    img = cv2.equalizeHist(img)
    return img

# Load the saved model
joblib_file = "/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_2_random_forest_model.pkl"
loaded_rf_model = joblib.load(joblib_file)

# Initialize lists to store true labels and predictions
all_true_labels = []
all_pred_labels = []

# Process each video
for video_name in video_df['File']:
    video_base_name = video_name.replace('.wav', '')

    # Filter frames related to the current video and are key frames
    video_frames = frames_df[(frames_df['Video Name'].str.replace('.wav', '') == video_base_name) & (frames_df['Key_Frame'] == 'Y')]

    # Check if there are any key frames for this video
    if video_frames.empty:
        print(f"No key frames found for video: {video_name}")
        continue

    # Apply preprocessing to each image and prepare features
    for frame_path, true_label in zip(video_frames['Frame Path'], video_frames['Senti']):
        processed_img = preprocess_image(frame_path)
        if processed_img is not None:
            hog_features = hog(processed_img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys').reshape(1, -1)
            video_pred = loaded_rf_model.predict(hog_features)[0]
            all_true_labels.append(true_label)
            all_pred_labels.append(video_pred)

# Calculate evaluation metrics
accuracy = accuracy_score(all_true_labels, all_pred_labels)
precision = precision_score(all_true_labels, all_pred_labels, average='weighted')
recall = recall_score(all_true_labels, all_pred_labels, average='weighted')
f1 = f1_score(all_true_labels, all_pred_labels, average='weighted')

# Print evaluation metrics
print(f"Overall Accuracy: {accuracy*100:.2f}%")
print(f"Overall Precision: {precision:.2f}")
print(f"Overall Recall: {recall:.2f}")
print(f"Overall F1 Score: {f1:.2f}")

# Prepare results for saving
results_df = pd.DataFrame({
    'Model': ['Random Forest (Video)'],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Overall evaluation metrics saved to {results_file}")


No key frames found for video: 9. Blasphemy_Negative_Perception_Building.wav
Overall Accuracy: 56.10%
Overall Precision: 0.57
Overall Recall: 0.56
Overall F1 Score: 0.53
Overall evaluation metrics saved to /content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv


In [51]:
import pandas as pd
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv')

# Display the first 5 rows
df.head()


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Modality,Video Name,Sentiment,Percentage
0,Random Forest,0.5,0.375,0.5,0.412186,,,,
1,LSTM,0.392857,0.545635,0.392857,0.332512,,,,
2,,,,,,Video Model 2,1. Mobile_Neutral.wav,negative,0.365104
3,,,,,,Video Model 2,1. Mobile__Positive_Product_Review.wav,negative,0.354444
4,,,,,,Video Model 2,1. Tea_Negative_Product_Review.wav,negative,0.359145


#4. Frame Model Validatation CNN

In [72]:
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CSV files
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')
frames_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Function to preprocess images
def preprocess_image(filepath):
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return None
    img = cv2.imread(filepath)
    if img is None:
        print(f"Failed to read image: {filepath}")
        return None
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))  # Resize to match MobileNetV2 input
    img = preprocess_input(img)  # Preprocess as per MobileNetV2 requirements
    return img

# Load the saved TensorFlow model
model_path = '/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_1_CNN_with_MobileNetV2.h5'
loaded_model = tf.keras.models.load_model(model_path)

# Initialize lists to store true labels and predictions
all_true_labels = []
all_pred_labels = []

# Process each video
for video_name in video_df['File']:
    video_base_name = video_name.replace('.wav', '')

    # Filter frames related to the current video and are key frames
    video_frames = frames_df[(frames_df['Video Name'].str.replace('.wav', '') == video_base_name) & (frames_df['Key_Frame'] == 'Y')]

    # Check if there are any key frames for this video
    if video_frames.empty:
        print(f"No key frames found for video: {video_name}")
        continue

    # Apply preprocessing to each image and prepare features
    for frame_path, true_label in zip(video_frames['Frame Path'], video_frames['Senti']):
        processed_img = preprocess_image(frame_path)
        if processed_img is not None:
            video_image = np.expand_dims(processed_img, axis=0)
            video_pred = loaded_model.predict(video_image)[0]
            all_true_labels.append(true_label)
            all_pred_labels.append(np.argmax(video_pred))

# Calculate evaluation metrics
accuracy = accuracy_score(all_true_labels, all_pred_labels)
precision = precision_score(all_true_labels, all_pred_labels, average='weighted')
recall = recall_score(all_true_labels, all_pred_labels, average='weighted')
f1 = f1_score(all_true_labels, all_pred_labels, average='weighted')

# Print evaluation metrics
print(f"Overall Accuracy: {accuracy*100:.2f}%")
print(f"Overall Precision: {precision:.2f}")
print(f"Overall Recall: {recall:.2f}")
print(f"Overall F1 Score: {f1:.2f}")

# Prepare results for saving
results_df = pd.DataFrame({
    'Model': ['MobileNetV2 (Video)'],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Overall evaluation metrics saved to {results_file}")


No key frames found for video: 9. Blasphemy_Negative_Perception_Building.wav
Overall Accuracy: 52.21%
Overall Precision: 0.64
Overall Recall: 0.52
Overall F1 Score: 0.50
Overall evaluation metrics saved to /content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv


In [None]:
import pandas as pd
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv')

# Display the first 5 rows
df.head()


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.5,0.375,0.5,0.412186
1,LSTM,0.392857,0.545635,0.392857,0.332512
2,Random Forest (Video),0.324013,0.364749,0.324013,0.289487
3,MobileNetV2 (Video),0.358025,0.344199,0.358025,0.319684


#5. Text Model Validation BERT

In [73]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import os

# Load your dataset
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')
text_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv')

# Tokenize the reviews using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model')

# Tokenize and encode the reviews
input_ids = []
attention_masks = []

for review in text_df['Transcription']:
    encoded_dict = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=64,
        truncation=True,  # Explicitly activate truncation
        padding='max_length',  # Pad to the max_length
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Load the labels
labels = torch.tensor(text_df['Senti'])

# Create the dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Load pre-trained BERT model for sentiment classification
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model',
    num_labels=3,  # 3 classes: neutral, positive, negative
    output_attentions=False,
    output_hidden_states=False
)

# Set up optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Evaluation loop
model.eval()
all_labels = []
all_preds = []

for batch in val_dataloader:
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2]
    }
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    preds = np.argmax(logits.cpu().numpy(), axis=1)

    all_labels.extend(inputs['labels'].cpu().numpy())
    all_preds.extend(preds)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Generate confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print('Confusion Matrix:')
print(conf_matrix)

# Print classification report
class_report = classification_report(all_labels, all_preds, target_names=['neutral', 'positive', 'negative'])
print('Classification Report:')
print(class_report)

# Prepare results for saving
class_report_dict = classification_report(all_labels, all_preds, target_names=['neutral', 'positive', 'negative'], output_dict=True)
precision = [class_report_dict[label]['precision'] for label in ['neutral', 'positive', 'negative']]
recall = [class_report_dict[label]['recall'] for label in ['neutral', 'positive', 'negative']]
f1_score = [class_report_dict[label]['f1-score'] for label in ['neutral', 'positive', 'negative']]

# Save the results to a CSV file
results_df = pd.DataFrame({
    'Model': ['BERT (Text)'],
    'Accuracy': [accuracy],
    'Precision': [np.mean(precision)],
    'Recall': [np.mean(recall)],
    'F1 Score': [np.mean(f1_score)]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Accuracy: 66.67%
Confusion Matrix:
[[1 0 0]
 [0 2 2]
 [0 0 1]]
Classification Report:
              precision    recall  f1-score   support

     neutral       1.00      1.00      1.00         1
    positive       1.00      0.50      0.67         4
    negative       0.33      1.00      0.50         1

    accuracy                           0.67         6
   macro avg       0.78      0.83      0.72         6
weighted avg       0.89      0.67      0.69         6

Results saved to /content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv


In [None]:
import pandas as pd
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv')

# Display the first 5 rows
df.head()


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.5,0.375,0.5,0.412186
1,LSTM,0.392857,0.545635,0.392857,0.332512
2,Random Forest (Video),0.324013,0.364749,0.324013,0.289487
3,MobileNetV2 (Video),0.358025,0.344199,0.358025,0.319684
4,BERT (Text),0.666667,0.833333,0.666667,0.666667


#6. Text Model Validation BOW

In [74]:
import numpy as np
import pandas as pd
import re
import nltk
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import os

# Load your dataset
text_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv')

# Function to remove unwanted characters
def removing_unwanted_data(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

# Apply preprocessing
text_df['text_cleaned'] = text_df['Transcription'].map(removing_unwanted_data)

# Custom tokenizer function
def custom_tokenizer(doc):
    return doc

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_bow_vectorizer.pkl'
model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_logistic_regression_bow.pkl'

vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the data using the loaded vectorizer
X = vectorizer.transform(text_df['text_cleaned'])
y = text_df['Senti']

# Evaluate the loaded model
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Generate confusion matrix
conf_matrix = confusion_matrix(y, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Prepare results for saving
results_df = pd.DataFrame({
    'Model': ['BOW Logistic Regression (Text)'],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Accuracy: 58.62%
Precision: 0.60
Recall: 0.59
F1 Score: 0.58
Confusion Matrix:
[[0 1 1]
 [1 8 2]
 [0 7 9]]
Results saved to /content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv


In [75]:
import pandas as pd
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation.csv')

# Display the first 5 rows
df.head(10)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.5,0.375,0.5,0.412186
1,LSTM,0.392857,0.545635,0.392857,0.332512
2,Random Forest (Video),0.561039,0.571987,0.561039,0.530425
3,MobileNetV2 (Video),0.522078,0.635805,0.522078,0.503028
4,BERT (Text),0.666667,0.777778,0.833333,0.722222
5,BOW Logistic Regression (Text),0.586207,0.603448,0.586207,0.579456


#Prediction labels by BOW

In [29]:
import numpy as np
import pandas as pd
import re
import nltk
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import os

# Load your dataset
text_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv')

# Function to remove unwanted characters
def removing_unwanted_data(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

# Apply preprocessing
text_df['text_cleaned'] = text_df['Transcription'].map(removing_unwanted_data)

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_bow_vectorizer.pkl'
model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_logistic_regression_bow.pkl'

vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the data using the loaded vectorizer
X = vectorizer.transform(text_df['text_cleaned'])
y = text_df['Senti']

# Evaluate the loaded model using the entire dataset for testing
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted', zero_division=1)
recall = recall_score(y, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y, y_pred, average='weighted')

# Print evaluation metrics
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Create a DataFrame to hold file names, actual labels, and predicted labels
results_df = pd.DataFrame({
    'Text File': text_df['File'],  # Assuming 'File' column has the audio file names
    'Actual Label': y,
    'Predicted Label': y_pred
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/prediction_label_by_bow.csv'

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print("Model evaluation metrics and results saved successfully.")
print(results_df)


Accuracy: 41.38%
Precision: 0.33
Recall: 0.41
F1 Score: 0.36
Model evaluation metrics and results saved successfully.
                                            Text File  Actual Label  \
0                               1. Mobile_Neutral.wav             0   
1              1. Mobile__Positive_Product_Review.wav             0   
2                  1. Tea_Negative_Product_Review.wav             2   
3      10. Blasphemy_Negative_Perception_Building.wav             2   
4   10. Political Extremisim_Negative_Perception_B...             2   
5      11. Blasphemy_Negative_Perception_Building.wav             2   
6   11. Political Extremisim_Negative_Perception_B...             2   
7    2. Communication_Neutral_Perception Building.wav             0   
8               2. Mobile_Positive_Product_Review.wav             1   
9                  2. Tea_Negative_Product_Review.wav             0   
10   3. Communication_Neutral_Perception Building.wav             0   
11                 3. ICE_Nega

Unnamed: 0,Text File,Actual Label,Predicted Label
0,1. Mobile_Neutral.wav,0,1
1,1. Mobile__Positive_Product_Review.wav,0,1
2,1. Tea_Negative_Product_Review.wav,2,2
3,10. Blasphemy_Negative_Perception_Building.wav,2,2
4,10. Political Extremisim_Negative_Perception_B...,2,2
5,11. Blasphemy_Negative_Perception_Building.wav,2,1
6,11. Political Extremisim_Negative_Perception_B...,2,2
7,2. Communication_Neutral_Perception Building.wav,0,1
8,2. Mobile_Positive_Product_Review.wav,1,1
9,2. Tea_Negative_Product_Review.wav,0,1


In [21]:
results_df.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Actual Labels,Predicted Labels
0,BOW Logistic Regression (Text),0.333333,0.166667,0.333333,0.222222,"[0, 0, 1, 0, 1, 0]","[2, 1, 1, 2, 1, 1]"


In [67]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import os

# Load your dataset
text_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv')

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model')

# Prepare input data
input_ids = []
attention_masks = []

for sentence in text_df['Transcription']:
    encoded_dict = tokenizer.encode_plus(
        sentence,                      # Sentence to encode.
        add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
        max_length = 64,               # Pad & truncate all sentences.
        pad_to_max_length = True,
        return_attention_mask = True,  # Construct attention masks.
        return_tensors = 'pt',         # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(text_df['Senti'].values)

# Create the TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create DataLoader
data_loader = DataLoader(dataset, batch_size=32, shuffle=False)

# Load BERT model for sequence classification.
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model',
    num_labels = 3,  # The number of output labels.
    output_attentions = False,  # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Prediction on the dataset
model.eval()  # Evaluation mode
all_labels = []
all_preds = []

for batch in data_loader:
    batch = tuple(t.to('cuda') if torch.cuda.is_available() else t.to('cpu') for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()
    labels = b_labels.cpu().numpy()

    all_labels.extend(labels)
    all_preds.extend(preds)

# Prepare DataFrame to save results
results_df = pd.DataFrame({
    'Text File': text_df['File'],  # Assuming 'File' column contains the names of the text files
    'Actual Label': all_labels,
    'Predicted Label': all_preds
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/prediction_label_by_BERT.csv'

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print("Model evaluation metrics and results saved successfully.")
print(results_df.head())


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Model evaluation metrics and results saved successfully.
                                           Text File  Actual Label  \
0                              1. Mobile_Neutral.wav             1   
1             1. Mobile__Positive_Product_Review.wav             1   
2                 1. Tea_Negative_Product_Review.wav             2   
3     10. Blasphemy_Negative_Perception_Building.wav             2   
4  10. Political Extremisim_Negative_Perception_B...             2   

   Predicted Label  
0                2  
1                1  
2                2  
3                0  
4                1  


In [68]:
results_df.head(30)

Unnamed: 0,Text File,Actual Label,Predicted Label
0,1. Mobile_Neutral.wav,1,2
1,1. Mobile__Positive_Product_Review.wav,1,1
2,1. Tea_Negative_Product_Review.wav,2,2
3,10. Blasphemy_Negative_Perception_Building.wav,2,0
4,10. Political Extremisim_Negative_Perception_B...,2,1
5,11. Blasphemy_Negative_Perception_Building.wav,1,2
6,11. Political Extremisim_Negative_Perception_B...,2,2
7,2. Communication_Neutral_Perception Building.wav,1,2
8,2. Mobile_Positive_Product_Review.wav,1,2
9,2. Tea_Negative_Product_Review.wav,0,1


In [38]:
import pandas as pd
import numpy as np
import cv2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skimage.feature import hog
import joblib
import os

# Load the CSV files
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')
frames_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Filter frames_df to only include key frames
frames_df = frames_df[frames_df['Key_Frame'] == 'Y']

# Function to preprocess images
def preprocess_image(filepath):
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return None
    img = cv2.imread(filepath)
    if img is None:
        print(f"Failed to read image: {filepath}")
        return None
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (128, 128))
    img = cv2.equalizeHist(img)
    return img

# Apply preprocessing to each image and prepare labels
images = []
labels = []
frame_paths = []  # List to keep track of frame paths for results
video_names = []  # List to store corresponding video names

for idx, row in frames_df.iterrows():
    video_name = row['Video Name'].replace('.wav', '')
    frame_path = row['Frame Path']
    # Check if the frame path contains the video name
    if video_name in frame_path:
        processed_img = preprocess_image(frame_path)
        if processed_img is not None:
            images.append(processed_img)
            labels.append(row['Senti'])
            frame_paths.append(frame_path)  # Store the frame path
            video_names.append(video_name)  # Store the video name without .wav extension

images = np.array(images)
labels = np.array(labels)

# Extract HOG features from the images
hog_features = [hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys') for image in images]

# Load the saved model
joblib_file = "/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_2_random_forest_model.pkl"
loaded_rf_model = joblib.load(joblib_file)

# Evaluate the model
y_pred = loaded_rf_model.predict(hog_features)

# Prepare results for saving
results_df = pd.DataFrame({
    'Video Name': video_names,
    'Frame Path': frame_paths,
    'Actual Sentiment': labels,
    'Predicted Sentiment': y_pred
})

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/frame_pediction_video.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    results_df = pd.concat([existing_df, results_df], ignore_index=True)

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


Results saved to /content/drive/MyDrive/work2/Validation_Data/frame_pediction_video.csv


In [42]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/frame_pediction_video.csv')

# Display the first 5 rows
df.head(1000)


Unnamed: 0,Video Name,Frame Path,Actual Sentiment,Predicted Sentiment
0,1. Mobile_Neutral,/content/drive/MyDrive/work2/Validation_Data/V...,0,2
1,1. Mobile_Neutral,/content/drive/MyDrive/work2/Validation_Data/V...,0,2
2,1. Mobile_Neutral,/content/drive/MyDrive/work2/Validation_Data/V...,0,2
3,1. Mobile_Neutral,/content/drive/MyDrive/work2/Validation_Data/V...,0,2
4,1. Mobile_Neutral,/content/drive/MyDrive/work2/Validation_Data/V...,0,2
...,...,...,...,...
402,9. Blasphemy_Negative_Perception_Building (1),/content/drive/MyDrive/work2/Validation_Data/V...,0,0
403,9. Blasphemy_Negative_Perception_Building (1),/content/drive/MyDrive/work2/Validation_Data/V...,2,0
404,9. Blasphemy_Negative_Perception_Building (1),/content/drive/MyDrive/work2/Validation_Data/V...,2,0
405,9. Blasphemy_Negative_Perception_Building (1),/content/drive/MyDrive/work2/Validation_Data/V...,1,2


In [59]:
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
import os

# Load the CSV files
video_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')
frames_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Function to preprocess images
def preprocess_image(filepath):
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return None
    img = cv2.imread(filepath)
    if img is None:
        print(f"Failed to read image: {filepath}")
        return None
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))  # Resize to match MobileNetV2 input
    img = preprocess_input(img)  # Preprocess as per MobileNetV2 requirements
    return img

# Load the saved TensorFlow model
model_path = '/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_1_CNN_with_MobileNetV2.h5'
loaded_model = tf.keras.models.load_model(model_path)

# List to store results
results = []

# Process each video
for video_name in video_df['File']:
    video_base_name = video_name.replace('.wav', '')

    # Filter frames related to the current video and are key frames
    video_frames = frames_df[(frames_df['Video Name'].str.replace('.wav', '') == video_base_name) & (frames_df['Key_Frame'] == 'Y')]

    # Check if there are any key frames for this video
    if video_frames.empty:
        print(f"No key frames found for video: {video_name}")
        continue

    # Apply preprocessing to each image and prepare features
    for frame_path, true_label in zip(video_frames['Frame Path'], video_frames['Senti']):
        processed_img = preprocess_image(frame_path)
        if processed_img is not None:
            video_image = np.expand_dims(processed_img, axis=0)
            video_pred = loaded_model.predict(video_image)[0]
            predicted_label = np.argmax(video_pred)
            results.append({
                'Text File': frame_path,
                'Actual Label': true_label,
                'Predicted Label': predicted_label
            })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Path to the results file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Frame_Prediction_Results_CNN.csv'

# Save the DataFrame to the CSV file
results_df.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


No key frames found for video: 9. Blasphemy_Negative_Perception_Building.wav
Results saved to /content/drive/MyDrive/work2/Validation_Data/Frame_Prediction_Results_CNN.csv


# **Table Creation for Sentiment Classfication based on 1 Video & its corresponding modalities (Audio,Text, Frame)**

##Snippet 1: Audio Model 1 (LSTM)

In [6]:
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import joblib

# Load your datasets
audio_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv')

# Get the first audio file
first_audio_file = audio_df.iloc[0]['File']

# Load the pre-trained model and scaler
audio_model_path = '/content/drive/MyDrive/work2/Final/Old_audio_models/Audio_Model_1_LSTM.h5'
audio_scaler_path = '/content/drive/MyDrive/work2/Final/Old_audio_models/scaler_lstm.pkl'
audio_model = tf.keras.models.load_model(audio_model_path)
audio_scaler = joblib.load(audio_scaler_path)

# Helper function for audio feature extraction
def preprocess_audio(file_path):
    y, sr = librosa.load(file_path, duration=3)
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1)
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr), axis=1)
    energy_contour = np.mean(librosa.feature.rms(y=y), axis=1)
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr), axis=1)
    return np.concatenate((mfccs, mel, contrast, bandwidth, energy_contour, spectral_rolloff))

# Predict audio sentiment
audio_path = '/content/drive/MyDrive/work2/Validation_Data/Audios/' + first_audio_file
audio_features = preprocess_audio(audio_path)
audio_features_scaled = audio_scaler.transform([audio_features])
audio_features_scaled = np.expand_dims(audio_features_scaled, -1)
audio_pred = audio_model.predict(audio_features_scaled)[0]
audio_sentiment = np.argmax(audio_pred)
audio_percentage = audio_pred[audio_sentiment]

# Save results to a DataFrame
results_df_audio1 = pd.DataFrame({
    'Modality': ['Audio Model 1'],
    'Sentiment': ['neutral' if audio_sentiment == 0 else 'positive' if audio_sentiment == 1 else 'negative'],
    'Percentage': [audio_percentage]
})

# Save to CSV for combining later
results_df_audio1.to_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_audio1.csv', index=False)




##Snippet 2: Audio Model 2 (Random Forest)

In [7]:
import numpy as np
import pandas as pd
import librosa
import joblib

# Load your datasets
audio_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Audio.csv')

# Get the first audio file
first_audio_file = audio_df.iloc[0]['File']

# Load the pre-trained model and scaler
audio_model_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest.joblib'
audio_scaler_path = '/content/drive/MyDrive/work2/Final/Audio_Models/Audio_Model_2_RandomForest_Scaler.joblib'
audio_model = joblib.load(audio_model_path)
audio_scaler = joblib.load(audio_scaler_path)

# Helper function for audio feature extraction
def preprocess_audio(file_path):
    y, sr = librosa.load(file_path, duration=3)
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
    mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
    return np.concatenate((mfccs, chroma, mel, spectral_contrast, tonnetz))

# Predict audio sentiment
audio_path = '/content/drive/MyDrive/work2/Validation_Data/Audios/' + first_audio_file
audio_features = preprocess_audio(audio_path)

# Ensure the feature vector has the correct shape
if audio_features.shape[0] == 151:
    audio_features = np.pad(audio_features, (0, 15), 'constant')  # Assuming 15 features are missing
elif audio_features.shape[0] == 166:
    audio_features = audio_features[:166]  # Assuming extra features need to be trimmed
else:
    raise ValueError(f"Unexpected number of features: {audio_features.shape[0]}")

audio_features_scaled = audio_scaler.transform([audio_features])
audio_pred = audio_model.predict_proba(audio_features_scaled)[0]
audio_sentiment = np.argmax(audio_pred)
audio_percentage = audio_pred[audio_sentiment]

# Save results to a DataFrame
results_df_audio2 = pd.DataFrame({
    'Modality': ['Audio Model 2'],
    'Sentiment': ['neutral' if audio_sentiment == 0 else 'positive' if audio_sentiment == 1 else 'negative'],
    'Percentage': [audio_percentage]
})

# Save to CSV for combining later
results_df_audio2.to_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_audio2.csv', index=False)

print(results_df_audio2)


        Modality Sentiment  Percentage
0  Audio Model 2  negative        0.44


##Snippet 3: Text Model 1 (BERT)

In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import pandas as pd

# Load your datasets
text_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv')

# Get the first text transcription
first_text = text_df.iloc[0]['Transcription']

# Load the pre-trained model and tokenizer
text_model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_model_1_BERT_Model'
text_tokenizer = BertTokenizer.from_pretrained(text_model_path)
text_model = BertForSequenceClassification.from_pretrained(text_model_path)

# Tokenize and encode the text
encoded_dict = text_tokenizer.encode_plus(
    first_text,
    add_special_tokens=True,
    max_length=64,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt'
)
input_ids = encoded_dict['input_ids']
attention_mask = encoded_dict['attention_mask']

# Predict text sentiment
text_model.eval()
with torch.no_grad():
    outputs = text_model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=1).detach().cpu().numpy()[0]

text_sentiment = np.argmax(probs)
text_percentage = probs[text_sentiment]

# Save results to a DataFrame
results_df_text1 = pd.DataFrame({
    'Modality': ['Text Model 1'],
    'Sentiment': ['neutral' if text_sentiment == 0 else 'positive' if text_sentiment == 1 else 'negative'],
    'Percentage': [text_percentage]
})

# Save to CSV for combining later
results_df_text1.to_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_text1.csv', index=False)


##Snippet 4: Text Model 2 (Logistic Regression with BOW)

In [13]:
import numpy as np
import pandas as pd
import re
import nltk
import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Load your datasets
text_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Text.csv')

# Get the first text transcription
first_text = text_df.iloc[0]['Transcription']

# Function to remove unwanted characters
def removing_unwanted_data(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

# Custom tokenizer function (as it was used when creating the vectorizer)
def custom_tokenizer(doc):
    return doc

# Apply preprocessing
cleaned_text = removing_unwanted_data(first_text)

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_bow_vectorizer.pkl'
model_path = '/content/drive/MyDrive/work2/Final/Text_Work/Text_Models_2_and_3/Text_model_2_logistic_regression_bow.pkl'

# Ensure the custom tokenizer is available when loading the vectorizer
vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the data using the loaded vectorizer
X = vectorizer.transform([' '.join(cleaned_text)])

# Predict text sentiment
text_pred = model.predict_proba(X)[0]
text_sentiment = np.argmax(text_pred)
text_percentage = text_pred[text_sentiment]

# Save results to a DataFrame
results_df_text2 = pd.DataFrame({
    'Modality': ['Text Model 2'],
    'Sentiment': ['neutral' if text_sentiment == 0 else 'positive' if text_sentiment == 1 else 'negative'],
    'Percentage': [text_percentage]
})

# Save to CSV for combining later
results_df_text2.to_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_text2.csv', index=False)

print("Results saved successfully for Text Model 2.")


Results saved successfully for Text Model 2.


##Snippet 5: Video Model 1 (CNN with MobileNetV2)

In [16]:
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

# Load your datasets
frames_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Get the first video name without the .wav extension
first_video_name = frames_df.iloc[0]['Video Name'].replace('.wav', '')

# Load the pre-trained model
video_model_path = '/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_1_CNN_with_MobileNetV2.h5'
video_model = tf.keras.models.load_model(video_model_path)

# Helper function for video feature extraction
def preprocess_image(filepath):
    img = cv2.imread(filepath)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = preprocess_input(img)
    return img

# Filter frames related to the first video and are key frames
video_frames = frames_df[(frames_df['Video Name'].str.replace('.wav', '') == first_video_name) & (frames_df['Key_Frame'] == 'Y')]

# Predict video sentiment for all key frames
predictions = []

for frame_path in video_frames['Frame Path']:
    video_image = preprocess_image(frame_path)
    video_image = np.expand_dims(video_image, axis=0)
    video_pred = video_model.predict(video_image)[0]
    predictions.append(video_pred)

# Aggregate predictions
if predictions:
    average_prediction = np.mean(predictions, axis=0)
    video_sentiment = np.argmax(average_prediction)
    video_percentage = average_prediction[video_sentiment]

    # Save results to a DataFrame
    results_df_video1 = pd.DataFrame({
        'Modality': ['Video Model 1'],
        'Sentiment': ['neutral' if video_sentiment == 0 else 'positive' if video_sentiment == 1 else 'negative'],
        'Percentage': [video_percentage]
    })

    # Save to CSV for combining later
    results_df_video1.to_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_video1.csv', index=False)

    print("Results saved successfully for Video Model 1.")
else:
    print("No key frames found for the first video.")


Results saved successfully for Video Model 1.


##Snippet 6: Video Model 2 (Random Forest)

In [17]:
import numpy as np
import pandas as pd
import cv2
import joblib
from skimage.feature import hog

# Load your datasets
frames_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Frames_Meta_Data.csv')

# Get the first video name without the .wav extension
first_video_name = frames_df.iloc[0]['Video Name'].replace('.wav', '')

# Load the pre-trained model
video_model_path = '/content/drive/MyDrive/work2/Final/Image_Models/Video_Model_2_random_forest_model.pkl'
video_model = joblib.load(video_model_path)

# Helper function for video feature extraction
def preprocess_image(filepath):
    img = cv2.imread(filepath)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (128, 128))
    img = cv2.equalizeHist(img)
    hog_features = hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys')
    return hog_features

# Filter frames related to the first video and are key frames
video_frames = frames_df[(frames_df['Video Name'].str.replace('.wav', '') == first_video_name) & (frames_df['Key_Frame'] == 'Y')]

# Predict video sentiment for all key frames
predictions = []

for frame_path in video_frames['Frame Path']:
    video_features = preprocess_image(frame_path).reshape(1, -1)
    video_pred = video_model.predict_proba(video_features)[0]
    predictions.append(video_pred)

# Aggregate predictions
if predictions:
    average_prediction = np.mean(predictions, axis=0)
    video_sentiment = np.argmax(average_prediction)
    video_percentage = average_prediction[video_sentiment]

    # Save results to a DataFrame
    results_df_video2 = pd.DataFrame({
        'Modality': ['Video Model 2'],
        'Sentiment': ['neutral' if video_sentiment == 0 else 'positive' if video_sentiment == 1 else 'negative'],
        'Percentage': [video_percentage]
    })

    # Save to CSV for combining later
    results_df_video2.to_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_video2.csv', index=False)

    print("Results saved successfully for Video Model 2.")
else:
    print("No key frames found for the first video.")


Results saved successfully for Video Model 2.


## Saving Results

In [19]:
import pandas as pd
import os
# Load the temporary results from each model
results_audio1 = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_audio1.csv')
results_audio2 = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_audio2.csv')
results_text1 = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_text1.csv')
results_text2 = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_text2.csv')
results_video1 = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_video1.csv')
results_video2 = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Temp_results_video2.csv')

# Combine all results into one DataFrame
all_results = pd.concat([results_audio1, results_audio2, results_text1, results_text2, results_video1, results_video2])

# Determine the modality with the greatest percentage
max_percentage_idx = all_results['Percentage'].idxmax()
max_percentage_modality = all_results.loc[max_percentage_idx, 'Modality']
max_percentage_sentiment = all_results.loc[max_percentage_idx, 'Sentiment']

# Add considered sentiment to the results DataFrame
all_results['Considered Sentiment'] = None
all_results.at[max_percentage_idx, 'Considered Sentiment'] = max_percentage_sentiment

# Display the results table
print(all_results)

# Save the final results to the validation CSV file
results_file = '/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation_modelity.csv'

# Check if the results file exists
if os.path.exists(results_file):
    # If it exists, read the existing data
    existing_df = pd.read_csv(results_file)
    # Append the new results
    all_results = pd.concat([existing_df, all_results], ignore_index=True)

# Save the DataFrame to the CSV file
all_results.to_csv(results_file, index=False)

print(f"Results saved to {results_file}")


        Modality Sentiment  Percentage Considered Sentiment
0  Audio Model 1  positive    0.428967             positive
0  Audio Model 2  negative    0.440000             negative
0   Text Model 1  negative    0.955447             negative
0   Text Model 2  negative    0.999999             negative
0  Video Model 1  positive    0.470676             positive
0  Video Model 2  negative    0.365104             negative
Results saved to /content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation_modelity.csv


In [43]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/work2/Validation_Data/Final_Model_Validation_modelity.csv")

# Display the first 10 rows of the DataFrame
display(df.head(10))


Unnamed: 0,Modality,Sentiment,Percentage,Considered Sentiment
0,Audio Model 1,positive,0.428967,positive
1,Audio Model 2,negative,0.44,negative
2,Text Model 1,negative,0.955447,negative
3,Text Model 2,negative,0.999999,negative
4,Video Model 1,positive,0.470676,positive
5,Video Model 2,negative,0.365104,negative


#Modelity Work

In [44]:
import pandas as pd
import numpy as np

# Specific predictions data
data = {
    'Modality': [
        'Audio Model 1', 'Audio Model 2', 'Text Model 1', 'Text Model 2', 'Video Model 1', 'Video Model 2'
    ],
    'Sentiment': [
        'positive', 'negative', 'negative', 'negative', 'positive', 'negative'
    ],
    'Percentage': [
        0.428967, 0.440000, 0.955447, 0.999999, 0.470676, 0.365104
    ],
    'Considered Sentiment': [
        'positive', 'negative', 'negative', 'negative', 'positive', 'negative'
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

# Weights for each modality (adjust these weights as needed)
modality_weights = {
    'Audio Model 1': 1.0,
    'Audio Model 2': 1.0,
    'Text Model 1': 1.0,
    'Text Model 2': 1.0,
    'Video Model 1': 1.0,
    'Video Model 2': 1.0
}

# Apply weights to each prediction percentage
df['Weighted Percentage'] = df.apply(lambda row: row['Percentage'] * modality_weights[row['Modality']], axis=1)

# Aggregate the sentiments with their weighted percentages
sentiment_scores = df.groupby('Sentiment')['Weighted Percentage'].sum()

# Determine the final sentiment
final_sentiment = sentiment_scores.idxmax()
final_percentage = sentiment_scores.max()

# Print the DataFrame
print(df)

# Load the Videos.csv file to get video names and labels
videos_df = pd.read_csv('/content/drive/MyDrive/work2/Validation_Data/Meta_Data/Videos.csv')

# Assume we are working with the first video
video_name = videos_df.iloc[0]['File']
video_label = videos_df.iloc[0]['Senti']

# Print and save the final classification
final_classification = {
    'Video Name': [video_name],
    'True Label': [video_label],
    'Predicted Sentiment': [final_sentiment],
    'Final Percentage': [final_percentage]
}

final_classification_df = pd.DataFrame(final_classification)

# Save the final classification to a CSV file
final_classification_output_path = '/content/drive/MyDrive/work2/Validation_Data/Final_Classification_Results.csv'
final_classification_df.to_csv(final_classification_output_path, index=False)

print("Final classification results saved successfully.")
print(final_classification_df)


        Modality Sentiment  Percentage Considered Sentiment  \
0  Audio Model 1  positive    0.428967             positive   
1  Audio Model 2  negative    0.440000             negative   
2   Text Model 1  negative    0.955447             negative   
3   Text Model 2  negative    0.999999             negative   
4  Video Model 1  positive    0.470676             positive   
5  Video Model 2  negative    0.365104             negative   

   Weighted Percentage  
0             0.428967  
1             0.440000  
2             0.955447  
3             0.999999  
4             0.470676  
5             0.365104  
Final classification results saved successfully.
              Video Name  True Label Predicted Sentiment  Final Percentage
0  1. Mobile_Neutral.wav           0            negative           2.76055


In [45]:
final_classification_df.head()

Unnamed: 0,Video Name,True Label,Predicted Sentiment,Final Percentage
0,1. Mobile_Neutral.wav,0,negative,2.76055
