In [1]:


from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install SpeechRecognition
!pip install google-cloud-speech pandas

In [1]:
import os
import pandas as pd
import numpy as np
import cv2
import librosa
import tensorflow as tf
import joblib
from sklearn.preprocessing import LabelEncoder
from skimage.feature import hog
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import torch
import speech_recognition as sr

# Paths to the models and scalers
bert_model_path = '/content/drive/MyDrive/work2/Final/Text_Work/BERTModel'
video_model1_path = '/content/drive/MyDrive/work2/Final/Image_Models/sentiment_model.h5'
video_model2_path = "/content/drive/MyDrive/work2/Final/Image_Models/random_forest_model.pkl"
audio_model1_path = '/content/drive/MyDrive/work2/Final/best_audio_model/best_model.h5'
audio_model2_path = '/content/drive/MyDrive/work2/Final/audio_models/Audio_sentiment.h5'
scaler_path = '/content/drive/MyDrive/work2/Final/audio_models/scaler.pkl'

# Output paths
output_csv_path = '/content/drive/MyDrive/Use_Case/transcription/test3_UseCases.csv'
frames_csv_path = '/content/drive/MyDrive/Use_Case/frames_path/Test3_frames_info.csv'

# Function to preprocess images for MobileNetV2
def preprocess_image_mobilenet(filepath):
    img = cv2.imread(filepath)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
    return img

# Function to preprocess images for RandomForest
def preprocess_image_rf(filepath):
    img = cv2.imread(filepath)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (48, 48))
    img = cv2.equalizeHist(img)
    return img

# Function to extract features from audio files
def extract_features(file_path):
    y, sr = librosa.load(file_path, duration=3)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    features = np.mean(np.concatenate((mfccs, chroma, mel, contrast), axis=0), axis=1)
    return features

# Function to transcribe audio using Google Speech Recognition
def transcribe_audio(audio_path, language="ur-PK"):
    recognizer = sr.Recognizer()
    audio_file = sr.AudioFile(audio_path)
    with audio_file as source:
        audio_data = recognizer.record(source)
    try:
        transcript = recognizer.recognize_google(audio_data, language=language)
        return transcript
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Could not request results from Google Speech Recognition service; {e}"

# Function to perform sentiment prediction using BERT
def predict_text_sentiment(transcript):
    tokenizer = BertTokenizer.from_pretrained(bert_model_path)
    model = BertForSequenceClassification.from_pretrained(bert_model_path)
    encoded_dict = tokenizer.encode_plus(
        transcript, add_special_tokens=True, max_length=64, truncation=True,
        padding='max_length', return_attention_mask=True, return_tensors='pt'
    )
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)
    model.eval()
    predictions = []
    for batch in dataloader:
        batch = tuple(t.to('cpu') for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    predicted_label = np.argmax(predictions, axis=2).flatten()
    label_encoder = LabelEncoder()
    label_encoder.fit(['Neutral', 'Positive', 'Negative'])
    predicted_sentiment = label_encoder.inverse_transform(predicted_label)
    return predicted_sentiment[0]

# Function to predict sentiment using MobileNetV2
def predict_video_sentiment_mobilenet(image_path):
    model = tf.keras.models.load_model(video_model1_path)
    img = preprocess_image_mobilenet(image_path)
    img = np.expand_dims(img, axis=0)  # Expand dimensions to match input shape
    prediction = model.predict(img)
    predicted_label = np.argmax(prediction, axis=1)[0]
    return predicted_label

# Function to predict sentiment using RandomForest
def predict_video_sentiment_rf(image_path):
    model = joblib.load(video_model2_path)
    img = preprocess_image_rf(image_path)
    hog_feature = hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys')
    prediction = model.predict([hog_feature])
    return prediction[0]

# Function to predict sentiment using LSTM
def predict_audio_sentiment_lstm(features):
    model = tf.keras.models.load_model(audio_model1_path)
    scaler = joblib.load(scaler_path)
    scaled_features = scaler.transform([features])
    reshaped_features = np.expand_dims(scaled_features, -1)
    prediction = model.predict(reshaped_features)
    predicted_label = np.argmax(prediction, axis=1)
    label_encoder = LabelEncoder()
    label_encoder.fit(['Neutral', 'Positive', 'Negative'])
    predicted_sentiment = label_encoder.inverse_transform(predicted_label)
    return predicted_sentiment[0]

# Function to predict sentiment using another audio model
def predict_audio_sentiment_other(features):
    model = tf.keras.models.load_model(audio_model2_path)
    scaler = joblib.load(scaler_path)
    scaled_features = scaler.transform([features])
    reshaped_features = np.expand_dims(scaled_features, -1)
    prediction = model.predict(reshaped_features)
    predicted_label = np.argmax(prediction, axis=1)
    label_encoder = LabelEncoder()
    label_encoder.fit(['Neutral', 'Positive', 'Negative'])
    predicted_sentiment = label_encoder.inverse_transform(predicted_label)
    return predicted_sentiment[0]

# Main function to perform multimodal sentiment analysis
def multimodal_sentiment_analysis(video_file_path):
    video_name = os.path.basename(video_file_path).split('.')[0]
    audio_file_path = f'/content/drive/MyDrive/Use_Case/test_converted_audio/{video_name}.wav'
    output_folder = f'/content/drive/MyDrive/Use_Case/Frames/{video_name}'
    os.makedirs(output_folder, exist_ok=True)

    # Extract frames from video and perform predictions on-the-fly to save memory
    cap = cv2.VideoCapture(video_file_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    video_sentiments_mobilenet = []
    video_sentiments_rf = []

    success, frame = cap.read()
    while success:
        frame_path = os.path.join(output_folder, f'{video_name}_frame.jpg')
        cv2.imwrite(frame_path, frame)

        # Perform predictions
        video_sentiments_mobilenet.append(predict_video_sentiment_mobilenet(frame_path))
        video_sentiments_rf.append(predict_video_sentiment_rf(frame_path))

        success, frame = cap.read()
    cap.release()

    # Determine overall video sentiment
    video_sentiment_mobilenet = pd.Series(video_sentiments_mobilenet).value_counts().idxmax()
    video_sentiment_rf = pd.Series(video_sentiments_rf).value_counts().idxmax()

    video_sentiment_mobilenet = ["Neutral", "Positive", "Negative"][video_sentiment_mobilenet]
    video_sentiment_rf = ["Neutral", "Positive", "Negative"][video_sentiment_rf]

    # Transcribe audio
    transcript = transcribe_audio(audio_file_path)

    # Extract audio features
    audio_features = extract_features(audio_file_path)

    # Perform predictions
    text_sentiment = predict_text_sentiment(transcript)
    audio_sentiment_lstm = predict_audio_sentiment_lstm(audio_features)
    audio_sentiment_other = predict_audio_sentiment_other(audio_features)

    # Prepare the output DataFrame
    df_main = pd.DataFrame(columns=['file', 'modality', 'senti', 'Model1', 'Model2', 'Model3'])

    df_main = df_main.append({'file': video_name, 'modality': 'T', 'senti': text_sentiment, 'Model1': text_sentiment, 'Model2': '', 'Model3': ''}, ignore_index=True)
    df_main = df_main.append({'file': video_name, 'modality': 'V', 'senti': '', 'Model1': video_sentiment_mobilenet, 'Model2': video_sentiment_rf, 'Model3': ''}, ignore_index=True)
    df_main = df_main.append({'file': video_name, 'modality': 'A', 'senti': '', 'Model1': audio_sentiment_lstm, 'Model2': audio_sentiment_other, 'Model3': ''}, ignore_index=True)

    # Save the final report to CSV
    df_main.to_csv(output_csv_path, index=False)
    print(f"Multimodal sentiment analysis report saved to {output_csv_path}")

# Example usage
video_file_path = '/content/drive/MyDrive/Use_Case/test_video/Test.mp4'
multimodal_sentiment_analysis(video_file_path)












AttributeError: 'DataFrame' object has no attribute 'append'