In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import random
import os
import nltk
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/saahil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# train_df = pd.read_csv('train_emotion.csv')

train_df = pd.read_csv("train_emotion.csv",encoding="latin-1")
# Define path to video clips
video_dir = 'train_data'


# Function to get video file path from IDs
def get_video_clip_path(row):
    # print(row)
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']
    filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
    return os.path.join(video_dir, filename)

# Apply the function to get file paths for each sampled clip
train_df['video_clip_path'] = train_df.apply(get_video_clip_path, axis=1)

print(train_df['Emotion'].unique())
# Check sample paths
# print(train_df[['Dialogue_ID', 'Utterance_ID', 'video_clip_path']].head())
# print(train_df.head())


['neutral' 'joy' 'anger' 'surprise' 'sadness']


In [4]:
train_df.shape

(1000, 11)

In [5]:
#preprocessing:

#drop null values
train_df.dropna(inplace=True)

#extract emotion, map them to the number of train instances.
emotion = train_df['Emotion'].unique()
for e in emotion:
    n = train_df[train_df['Emotion'] == e].shape[0]
    print(f'{e} : {n}')


#stem utterance, store in col "Utterance_stemmed"
stemmer = nltk.stem.PorterStemmer()

def stem_text(text):
    if isinstance(text,str):
        words = nltk.word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        return ' '.join(stemmer.stem(word) for word in filtered_words)

train_df['Utterance_stemmed'] = train_df['Utterance'].astype(str).apply(stem_text)

# train_df['Utterance_stemmed'].head()

#Vectorise the text using Word2Vec
def wtov():
    train_df['Utterance_tokenized'] = train_df['Utterance_stemmed'].apply(lambda x: word_tokenize(x.lower()))
    model = Word2Vec(sentences=train_df['Utterance_tokenized'],vector_size=500,window=5,min_count=1,sg=0)
    return model
vectoriser_model = wtov()

# print("Vocab: ",model.wv.key_to_index)


vectoriser_model.save("WtoV_from_train.model")


# new_instance = "I'm excited for this opportunity!"
# stop_words = set(nltk.corpus.stopwords.words('english'))
# stemmer = nltk.stem.PorterStemmer()

# vector = get_vector_for_instance(new_instance, model)
# print("Vector representation:", vector)



neutral : 500
joy : 184
anger : 117
surprise : 127
sadness : 72


In [6]:
#Classifying logic
#perform undersampling for emotion
#m is the min count of the emotions
# emotion_counts = train_df['Emotion'].value_counts()
# m = emotion_counts.min()

# # Step 2: Perform undersampling for each emotion class
# undersampled_dfs = []
# for emotion in emotion_counts.index:
#     emotion_df = train_df[train_df['Emotion'] == emotion]
#     undersampled_df = emotion_df.sample(n=m, random_state=42)  # Randomly sample `m` instances
#     undersampled_dfs.append(undersampled_df)

# # Step 3: Concatenate all undersampled dataframes
# undersampled_train_df = pd.concat(undersampled_dfs).reset_index(drop=True)

# # Use the undersampled data as the new training set
# X = undersampled_train_df[["Utterance_tokenized"]]
# y = undersampled_train_df[["Emotion"]]

X = train_df[["Utterance_tokenized","Sr No."]]
y = train_df[["Emotion"]]


X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get the vector representation for the input instance (tokenized text)
def get_vector_for_instance(text, model, vector_size=500):
    vec = np.zeros(vector_size)
    count = 0
    for word in text:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count != 0:
        vec /= count
    return vec

# Find top 5 similar instances based on cosine similarity
# def find_top10_similar_instances(input_text, model, vector_size=500):
#     # Convert input text (tokenized) to vector representation
#     input_vector = get_vector_for_instance(input_text, model, vector_size)
    
#     # Convert each tokenized utterance in train_df to its vector representation
#     train_vectors = train_df['Utterance_tokenized'].apply(lambda x: get_vector_for_instance(x, model, vector_size))
    
#     # Calculate cosine similarity between input_vector and each instance in the train set
#     similarities = train_vectors.apply(lambda x: cosine_similarity([input_vector], [x])[0][0])
    
#     # Get the top 5 most similar instances
#     top5_indices = similarities.nlargest(7).index
#     return train_df.iloc[top5_indices][['Emotion']]  # Adjust columns as needed
def find_top10_similar_instances(input_text, model, vector_size=500):
    # Convert input text (tokenized) to vector representation
    input_vector = get_vector_for_instance(input_text, model, vector_size)
    
    # Convert each tokenized utterance in train_df to its vector representation
    train_vectors = train_df['Utterance_tokenized'].apply(lambda x: get_vector_for_instance(x, model, vector_size))
    
    # Calculate cosine similarity between input_vector and each instance in the train set
    similarities = train_vectors.apply(lambda x: cosine_similarity([input_vector], [x])[0][0])
    
    # Get the top 10 most similar instances
    top10_indices = similarities.nlargest(10).index
    top10_similarities = similarities.nlargest(10).values
    
    # Create a DataFrame with top 10 similar instances and their similarities
    top10_df = train_df.iloc[top10_indices][['Emotion']].copy()
    top10_df['Similarity'] = top10_similarities
    
    return top10_df  # Return both the emotion and similarity score


y_pred = []
# Test the function with test data
for instance in X_test['Utterance_tokenized']:
    top10_similar = find_top10_similar_instances(instance, vectoriser_model)
    emotions_dict = {}

    # Weighted summing of similarities for each emotion
    for idx, row in top10_similar.iterrows():
        emotion = row['Emotion']
        similarity = row['Similarity']
        if emotion not in emotions_dict:
            emotions_dict[emotion] = 0
        emotions_dict[emotion] += similarity

    # Find the emotion with the highest weighted similarity
    predicted_emotion = max(emotions_dict, key=emotions_dict.get)
    y_pred.append(predicted_emotion)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6450
Classification Report:
              precision    recall  f1-score   support

       anger       0.80      0.16      0.27        25
         joy       0.56      0.29      0.38        34
     neutral       0.64      0.94      0.76       108
     sadness       0.60      0.23      0.33        13
    surprise       0.79      0.55      0.65        20

    accuracy                           0.65       200
   macro avg       0.68      0.43      0.48       200
weighted avg       0.66      0.65      0.60       200

Confusion Matrix:
[[  4   2  18   0   1]
 [  0  10  23   1   0]
 [  1   4 101   1   1]
 [  0   0   9   3   1]
 [  0   2   7   0  11]]


In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
import cv2
import numpy as np
import os
from tqdm import tqdm
import pandas as pd
from tensorflow.keras.mixed_precision import set_global_policy


# Enable mixed precision for better performance on Apple silicon
set_global_policy('mixed_float16')


def create_emotion_model(input_shape=(48, 48, 1), num_classes=7):
    model = Sequential()
    model.add(Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    model.add(Conv2D(128, (3, 3), padding='same', activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    model.add(Conv2D(256, (3, 3), padding='same', activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(num_classes, activation='softmax', dtype='float32'))
    return model

def extract_frames_from_video(video_path, target_size=(48, 48)):
    frames = []
    if not os.path.exists(video_path):
        print(f"Warning: Video file not found: {video_path}")
        return np.array(frames)
        
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    with tqdm(total=total_frames, desc=f"Processing {os.path.basename(video_path)}") as pbar:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
                
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
            faces = face_cascade.detectMultiScale(gray, 1.1, 4)
            
            if len(faces) > 0:
                face = max(faces, key=lambda x: x[2] * x[3])
                x, y, w, h = face
                face_roi = gray[y:y+h, x:x+w]
                face_roi = cv2.resize(face_roi, target_size)
                face_roi = face_roi.astype('float16') / 255.0
                frames.append(face_roi)
            pbar.update(1)
    
    cap.release()
    return np.array(frames)

def prepare_training_data(df):
    X, y = [], []
    emotion_mapping = {'neutral': 0, 'anger': 1, 'joy': 2, 'sadness': 3, 'surprise': 4}
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing videos"):
        video_path = row['video_clip_path']
        emotion = row['Emotion']
        
        frames = extract_frames_from_video(video_path)
        if len(frames) > 0:
            X.extend(frames)
            y.extend([emotion_mapping[emotion]] * len(frames))
    
    X = np.array(X, dtype='float16')
    X = np.expand_dims(X, axis=-1)
    y = tf.keras.utils.to_categorical(y, num_classes=7)
    
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(buffer_size=len(X)).batch(32).prefetch(tf.data.AUTOTUNE)
    
    return dataset

def train_model(dataset, epochs=50):
    model = create_emotion_model()
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint('emotion_classification_model.h5', monitor='val_accuracy', save_best_only=True),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-7)
    ]
    
    print("Starting training...")
    history = model.fit(dataset, epochs=epochs, validation_data=dataset, callbacks=callbacks)
    return model, history

def extract_features(video_path, model, batch_size=32):
    frames = extract_frames_from_video(video_path)
    if len(frames) == 0:
        return None
    frames = np.expand_dims(frames, axis=-1)
    
    batch_predictions = []
    for i in range(0, len(frames), batch_size):
        batch = frames[i:i+batch_size]
        predictions = model.predict(batch, verbose=0)
        batch_predictions.append(predictions)
    
    predictions = np.concatenate(batch_predictions, axis=0)
    avg_predictions = np.mean(predictions, axis=0)
    
    return avg_predictions

def process_dataframe(df, model, output_path='emotion_features.csv'):
    emotion_labels = ['neutral', 'anger', 'joy', 'sadness', 'surprise']
    features_list = []
    
    print("\nExtracting features from videos...")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        video_path = row['video_clip_path']
        features = extract_features(video_path, model)
        
        if features is not None:
            feature_dict = row.to_dict()
            for emotion, value in zip(emotion_labels, features):
                feature_dict[f'emotion_{emotion}'] = value
            features_list.append(feature_dict)
    
    features_df = pd.DataFrame(features_list)
    features_df.to_csv(output_path, index=False)
    print(f"\nFeatures saved to {output_path}")
    
    return features_df

if __name__ == "__main__":
    # Load your DataFrame here, assuming it has 'video_clip_path' and 'Emotion' columns
    df = train_df
    
    try:
        if os.path.exists('emotion_classification_model.h5'):
            print("Loading pre-trained model...")
            model = create_emotion_model()
            model.load_weights('emotion_classification_model.h5')
        else:
            print("No pre-trained weights found. Starting training...")
            dataset = prepare_training_data(df)
            model, history = train_model(dataset, epochs=3)
            
            # Plot training history
            import matplotlib.pyplot as plt
            plt.figure(figsize=(12, 4))
            plt.subplot(1, 2, 1)
            plt.plot(history.history['accuracy'], label='Training Accuracy')
            plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
            plt.title('Model Accuracy')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.legend()
            
            plt.subplot(1, 2, 2)
            plt.plot(history.history['loss'], label='Training Loss')
            plt.plot(history.history['val_loss'], label='Validation Loss')
            plt.title('Model Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.legend()
            
            plt.tight_layout()
            plt.savefig('training_history.png')
            plt.close()
        
        # Process videos and extract features
        print("\nStarting feature extraction...")
        features_df = process_dataframe(df, model)
        
        # Print summary statistics
        print("\nFeature Extraction Summary:")
        print(f"Total videos processed: {len(features_df)}")
        emotion_columns = [col for col in features_df.columns if col.startswith('emotion_')]
        for col in emotion_columns:
            print(f"{col.replace('emotion_', '')}: {features_df[col].mean():.3f}")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


No pre-trained weights found. Starting training...


Processing dia0_utt7.mp4: 100%|██████████| 137/137 [00:08<00:00, 15.84it/s]
Processing dia0_utt11.mp4: 100%|██████████| 199/199 [00:16<00:00, 12.04it/s]
Processing dia2_utt8.mp4: 100%|██████████| 16/16 [00:01<00:00, 11.51it/s]
Processing dia3_utt3.mp4: 100%|██████████| 58/58 [00:04<00:00, 12.12it/s]
Processing dia3_utt5.mp4: 100%|██████████| 70/70 [00:06<00:00, 11.49it/s]
Processing dia4_utt1.mp4: 100%|██████████| 3/3 [00:00<00:00, 15.13it/s]
Processing dia4_utt8.mp4: 100%|██████████| 193/193 [00:11<00:00, 16.37it/s]
Processing dia4_utt14.mp4: 100%|██████████| 16/16 [00:00<00:00, 16.90it/s]
Processing dia5_utt1.mp4: 100%|██████████| 4/4 [00:00<00:00, 13.48it/s]
Processing dia6_utt7.mp4: 100%|██████████| 121/121 [00:06<00:00, 17.54it/s]
Processing dia6_utt9.mp4: 100%|██████████| 24/24 [00:01<00:00, 18.00it/s]
Processing dia6_utt11.mp4: 100%|██████████| 81/81 [00:04<00:00, 18.09it/s]
Processing dia6_utt18.mp4: 100%|██████████| 28/28 [00:01<00:00, 16.14it/s]
Processing dia11_utt0.mp4: 100

Starting training...
Epoch 1/3


2024-11-11 21:10:28.453646: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3


  saving_api.save_model(


Epoch 3/3

Starting feature extraction...

Extracting features from videos...


Processing dia0_utt7.mp4: 100%|██████████| 137/137 [00:10<00:00, 13.56it/s]
Processing dia0_utt11.mp4: 100%|██████████| 199/199 [00:16<00:00, 12.38it/s]
Processing dia2_utt8.mp4: 100%|██████████| 16/16 [00:01<00:00,  9.97it/s]
Processing dia3_utt3.mp4: 100%|██████████| 58/58 [00:05<00:00, 11.07it/s]
Processing dia3_utt5.mp4: 100%|██████████| 70/70 [00:07<00:00,  9.70it/s]
Processing dia4_utt1.mp4: 100%|██████████| 3/3 [00:00<00:00, 17.48it/s]
Processing dia4_utt8.mp4: 100%|██████████| 193/193 [00:11<00:00, 16.47it/s]
Processing dia4_utt14.mp4: 100%|██████████| 16/16 [00:00<00:00, 17.57it/s]
Processing dia5_utt1.mp4: 100%|██████████| 4/4 [00:00<00:00, 13.71it/s]
Processing dia6_utt7.mp4: 100%|██████████| 121/121 [00:07<00:00, 17.00it/s]
Processing dia6_utt9.mp4: 100%|██████████| 24/24 [00:01<00:00, 17.60it/s]
Processing dia6_utt11.mp4: 100%|██████████| 81/81 [00:04<00:00, 16.66it/s]
Processing dia6_utt18.mp4: 100%|██████████| 28/28 [00:01<00:00, 16.31it/s]
Processing dia11_utt0.mp4: 100


Features saved to emotion_features.csv

Feature Extraction Summary:
Total videos processed: 997
neutral: 0.540
anger: 0.122
joy: 0.191
sadness: 0.071
surprise: 0.076





In [8]:
# Define path to video clips
df = pd.read_csv('../set_2_test/test_emotion.csv',encoding= "latin-1")
video_dir = '../set_2_test/test_data/'


# Function to get video file path from IDs
def get_video_clip_path(row):
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']
    filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
    return os.path.join(video_dir, filename)

# Apply the function to get file paths for each sampled clip
df['video_clip_path'] = df.apply(get_video_clip_path, axis=1)

# Check sample paths
print(df[['Dialogue_ID', 'Utterance_ID', 'video_clip_path']].head())

   Dialogue_ID  Utterance_ID                         video_clip_path
0            3             0   ../set_2_test/test_data/dia3_utt0.mp4
1            5             7   ../set_2_test/test_data/dia5_utt7.mp4
2           13             2  ../set_2_test/test_data/dia13_utt2.mp4
3           13             6  ../set_2_test/test_data/dia13_utt6.mp4
4           14             1  ../set_2_test/test_data/dia14_utt1.mp4


In [13]:
test_emotion_probabilities = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    video_path = row['video_clip_path']
    probabilities = extract_features(video_path, model)
    test_emotion_probabilities.append(probabilities)

test_emotion_probabilities = np.array(test_emotion_probabilities)


Processing dia3_utt0.mp4: 100%|██████████| 48/48 [00:03<00:00, 15.74it/s]
Processing dia5_utt7.mp4: 100%|██████████| 92/92 [00:07<00:00, 12.19it/s]
Processing dia13_utt2.mp4: 100%|██████████| 54/54 [00:03<00:00, 14.27it/s]
Processing dia13_utt6.mp4: 100%|██████████| 107/107 [00:06<00:00, 16.32it/s]
Processing dia14_utt1.mp4: 100%|██████████| 65/65 [00:05<00:00, 11.92it/s]
Processing dia16_utt0.mp4: 100%|██████████| 49/49 [00:03<00:00, 12.65it/s]
Processing dia17_utt13.mp4: 100%|██████████| 141/141 [00:06<00:00, 21.21it/s]
Processing dia21_utt6.mp4: 100%|██████████| 56/56 [00:03<00:00, 15.27it/s]
Processing dia25_utt15.mp4: 100%|██████████| 3/3 [00:00<00:00, 11.43it/s]
Processing dia28_utt0.mp4: 100%|██████████| 88/88 [00:05<00:00, 15.50it/s]
Processing dia33_utt3.mp4: 100%|██████████| 40/40 [00:02<00:00, 14.84it/s]
Processing dia34_utt5.mp4: 100%|██████████| 82/82 [00:06<00:00, 11.98it/s]
Processing dia34_utt11.mp4: 100%|██████████| 48/48 [00:03<00:00, 13.13it/s]
Processing dia41_utt9.

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (100,) + inhomogeneous part.

In [19]:
# Get the maximum number of frames across all videos
max_frames = 0
for probabilities in test_emotion_probabilities:
    if probabilities is not None:
        max_frames = max(max_frames, len(probabilities))

# Pad the arrays with zeros to make them all the same size
test_emotion_probabilities_padded = []
for probabilities in test_emotion_probabilities:
    if probabilities is None:
        # If probabilities is None, create a zero-filled array with the max number of frames
        test_emotion_probabilities_padded.append(np.zeros((max_frames, 5)))
    else:
        if len(probabilities) < max_frames:
            padded = np.pad(probabilities, ((0, max_frames - len(probabilities)), (0, 0)), mode='constant')
            test_emotion_probabilities_padded.append(padded)
        else:
            test_emotion_probabilities_padded.append(probabilities)

# Convert to a NumPy array
test_emotion_probabilities = np.array(test_emotion_probabilities_padded)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (100, 7) + inhomogeneous part.

In [18]:
emotion_labels = ['neutral', 'anger', 'joy', 'sadness', 'surprise']
test_predicted_labels = [emotion_labels[np.argmax(np.mean(probabilities, axis=0))] for probabilities in test_emotion_probabilities]

AxisError: axis 0 is out of bounds for array of dimension 0

In [9]:
all_preds = ["your_prediction" for i in df['Utterance_ID']]
all_ids = df["Sr No."]
submission_df = pd.DataFrame({
        'Sr No.': all_ids,
        'Emotion': all_preds
    })
    
# Save the DataFrame to CSV
submission_df.to_csv("submission.csv", index=False)