In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import face_recognition

VIDEO_FOLDER = r"C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos"  # Path to videos folder
DATASET_PATH = r"C:\Users\cheta\Downloads\Assignment Data - Sheet1.csv"  # Path to dataset given


In [None]:
# Loading performance data
def load_performance_data():
    """Load the dataset with performance scores."""
    data = pd.read_csv(DATASET_PATH)
    print(f"Loaded dataset with {len(data)} rows.")
    print(f"Columns in dataset: {data.columns.tolist()}")
    return data

performance_data = load_performance_data()
performance_data.head()


Loaded dataset with 268 rows.
Columns in dataset: ['Performance', 'Video URL']


Unnamed: 0,Performance,Video URL
0,1.106,https://fgimagestorage.blob.core.windows.net/f...
1,2.2447,https://fgimagestorage.blob.core.windows.net/f...
2,2.0126,https://fgimagestorage.blob.core.windows.net/f...
3,1.7708,https://fgimagestorage.blob.core.windows.net/f...
4,0.6293,https://fgimagestorage.blob.core.windows.net/f...


In [None]:
# Function to extract face encodings from a videos
def extract_faces_from_video(video_path):
    """Extract face encodings from a video."""
    face_encodings = []
    video_capture = cv2.VideoCapture(video_path)
    
    while video_capture.isOpened():
        ret, frame = video_capture.read()
        if not ret:
            break

        # Resizing frame for faster processing
        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
        rgb_frame = small_frame[:, :, ::-1]  # Convert BGR to RGB

        # Detecting face locations and encodings
        face_locations = face_recognition.face_locations(rgb_frame)
        encodings = face_recognition.face_encodings(rgb_frame, face_locations)
        face_encodings.extend(encodings)

    video_capture.release()
    return face_encodings


In [None]:
# Identifying unique influencers
def identify_unique_influencers():
    """
    Process videos and identify unique influencers.
    Groups the same type of influencer across videos and assigns a unique ID.
    """
    known_faces = []  # List to store unique face encodings
    influencer_map = {}  # Maps video index to identified influencer IDs

    for index, row in performance_data.iterrows():
        video_path = os.path.join(VIDEO_FOLDER, f"{index}.mp4")
        if not os.path.exists(video_path):
            print(f"Video not found: {video_path}")
            continue

        print(f"Processing video {index}: {video_path}")
        video_face_encodings = extract_faces_from_video(video_path)

        for encoding in video_face_encodings:
            # Check if this face matches any known faces
            matches = face_recognition.compare_faces(known_faces, encoding, tolerance=0.6)

            if True in matches:
                # If a match is found, get the influencer ID
                influencer_id = matches.index(True)
            else:
                # If no match is found, assign a new ID and store the encoding
                influencer_id = len(known_faces)
                known_faces.append(encoding)

            # Map this influencer ID to the current video
            if index not in influencer_map:
                influencer_map[index] = []
            influencer_map[index].append(influencer_id)

    print(f"Total unique influencers identified: {len(known_faces)}")
    return influencer_map, known_faces

influencer_map, known_faces = identify_unique_influencers()


Processing video 0: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\0.mp4
Processing video 1: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\1.mp4
Processing video 2: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\2.mp4
Processing video 3: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\3.mp4
Processing video 4: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\4.mp4
Processing video 5: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\5.mp4
Processing video 6: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\6.mp4
Processing video 7: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\7.mp4
Processing video 8: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\8.mp4
Processing video 9: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\9.mp4
Processing video 10: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\10.mp4
Processing video 11: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\11.mp4
Processing video 12: C:\Users\cheta\OneDrive\Desktop\Prog\pyth\videos\12.mp4
Processing video 13

In [None]:
# Verification To Inspect the influencer map
print("\nMapping of video indices to influencer IDs:")
for video_index, influencer_ids in influencer_map.items():
    print(f"Video {video_index}: Influencer IDs {influencer_ids}")



Mapping of video indices to influencer IDs:
Video 2: Influencer IDs [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Video 3: Influencer IDs [1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Video 11: Influencer IDs [1, 1, 1, 4, 1, 1, 4, 1, 4, 1, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 4, 4, 0, 1, 4, 

In [None]:
# Calculating Average Performance for Each Influencer

def calculate_average_performance():
    influencer_performance = {}

    for video_index, influencer_ids in influencer_map.items():
        performance_score = performance_data.loc[video_index, 'Performance']
        
        for influencer_id in influencer_ids:
            if influencer_id not in influencer_performance:
                influencer_performance[influencer_id] = []
            influencer_performance[influencer_id].append(performance_score)
    
    average_performance = {
        influencer_id: np.mean(scores)
        for influencer_id, scores in influencer_performance.items()
    }

    return average_performance

# Computing average performance for influencer
average_performance = calculate_average_performance()
print("\nAverage performance per influencer calculated successfully.")



Average performance per influencer calculated successfully.


In [None]:
# Listing All Unique Influencers by Average Performance

def list_all_influencers(average_performance):
    # Sorting influencers by average performance in descending order
    ranked_influencers = sorted(
        average_performance.items(),
        key=lambda item: item[1], 
        reverse=True               
    )
    return ranked_influencers

ranked_influencers = list_all_influencers(average_performance)

# Display all unique influencers with their average performance
print("\nList of All Unique Influencers by Average Performance:")
for rank, (influencer_id, avg_performance) in enumerate(ranked_influencers, start=1):
    print(f"Rank {rank}: Influencer {influencer_id} with Average Performance {avg_performance:.2f}")



List of All Unique Influencers by Average Performance:
Rank 1: Influencer 7 with Average Performance 1.59
Rank 2: Influencer 6 with Average Performance 1.37
Rank 3: Influencer 207 with Average Performance 1.23
Rank 4: Influencer 50 with Average Performance 1.07
Rank 5: Influencer 97 with Average Performance 0.96
Rank 6: Influencer 0 with Average Performance 0.91
Rank 7: Influencer 1 with Average Performance 0.90
Rank 8: Influencer 2 with Average Performance 0.89
Rank 9: Influencer 82 with Average Performance 0.84
Rank 10: Influencer 5 with Average Performance 0.77
Rank 11: Influencer 30 with Average Performance 0.76
Rank 12: Influencer 251 with Average Performance 0.75
Rank 13: Influencer 57 with Average Performance 0.73
Rank 14: Influencer 11 with Average Performance 0.70
Rank 15: Influencer 15 with Average Performance 0.63
Rank 16: Influencer 146 with Average Performance 0.57
Rank 17: Influencer 26 with Average Performance 0.58
Rank 18: Influencer 32 with Average Performance 0.47
Ra

In [None]:
from scipy.stats import pearsonr

def calculate_test_retest_reliability():
    reliability_scores = []

    for influencer_id, scores in influencer_performance.items():
        # Split into odd and even indexed scores
        odd_scores = scores[::2]
        even_scores = scores[1::2]
        
        # Truncate the longer list to match the shorter one's length
        min_length = min(len(odd_scores), len(even_scores))
        if min_length > 1:  # Ensure at least two data points for correlation
            odd_scores = odd_scores[:min_length]
            even_scores = even_scores[:min_length]
            
            # Check if inputs are constant
            if np.std(odd_scores) == 0 or np.std(even_scores) == 0:
                reliability = 0  # Assign 0 if one of the arrays is constant
            else:
                reliability, _ = pearsonr(odd_scores, even_scores)  # Pearson's correlation
        else:
            reliability = 0  # Assign 0 if not enough data for correlation
        
        avg_performance = np.mean(scores)
        reliability_scores.append((influencer_id, avg_performance, reliability))
    
    return reliability_scores

# Ranking influencers based on average performance and test-retest reliability
reliability_stats = calculate_test_retest_reliability()

# Sorting by average performance (descending) and reliability (descending)
reliability_stats_sorted = sorted(
    reliability_stats, key=lambda x: (-x[1], -x[2])
)

# Output top influencers based on test-retest reliability
for influencer_id, avg_performance, reliability in reliability_stats_sorted:
    print(f"Influencer: {influencer_id}, Avg Performance: {avg_performance:.2f}, Reliability: {reliability:.2f}")

Influencer: 7, Avg Performance: 1.59, Reliability: 0.98
Influencer: 6, Avg Performance: 1.37, Reliability: 0.94
Influencer: 207, Avg Performance: 1.23, Reliability: 0.8
Influencer: 50, Avg Performance: 1.07, Reliability: 0.97
Influencer: 97, Avg Performance: 0.96, Reliability: 0.96
Influencer: 0, Avg Performance: 0.91, Reliability: 0.99
Influencer: 1, Avg Performance: 0.90, Reliability: 0.98
Influencer: 2, Avg Performance: 0.89, Reliability: 0.68
Influencer: 82, Avg Performance: 0.84, Reliability: 1.00
Influencer: 5, Avg Performance: 0.77, Reliability: 0.81
Influencer: 30, Avg Performance: 0.76, Reliability: 0.00
Influencer: 251, Avg Performance: 0.75, Reliability: 0.58
Influencer: 57, Avg Performance: 0.73, Reliability: 0.99
Influencer: 11, Avg Performance: 0.70, Reliability: 0.60
Influencer: 15, Avg Performance: 0.63, Reliability: 1.00
Influencer: 146, Avg Performance: 0.57, Reliability: 0.91
Influencer: 26, Avg Performance: 0.53, Reliability: 0.94
Influencer: 32, Avg Performance: 0.

In [36]:
from scipy.stats import pearsonr

# Calculate test-retest reliability for each influencer
reliability_scores = {}

for influencer_id, scores in influencer_performance.items():
    # Split scores into odd-indexed and even-indexed scores
    odd_scores = scores[::2]
    even_scores = scores[1::2]
    
    # Ensure both lists have the same length by truncating the longer list
    min_length = min(len(odd_scores), len(even_scores))
    odd_scores = odd_scores[:min_length]
    even_scores = even_scores[:min_length]
    
    # Only calculate reliability if there are at least 2 pairs of scores
    if len(odd_scores) > 1:
        # Check for constant input arrays
        if len(set(odd_scores)) == 1 or len(set(even_scores)) == 1:
            reliability = 0  # Assign 0 if either list is constant
        else:
            reliability, _ = pearsonr(odd_scores, even_scores)  # Pearson's correlation
    else:
        reliability = 0  # Assign 0 if not enough data for correlation
    
    reliability_scores[influencer_id] = reliability

print("Reliability scores calculated.")

Reliability scores calculated.


In [None]:
import csv
import pandas as pd

unique_influencer_data = []

for influencer_id, avg_performance in average_performance.items():
    # Get the first video URL for this influencer
    first_video_index = next(
        index for index, influencers in influencer_map.items() if influencer_id in influencers
    )
    first_video_url = performance_data.loc[first_video_index, 'Video URL']
    
    # Get reliability score for this influencer
    reliability_score = reliability_scores.get(influencer_id, 0)  # Default to 0 if not available
    
    unique_influencer_data.append((influencer_id, avg_performance, reliability_score, first_video_url))

# Sort the data by descending average performance, then by descending reliability
unique_influencer_data.sort(key=lambda x: (-x[1], -x[2]))

# Saving the data to a CSV file
sorted_unique_influencers_csv = "sorted_unique_influencers_with_reliability.csv"

with open(sorted_unique_influencers_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Influencer ID", "Average Performance", "Reliability", "First Video URL"])
    for influencer_id, avg_performance, reliability_score, first_video_url in unique_influencer_data:
        writer.writerow([influencer_id, avg_performance, reliability_score, first_video_url])

print(f"\nSorted unique influencers with reliability and first video URL saved to {sorted_unique_influencers_csv}")



Sorted unique influencers with reliability and first video URL saved to sorted_unique_influencers_with_reliability.csv
