In [None]:
import os
import hashlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Hash a file's contents and return the hex digest.
def calculate_checksum(file_path, algo='md5', buf_size=8192):
    """Return the hexadecimal digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash object in pieces
    obtained by requesting *buf_size* bytes per read.

    Args:
        file_path: Path of the file to hash.
        algo: Algorithm name passed to ``hashlib.new``.
        buf_size: Number of bytes requested on each read.

    Returns:
        The digest as a string of hexadecimal digits.
    """
    digest = hashlib.new(algo)
    with open(file_path, 'rb') as stream:
        # The two-argument form of iter() stops at the first empty read,
        # which is how a binary file signals end-of-file.
        for block in iter(lambda: stream.read(buf_size), b''):
            digest.update(block)
    return digest.hexdigest()

# Gather (size, checksum) for every file below a folder.
def get_file_info(folder_path):
    """Map each file found under *folder_path* to its size and checksum.

    The tree is traversed with ``os.walk``. Each file is measured with
    ``os.path.getsize`` and hashed with ``calculate_checksum`` using that
    function's default arguments.

    Args:
        folder_path: Root of the directory tree to scan.

    Returns:
        A dict keyed by file path (the walked directory joined with the
        file name) whose values are ``(size, checksum)`` tuples.
    """
    # Lazily produce the full path of every file in the tree.
    every_path = (
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    )
    return {
        path: (os.path.getsize(path), calculate_checksum(path))
        for path in every_path
    }

# Group files by (size, checksum) and keep the groups with several members.
def find_copied_files(file_info):
    """Return the groups of files that share both size and checksum.

    Args:
        file_info: Dict mapping a file path to a ``(size, checksum)`` pair.

    Returns:
        A dict mapping each ``(size, checksum)`` tuple that occurs for more
        than one path to the list of those paths, in the order they appear
        in *file_info*.
    """
    groups = {}
    for path, signature in file_info.items():
        size, checksum = signature
        groups.setdefault((size, checksum), []).append(path)

    duplicates = {}
    for signature, members in groups.items():
        if len(members) >= 2:
            duplicates[signature] = members
    return duplicates

# Show a histogram of the sizes of the scanned files.
def plot_file_size_distribution(file_info):
    """Display a histogram (with a KDE overlay) of the file sizes.

    Args:
        file_info: Dict mapping a file path to a ``(size, checksum)`` pair;
            only the sizes are plotted.
    """
    sizes = [size for size, _checksum in file_info.values()]

    plt.figure(figsize=(12, 6))
    sns.histplot(sizes, bins=50, kde=True)

    # Labelling and grid, then render the figure.
    plt.title('File Size Distribution')
    plt.xlabel('File Size (bytes)')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

def plagiarism_check(folder_path):
    """Scan *folder_path* and report files that share size and checksum.

    The folder is scanned with ``get_file_info`` and grouped with
    ``find_copied_files``. Each group of matching files is printed. When no
    group is found, a message is printed and the file size distribution is
    plotted. When the scan finds no files at all, a message saying so is
    printed and nothing else happens.

    Args:
        folder_path: Root of the directory tree to check.

    Returns:
        None. Results are reported through ``print`` and a plot.
    """
    file_info = get_file_info(folder_path)
    if not file_info:
        # os.walk yields nothing for a missing path or a non-directory (for
        # example a zip archive that was never extracted). Say so instead of
        # reporting "no copies" for a scan that examined nothing.
        print(f"No files found under {folder_path!r}; nothing to check.")
        return

    copied_files = find_copied_files(file_info)

    if copied_files:
        print("Potential copied files found:")
        for (size, checksum), files in copied_files.items():
            print(f"\nFiles with size {size} bytes and checksum {checksum}:")
            for file in files:
                print(f"  - {file}")
    else:
        print("No potential copied files detected.")
        # NOTE(review): the plot is drawn only on this branch, i.e. when no
        # copies were found. Confirm whether it should be shown in both cases.
        plot_file_size_distribution(file_info)

# Example usage: scan the 'Assignments' folder and print likely copies.
# Replace the value with the path of the folder to scan. It must be a
# directory (e.g. the extracted contents of the Google Drive zip, not the
# zip file itself), because get_file_info traverses it with os.walk.
folder_path = 'Assignments'
plagiarism_check(folder_path)