### <center> Algorithm Steps </center>

- **Loaded the data**: Started by loading the student data from a CSV file into a Pandas DataFrame.
- **Displayed the original records**: Printed out the original student records to ensure the data was loaded correctly.
- **Sorted the data**: Sorted the DataFrame by profession in ascending order and by score in descending order.
- **Displayed the sorted data**: Printed the sorted student data to see how it was organized.
- **Initialized the groups**: Decided to create three groups and set up empty lists for each one.
- **Distributed the students**: For each unique profession, filtered the students and distributed them into the groups using a round-robin method.
- **Calculated the averages**: Created a function to calculate the average score for each group.
- **Calculated the standard deviation**: Found the initial standard deviation of the group averages to check how spread out they were.
- **Displayed the initial distribution**: Printed the initial distribution of students in each group, along with their average scores and the standard deviation.
- **Swapped students**: Defined a function that tries to reduce the standard deviation of the group averages by swapping a single pair of students between two groups.
  - Identified which group had the highest average score and which one had the lowest.
  - Swapped the highest-scoring student from the high group with the lowest-scoring student from the low group, focusing on students with the same profession.
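- **Attempted improvement**: Called the swap function only when the initial standard deviation was greater than 2, to see if it improved the group averages and standard deviation; otherwise the initial distribution was kept as it was.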
- **Displayed the updated distribution**: Printed the updated distribution of students in each group and the new standard deviation.
- **Compared the results**: Showed both the initial and final standard deviations to see how much improvement was made.


In [14]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [22]:
# Helpers and the per-file entry point, so the same grouping logic is reused for every CSV file
def _calculate_group_averages(groups):
    """Return the mean score of each group.

    An empty group yields NaN instead of raising, so a file with fewer
    students than groups can still be processed.
    """
    return [
        pd.Series([student['score'] for student in group], dtype=float).mean()
        for group in groups
    ]


def _std_of_averages(group_averages):
    """Return the population standard deviation of the non-NaN averages."""
    populated = [avg for avg in group_averages if not np.isnan(avg)]
    return float(np.std(populated)) if populated else 0.0


def _swap_students(groups, group_averages, current_std_dev):
    """Try one same-profession swap between the best and the worst group.

    For each profession present in the highest-average group (in order of
    appearance), the highest scorer of that group is exchanged with the
    lowest scorer of the same profession in the lowest-average group. The
    first exchange that strictly lowers the standard deviation is applied
    to ``groups`` in place.

    Returns:
        ``(new_std_dev, new_group_averages)`` when a swap was applied,
        otherwise ``(None, group_averages)`` with ``groups`` untouched.
    """
    # Empty groups (NaN average) cannot take part in a swap.
    populated = [i for i, avg in enumerate(group_averages) if not np.isnan(avg)]
    if len(populated) < 2:
        return None, group_averages

    high_idx = max(populated, key=lambda i: group_averages[i])
    low_idx = min(populated, key=lambda i: group_averages[i])
    if high_idx == low_idx:
        return None, group_averages

    high_group = groups[high_idx]
    low_group = groups[low_idx]

    # dict.fromkeys keeps first-appearance order while removing duplicates.
    for profession in dict.fromkeys(s['profession'] for s in high_group):
        high_positions = [p for p, s in enumerate(high_group) if s['profession'] == profession]
        low_positions = [p for p, s in enumerate(low_group) if s['profession'] == profession]
        if not low_positions:
            continue

        # Pick by score explicitly instead of relying on list order; ties
        # resolve to the first highest and the last lowest student.
        high_pos = max(high_positions, key=lambda p: high_group[p]['score'])
        low_pos = min(reversed(low_positions), key=lambda p: low_group[p]['score'])
        high_student = high_group[high_pos]
        low_student = low_group[low_pos]

        # Build the candidate groups by position, so rows that happen to
        # share a userId are never dropped together.
        trial_high = high_group[:high_pos] + high_group[high_pos + 1:] + [low_student]
        trial_low = low_group[:low_pos] + low_group[low_pos + 1:] + [high_student]
        trial_groups = list(groups)
        trial_groups[high_idx] = trial_high
        trial_groups[low_idx] = trial_low

        trial_averages = _calculate_group_averages(trial_groups)
        trial_std_dev = _std_of_averages(trial_averages)

        # Commit only a swap that actually narrows the spread.
        if trial_std_dev < current_std_dev:
            print(f"\nSwapping UserId {high_student['userId']} with UserId {low_student['userId']}")
            groups[high_idx] = trial_high
            groups[low_idx] = trial_low
            return trial_std_dev, trial_averages

    return None, group_averages


def process_file(file_path, k=3, std_dev_threshold=2):
    """Split the students of one CSV file into ``k`` groups and report the result.

    The CSV must provide the columns ``userId``, ``score`` and ``profession``.
    Students are sorted by profession (ascending) and score (descending),
    dealt round-robin into the groups profession by profession, and then a
    single same-profession swap between the highest- and lowest-average
    groups is attempted if the averages are spread too far apart.
    Everything is printed; nothing is returned.

    Args:
        file_path: Path of the CSV file to read.
        k: Number of groups to create (default 3).
        std_dev_threshold: A swap is attempted only when the standard
            deviation of the group averages exceeds this value (default 2).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Loading the CSV file into a pandas DataFrame
    students_df = pd.read_csv(file_path)

    # Displaying the original records
    print("\nOriginal Student Data:")
    print(students_df.to_string(index=False))

    # Step 1: Sorting students by profession and score
    students_sorted = students_df.sort_values(by=['profession', 'score'], ascending=[True, False])

    # Displaying the sorted data
    print("\nSorted Student Data:")
    print(students_sorted[['userId', 'score', 'profession']].to_string(index=False))

    # Step 2: Dealing each profession's students round-robin into the groups
    groups = [[] for _ in range(k)]
    for profession in students_sorted['profession'].unique():
        profession_group = students_sorted[students_sorted['profession'] == profession]
        for i, (_, student) in enumerate(profession_group.iterrows()):
            groups[i % k].append(student)

    # Step 3: Average score per group and the spread of those averages
    group_averages = _calculate_group_averages(groups)
    initial_std_dev = _std_of_averages(group_averages)

    # Printing the initial group distribution
    print("\nInitial Group Distribution:")
    for i, group in enumerate(groups):
        print(f"\nGroup {i+1}:")
        for student in group:
            print(f"UserId: {student['userId']}, Score: {student['score']}, Profession: {student['profession']}")
        print(f"Average Score: {group_averages[i]:.2f}")

    print(f"\n Initial Standard Deviation of Group Averages: {initial_std_dev:.2f}")

    # Steps 4-5: Attempt a swap only when the groups are not balanced enough
    final_std_dev = initial_std_dev
    if initial_std_dev <= std_dev_threshold:
        print(f"\nInitial Standard Deviation ({initial_std_dev:.2f}) is already less than or equal to {std_dev_threshold}. No swap performed.")
    else:
        swapped_std_dev, final_group_averages = _swap_students(groups, group_averages, initial_std_dev)

        if swapped_std_dev is None:
            print("\nNo improvements were made.")
        else:
            final_std_dev = swapped_std_dev

            # Printing the updated group distribution after swap
            print("\nUpdated Group Distribution After Swap:")
            for i, group in enumerate(groups):
                print(f"\nGroup {i + 1}:")
                for student in group:
                    print(f"UserId: {student['userId']}, Score: {student['score']}, Profession: {student['profession']}")
                print(f"Updated Average Score: {final_group_averages[i]:.2f}")

            print(f"\nFinal Standard Deviation of Group Averages: {final_std_dev:.2f}")

    # Displaying both initial and updated standard deviations
    print(f"\nInitial Standard Deviation: {initial_std_dev:.2f}")
    print(f"Final Standard Deviation: {final_std_dev:.2f}")
    print("\n ******************************************************************* \n")

# List of file paths for all CSV files
file_paths = [
    r'D:\Naila Task\Tuesday 22-Oct-2024\table1.csv',
    r'D:\Naila Task\Tuesday 22-Oct-2024\table2.csv',
    r'D:\Naila Task\Tuesday 22-Oct-2024\table3.csv',
    r'D:\Naila Task\Tuesday 22-Oct-2024\table4.csv',
    r'D:\Naila Task\Tuesday 22-Oct-2024\table5.csv',
    r'D:\Naila Task\Tuesday 22-Oct-2024\table6.csv'
]

# Looping through each file and processing it. A missing, unreadable or
# empty file is reported and skipped so that one bad input does not abort
# the rest of the batch.
for file_path in file_paths:
    print(f"\nProcessing file: {os.path.basename(file_path)}")
    try:
        process_file(file_path)
    except (OSError, pd.errors.EmptyDataError) as exc:
        print(f"Skipping {file_path}: {exc}")



Processing file: table1.csv

Original Student Data:
 Unnamed: 0  userId  score profession
          0       1    100   Engineer
          1       2    100   Engineer
          2       3    100   Engineer
          3       4    100     Doctor
          4       5    100     Doctor
          5       6     85     Doctor
          6       7     70     Lawyer
          7       8     75     Lawyer
          8       9     60     Lawyer
          9      10     65   Engineer
         10      11     55   Engineer
         11      12     50     Doctor
         12      13     45     Lawyer
         13      14     40     Doctor
         14      15     35     Lawyer
         15      16     30   Engineer
         16      17     25     Doctor
         17      18     20     Lawyer

Sorted Student Data:
 userId  score profession
      4    100     Doctor
      5    100     Doctor
      6     85     Doctor
     12     50     Doctor
     14     40     Doctor
     17     25     Doctor
      1    100   Engi