In [12]:
import os

import pandas as pd

# Input data, written one monitor per row and transposed into the
# column-oriented {column name: list of values} layout.
# NOTE(review): no visible cell reads `data`; raw_df is loaded from a CSV
# instead -- confirm whether this dict is still needed.
data = {
    column: list(values)
    for column, values in zip(
        (
            'Refresh Rate (Hz)',
            'Resolution (Pixel)',
            'Screen Type',
            'Size (Inch)',
            'Weight (kg)',
            'Guarantee (Years)',
            'Price (Rp.)',
            'Features',  # This column is already scaled
        ),
        zip(
            (360.00, "Full HD (1920 x 1080)", 'TN', 24, 4, 5, 1500000.00, 2),     # Monitor A
            (180.00, "Ultra HD (2560 x 1440)", 'IPS', 24, 5, 5, 1800000.00, 2),   # Monitor B
            (165.00, "Full HD (1920 x 1080)", 'IPS', 27, 7, 2, 1800000.00, 4),    # Monitor C
            (240.00, "Full HD (1920 x 1080)", 'OLED', 24, 5, 3, 1900000.00, 3),   # Monitor D
            (75.00, "Ultra HD (2560 x 1440)", 'OLED', 32, 9, 2, 3400000.00, 3),   # Monitor E
            (180.00, "Ultra HD (2560 x 1440)", 'OLED', 27, 7, 5, 2400000.00, 4),  # Monitor F
        ),
    )
}


In [13]:
# Location of the raw CSV. Set the SPPK_RAW_DATA_PATH environment variable to
# run the notebook on another machine; when it is unset, the original
# hard-coded local path is used so existing setups keep working.
RAW_DATA_PATH = os.environ.get(
    'SPPK_RAW_DATA_PATH',
    r'D:\Programming\Web Dev\SPPK\data_raw copy.csv',
)
raw_df = pd.read_csv(RAW_DATA_PATH)

In [14]:
# Show the loaded raw data as this cell's output for a quick visual check.
raw_df

Unnamed: 0,Alternatives,Refresh Rate,Resolution,Screen type,Size,Weight,Price,Warranty,Features
0,Monitor A,360,Full HD (1920 x 1080),TN,24,4,1500000,5,2
1,Monitor B,180,Ultra HD (2560 x 1440),IPS,24,5,1800000,5,2
2,Monitor C,165,Full HD (1920 x 1080),IPS,27,7,1800000,2,4
3,Monitor D,240,Full HD (1920 x 1080),OLED,24,5,1900000,3,3
4,Monitor E,75,Ultra HD (2560 x 1440),OLED,32,9,3400000,2,3
5,Monitor F,180,Ultra HD (2560 x 1440),OLED,27,7,2400000,5,4


In [None]:
# Default scoring rules, keyed by column name. Tuple keys are half-open numeric
# intervals [lower, upper) mapped to a score; any other key is an exact-match
# lookup value mapped to a score.
DEFAULT_CATEGORIZATION_RULES = {
    'Refresh Rate': {
        (0, 76): 1,
        (76, 145): 2,
        (145, 241): 3,
        (241, 361): 4,
        (361, float('inf')): 5
    },
    'Resolution': {
        "HD (1280 x 720)": 1,
        "Full HD (1920 x 1080)": 2,
        "Ultra HD (2560 x 1440)": 3,
        "Quad HD (3840 x 2160)": 4,
        "8K (7680 x 4320)": 5
    },
    'Screen type': {
        'OLED': 5,
        'IPS': 3,
        'TN': 1,
        'VA': 1,
    },
    'Size': {
        (0, 20): 1,
        (20, 25): 2,
        (25, 30): 3,
        (30, 35): 4,
        (35, float('inf')): 5
    },
    'Weight': {
        (0, 5): 1,
        (5, 7): 2,
        (7, 9): 3,
        (9, 11): 4,
        (11, float('inf')): 5
    },
    'Price': {
        (0, 2000001): 1,
        (2000001, 4000001): 2,
        (4000001, 6000001): 3,
        (6000001, 8000001): 4,
        (8000001, float('inf')): 5
    },
    'Warranty': {
        (0, 2): 1,
        (2, 4): 2,
        (4, 6): 3,
        (6, 8): 4,
        (8, float('inf')): 5
    },
}


def _interval_bins_and_labels(column, rules):
    """Turn {(lower, upper): label} rules into sorted pd.cut bin edges and labels."""
    # Sort the intervals themselves so the result does not depend on the order
    # in which the rules were written in the dict.
    intervals = sorted(rules)

    # pd.cut needs one shared edge between neighbouring bins, so a gap or an
    # overlap cannot be represented and would silently mis-bin values.
    for (_, prev_upper), (next_lower, _) in zip(intervals, intervals[1:]):
        if prev_upper != next_lower:
            raise ValueError(
                f"Intervals for column {column!r} must be contiguous: "
                f"{prev_upper!r} does not meet {next_lower!r}"
            )

    bins = [lower for lower, _ in intervals] + [intervals[-1][1]]
    labels = [rules[interval] for interval in intervals]
    return bins, labels


def create_categorical_dataframe(df, categorization_rules=None):
    """
    Converts a raw DataFrame into a categorical DataFrame based on specified rules.

    Args:
        df (pandas.DataFrame): The raw data. It must contain an 'Alternatives'
            column and every column named in the rules. A 'Features' column,
            if present, is copied through unchanged.
        categorization_rules (dict, optional): Maps a column name to its rules.
            Rules whose keys are ``(lower, upper)`` tuples bin numeric values
            into half-open intervals ``[lower, upper)``; any other rules are
            applied as an exact-match lookup. Defaults to
            ``DEFAULT_CATEGORIZATION_RULES``.

    Returns:
        pandas.DataFrame: The new DataFrame with 'Alternatives', one column per
        rule set (in rule order) and, when available, 'Features'. Binned
        columns are produced by ``pd.cut`` and therefore have categorical
        dtype. Values outside every interval, or missing from a lookup,
        come back as NaN.

    Raises:
        KeyError: If ``df`` lacks 'Alternatives' or a column named in the rules.
        ValueError: If a column's rules are empty or its intervals are not
            contiguous.
    """
    if categorization_rules is None:
        categorization_rules = DEFAULT_CATEGORIZATION_RULES

    # 'Alternatives' identifies each row and is carried over without scoring.
    categorical_df = pd.DataFrame({'Alternatives': df['Alternatives']})

    for column, rules in categorization_rules.items():
        if not rules:
            raise ValueError(f"No categorization rules given for column {column!r}")

        # The type of the first key decides between binning and direct lookup.
        if isinstance(next(iter(rules)), tuple):
            bins, labels = _interval_bins_and_labels(column, rules)
            # right=False makes each bin include its lower edge only.
            categorical_df[column] = pd.cut(df[column], bins=bins, labels=labels, right=False)
        else:
            categorical_df[column] = df[column].map(rules)

    # Add the 'Features' column which is already scaled and doesn't need categorization
    if 'Features' in df.columns and 'Features' not in categorical_df.columns:
        categorical_df['Features'] = df['Features']

    return categorical_df


In [16]:
# Score the raw data with the categorization rules.
# NOTE(review): the saved output below lists Weight = 2 for Monitor A (4 kg), but
# the current (0, 5) -> 1 weight rule gives 1, and the function cell is marked
# In [None]; re-run the notebook from a fresh kernel to refresh this output.
categorical_result_df = raw_df.pipe(create_categorical_dataframe)

categorical_result_df

Unnamed: 0,Alternatives,Refresh Rate,Resolution,Screen type,Size,Weight,Price,Warranty,Features
0,Monitor A,4,2,1,2,2,1,3,2
1,Monitor B,3,3,3,2,2,1,3,2
2,Monitor C,3,2,3,3,3,1,2,4
3,Monitor D,3,2,5,2,2,1,2,3
4,Monitor E,1,3,5,4,4,2,2,3
5,Monitor F,3,3,5,3,3,2,3,4
