In [50]:
import pandas as pd 
import numpy as np

In [51]:
# Read the blend dataset from new_iso.csv (path is relative to the working directory).
df = pd.read_csv('new_iso.csv')

In [52]:
# Preview the first 10 rows of the loaded data.
df.head(10)

Unnamed: 0,Original Blend Name,Blend Number,PC,GPP,SC,CC,FA,VA,GBA,FBC,QUARTZ,PLC,Cumulative Heat (J/g),Unnamed: 13
0,100% PC,0,100,0,0,0,0,0,0,0,0,0,323.16,
1,80% PC + 20% GPP,1,80,20,0,0,0,0,0,0,0,0,297.77,
2,70% PC + 30% GPP,2,70,30,0,0,0,0,0,0,0,0,311.4,
3,60% PC + 40% GPP,3,60,40,0,0,0,0,0,0,0,0,265.35,
4,50% PC + 50% GPP,4,50,50,0,0,0,0,0,0,0,0,225.05,
5,40% PC + 60% GPP,5,40,60,0,0,0,0,0,0,0,0,216.62,
6,30% PC + 70% GPP,6,30,70,0,0,0,0,0,0,0,0,160.64,
7,40% GPP + 60% SC,7,0,40,60,0,0,0,0,0,0,0,11.24,
8,10% PC + 40% GPP + 50% SC,8,10,40,50,0,0,0,0,0,0,0,115.7,
9,20% PC + 40% GPP + 40% SC,9,20,40,40,0,0,0,0,0,0,0,149.98,


In [53]:
# List the column labels of the loaded frame.
print(df.columns)

Index(['Original Blend Name', 'Blend Number', 'PC', 'GPP', 'SC', 'CC', 'FA',
       'VA', 'GBA', 'FBC', 'QUARTZ', 'PLC', 'Cumulative Heat (J/g)',
       'Unnamed: 13'],
      dtype='object')


In [54]:
# Drop the 'Unnamed: 13' column (empty in the rows previewed so far) and keep
# the result as df1; df itself is left unchanged.
df1 = df.drop(columns=['Unnamed: 13'])
df1.head()

Unnamed: 0,Original Blend Name,Blend Number,PC,GPP,SC,CC,FA,VA,GBA,FBC,QUARTZ,PLC,Cumulative Heat (J/g)
0,100% PC,0,100,0,0,0,0,0,0,0,0,0,323.16
1,80% PC + 20% GPP,1,80,20,0,0,0,0,0,0,0,0,297.77
2,70% PC + 30% GPP,2,70,30,0,0,0,0,0,0,0,0,311.4
3,60% PC + 40% GPP,3,60,40,0,0,0,0,0,0,0,0,265.35
4,50% PC + 50% GPP,4,50,50,0,0,0,0,0,0,0,0,225.05


In [55]:
def classify_blend_type(blend_name):
    """Label a blend by how many '+'-separated parts its name contains.

    Two parts give 'Binary', three give 'Ternary', and every other count
    (including a name with no '+' at all) gives 'Other'.
    """
    labels_by_part_count = {2: 'Binary', 3: 'Ternary'}
    # N separators always delimit N + 1 parts.
    part_count = blend_name.count('+') + 1
    return labels_by_part_count.get(part_count, 'Other')

# Add a 'Blend Type' column to df1 by classifying each row's 'Original Blend Name'.
df1['Blend Type'] = df1['Original Blend Name'].apply(classify_blend_type)

In [56]:
# Preview the first 10 rows, now including the 'Blend Type' column.
df1.head(10)

Unnamed: 0,Original Blend Name,Blend Number,PC,GPP,SC,CC,FA,VA,GBA,FBC,QUARTZ,PLC,Cumulative Heat (J/g),Blend Type
0,100% PC,0,100,0,0,0,0,0,0,0,0,0,323.16,Other
1,80% PC + 20% GPP,1,80,20,0,0,0,0,0,0,0,0,297.77,Binary
2,70% PC + 30% GPP,2,70,30,0,0,0,0,0,0,0,0,311.4,Binary
3,60% PC + 40% GPP,3,60,40,0,0,0,0,0,0,0,0,265.35,Binary
4,50% PC + 50% GPP,4,50,50,0,0,0,0,0,0,0,0,225.05,Binary
5,40% PC + 60% GPP,5,40,60,0,0,0,0,0,0,0,0,216.62,Binary
6,30% PC + 70% GPP,6,30,70,0,0,0,0,0,0,0,0,160.64,Binary
7,40% GPP + 60% SC,7,0,40,60,0,0,0,0,0,0,0,11.24,Binary
8,10% PC + 40% GPP + 50% SC,8,10,40,50,0,0,0,0,0,0,0,115.7,Ternary
9,20% PC + 40% GPP + 40% SC,9,20,40,40,0,0,0,0,0,0,0,149.98,Ternary


In [57]:
# Count the non-null blend names in each 'Blend Type' category.
print(df1.groupby("Blend Type")['Original Blend Name'].count())

Blend Type
Binary     23
Other       2
Ternary    67
Name: Original Blend Name, dtype: int64


In [58]:
import regex as re

# Function to extract PC or PLC percentage and calculate replacement level
def calculate_replacement_level_from_name(blend_name):
    """Return the replacement level (100 minus the PC/PLC percentage) of a blend.

    Parameters
    ----------
    blend_name : str
        Blend description such as "80% PC + 20% GPP".

    Returns
    -------
    int or None
        ``100 - p`` where ``p`` is taken from the first "<p>% PC" or
        "<p>% PLC" term in the name (case-insensitive). ``None`` when the
        name has no such term, or when ``blend_name`` is not a string.
    """
    # A blank cell reaches this function as float NaN (or None); re.search
    # would raise TypeError on it, so treat it like "no PC/PLC mentioned".
    if not isinstance(blend_name, str):
        return None
    # The trailing \b keeps a longer material code that merely starts with
    # "PC" or "PLC" from being counted as cement.
    pc_plc_match = re.search(r'(\d+)%\s*(?:PC|PLC)\b', blend_name, re.IGNORECASE)
    if pc_plc_match is None:
        return None
    return 100 - int(pc_plc_match.group(1))


# Add a 'Replacement Level' column to df1, computed from each row's 'Original Blend Name'.
df1['Replacement Level'] = df1['Original Blend Name'].apply(calculate_replacement_level_from_name)

In [59]:
# Preview the first 5 rows, now including the 'Replacement Level' column.
df1.head()

Unnamed: 0,Original Blend Name,Blend Number,PC,GPP,SC,CC,FA,VA,GBA,FBC,QUARTZ,PLC,Cumulative Heat (J/g),Blend Type,Replacement Level
0,100% PC,0,100,0,0,0,0,0,0,0,0,0,323.16,Other,0.0
1,80% PC + 20% GPP,1,80,20,0,0,0,0,0,0,0,0,297.77,Binary,20.0
2,70% PC + 30% GPP,2,70,30,0,0,0,0,0,0,0,0,311.4,Binary,30.0
3,60% PC + 40% GPP,3,60,40,0,0,0,0,0,0,0,0,265.35,Binary,40.0
4,50% PC + 50% GPP,4,50,50,0,0,0,0,0,0,0,0,225.05,Binary,50.0


In [62]:
# Save the augmented table to new_iso1.csv, leaving out the row index.
df1.to_csv("new_iso1.csv",index=False)

In [60]:
# Count the non-null blend names for each distinct value of the 'PC' column.
print(df1.groupby(['PC'])['Original Blend Name'].count())

PC
0      43
10      1
20      1
30     28
40      6
50      6
60      2
70      2
80      2
100     1
Name: Original Blend Name, dtype: int64


In [61]:
# Count the non-null blend names for each distinct value of the 'PLC' column.
print(df1.groupby(['PLC'])['Original Blend Name'].count())

PLC
0      50
30     23
40      4
50      6
60      2
70      2
80      2
90      2
100     1
Name: Original Blend Name, dtype: int64


In [49]:
def grouping_fu(cols):
    """Print, for each name in ``cols``, the list of pieces obtained by splitting it on '+'.

    Pieces are printed exactly as split (surrounding spaces are kept).
    Nothing is returned.
    """
    for blend_name in cols:
        pieces = blend_name.split("+")
        print(pieces)

# Print the '+'-split pieces of every blend name in df.
# NOTE(review): the saved output lists the 10 sample names built in the In [48]
# cell, so `df` was that sample frame when this ran; on a top-to-bottom run
# `df` is the frame read from new_iso.csv and every CSV row is printed instead.
grouping_fu(df['Original Blend Name'])

['100% PC']
['80% PC ', ' 20% GPP']
['70% PC ', ' 30% GPP']
['60% PC ', ' 40% GPP']
['50% PC ', ' 50% GPP']
['40% PC ', ' 60% GPP']
['30% PC ', ' 70% GPP']
['80% PC ', ' 20% QUARTZ']
['60% PLC ', ' 40% GPP']
['100% PLC']


In [47]:
import pandas as pd
import re

# 1. Build a small sample DataFrame of blend names for trying out the parser.
blend_data = [
    "100% PC",
    "80% PC + 20% GPP",
    "70% PC + 30% GPP",
    "60% PC + 40% GPP",
    "50% PC + 50% GPP",
    "40% PC + 60% GPP",
    "30% PC + 70% GPP",
    "40% GPP + 60% SC",
    "10% PC + 40% GPP + 50% SC",
    "20% PC + 40% GPP + 40% SC",
    # ... (include all your blends here) ...
    "50% PLC + 50% QUARTZ"
]

# NOTE: this rebinds `df`, which an earlier cell assigned from new_iso.csv,
# to a one-column frame holding only the sample names above.
df = pd.DataFrame({"Original Blend Name": blend_data})

# 2. Material codes that each get their own entry in the parsed result
materials = [
    "PC", "PLC", "GPP", "SC", "FA",
    "CC", "VA", "GBA", "FBC", "QUARTZ",
]

# 3. Parser that turns one blend name into per-material percentages
def parse_blend(blend):
    """Return a dict mapping every code in ``materials`` to its percentage in ``blend``.

    The name is upper-cased, then every "<integer>% <letters>" term is read.
    Amounts for a code listed in ``materials`` are summed (so a repeated code
    accumulates); terms with any other code are ignored. Codes that do not
    appear keep the value 0.
    """
    totals = dict.fromkeys(materials, 0)
    # Integer amounts only; a decimal such as 10.5% would need (\d+(?:\.\d+)?).
    for term in re.finditer(r"(\d+)%\s*([A-Z]+)", blend.upper()):
        amount_text, code = term.groups()
        if code not in totals:
            continue
        totals[code] += int(amount_text)
    return totals

# 4. Parse every blend name; the result holds one dict of percentages per row.
parsed_data = df["Original Blend Name"].apply(parse_blend)
parsed_df = pd.DataFrame(parsed_data.tolist())  # one column per material key

# 5. Place the parsed percentage columns beside the original names (column-wise concat).
df_final = pd.concat([df, parsed_df], axis=1)

# 6. Print up to the first 15 rows of the combined frame.
print(df_final.head(15))  # adjust the row count as needed


          Original Blend Name   PC  PLC  GPP  SC  FA  CC  VA  GBA  FBC  QUARTZ
0                     100% PC  100    0    0   0   0   0   0    0    0       0
1            80% PC + 20% GPP   80    0   20   0   0   0   0    0    0       0
2            70% PC + 30% GPP   70    0   30   0   0   0   0    0    0       0
3            60% PC + 40% GPP   60    0   40   0   0   0   0    0    0       0
4            50% PC + 50% GPP   50    0   50   0   0   0   0    0    0       0
5            40% PC + 60% GPP   40    0   60   0   0   0   0    0    0       0
6            30% PC + 70% GPP   30    0   70   0   0   0   0    0    0       0
7            40% GPP + 60% SC    0    0   40  60   0   0   0    0    0       0
8   10% PC + 40% GPP + 50% SC   10    0   40  50   0   0   0    0    0       0
9   20% PC + 40% GPP + 40% SC   20    0   40  40   0   0   0    0    0       0
10       50% PLC + 50% QUARTZ    0   50    0   0   0   0   0    0    0      50


In [48]:
import pandas as pd
import re

# Small stand-in frame with an "Original Blend Name" column, used to
# demonstrate the grouping logic defined in this cell.
data = {
    "Original Blend Name": [
        "100% PC",
        "80% PC + 20% GPP",
        "70% PC + 30% GPP",
        "60% PC + 40% GPP",
        "50% PC + 50% GPP",
        "40% PC + 60% GPP",
        "30% PC + 70% GPP",
        "80% PC + 20% QUARTZ",
        "60% PLC + 40% GPP",
        "100% PLC"
    ]
}
# NOTE: this rebinds `df` to the 10-row sample frame built from `data`.
df = pd.DataFrame(data)

# Helper function to extract the percentage of a material from a blend string.
def get_percentage(blend, material):
    """Return the integer percentage attached to ``material`` in ``blend``.

    Parameters
    ----------
    blend : str
        Blend description such as "60% PLC + 40% GPP".
    material : str
        Material code to look for, e.g. "PC", "PLC" or "QUARTZ".

    Returns
    -------
    int
        The number in the first "<number>% <material>" term, or 0 when the
        material does not occur in the blend.

    Notes
    -----
    Matching ignores case on both arguments. No special handling is needed
    to tell "PC" from "PLC": the code must follow the "<number>%" (and any
    whitespace) directly and end at a word boundary, so "PC" cannot match
    the text "PLC" and vice versa.
    """
    # Upper-case both sides so the comparison is case-insensitive.
    blend = blend.upper()
    material = material.upper()
    # re.escape keeps the material code literal even if it contains
    # characters that are special in a regular expression.
    pattern = r"(\d+)%\s*" + re.escape(material) + r"\b"
    match = re.search(pattern, blend)
    return int(match.group(1)) if match else 0

# Map a blend name to its group label.
def assign_group(blend):
    """Return the group label for a blend name.

    The PC, PLC, GPP and QUARTZ percentages are read with ``get_percentage``
    and the first matching rule below decides the label:

    1. PC is exactly 70 / 60 / 50 / 40 -> "Group 2" / "Group 3" / "Group 4" / "Group 5"
    2. PC and QUARTZ both present      -> "Group 6"
    3. PLC and GPP both present        -> "Group 7"
    4. PC and GPP both present         -> "Group 1"
    5. anything else                   -> "Uncategorized"

    The exact-percentage rules come first, so they win over the
    combination rules regardless of the other materials in the blend.
    """
    name = blend.upper()
    pc, plc, gpp, quartz = (
        get_percentage(name, code) for code in ("PC", "PLC", "GPP", "QUARTZ")
    )

    # Exact PC percentages take priority over every combination rule.
    exact_pc_groups = {70: "Group 2", 60: "Group 3", 50: "Group 4", 40: "Group 5"}
    if pc in exact_pc_groups:
        return exact_pc_groups[pc]

    if pc > 0 and quartz > 0:
        return "Group 6"
    if plc > 0 and gpp > 0:
        return "Group 7"
    # Remaining PC + GPP blends (PC not equal to 70/60/50/40).
    if pc > 0 and gpp > 0:
        return "Group 1"
    return "Uncategorized"

# Add a "Group" column holding assign_group's label for each blend name.
df["Group"] = df["Original Blend Name"].apply(assign_group)

# Print the whole frame so the labels can be checked by eye.
print(df)


   Original Blend Name          Group
0              100% PC  Uncategorized
1     80% PC + 20% GPP        Group 1
2     70% PC + 30% GPP        Group 2
3     60% PC + 40% GPP        Group 3
4     50% PC + 50% GPP        Group 4
5     40% PC + 60% GPP        Group 5
6     30% PC + 70% GPP        Group 1
7  80% PC + 20% QUARTZ        Group 6
8    60% PLC + 40% GPP        Group 7
9             100% PLC  Uncategorized
