## **Introduction to Data Science - Nhập môn khoa học dữ liệu - CSC14119**
### **HCMUS - Trường Đại học khoa học tự nhiên - Nov 2024.**
### **Đồ án thực hành cuối kì - Handling Real-World Problem.**
#### **Due:** 24/12/2024.
#### **Lớp:** 22_21.
#### **Giảng viên hướng dẫn:** Thầy Lê Ngọc Thành - Thầy Lê Nhựt Nam.
#### **STT nhóm:** 9.
---
**1. Import Libraries:**


In [2]:
from Libraries_Used import *
from Shared_Functions import *

**2. Create the context for ingredients**

In [3]:
# Lift the row limit so DataFrames are displayed in full.
pd.set_option('display.max_rows', None)

# The label file is read without a header row and then transposed; column 0
# of the transposed frame is exposed under the name 'label'.
path = os.path.join('..', 'Assert', 'Ingredient_Labels.csv')
ingre_lb = (
    pd.read_csv(path, header=None)
    .transpose()
    .rename(columns={0: 'label'})
)
ingre_lb

# Plain Python list of the labels, consumed by the request cells below.
ingredients = ingre_lb['label'].tolist()

In [4]:
# Chat-completions endpoint the request helper posts to.
url = "http://localhost:1234/v1/chat/completions"

# Headers sent with every request: the body is JSON.
headers = {"Content-Type": "application/json"}

# Module-level list, empty at this point.
data = []

In [5]:
def fetch_response_for_ingredient(ingredient, url, headers, model="qwen2.5-14b-instruct", timeout=30):
    """Ask the chat-completions endpoint to describe one ingredient.

    Args:
        ingredient: Ingredient name interpolated into the prompt.
        url: Endpoint that receives the POST request.
        headers: HTTP headers sent with the request.
        model: Model identifier placed in the request payload.
        timeout: Seconds passed to ``requests.post`` as the timeout.

    Returns:
        A ``(ingredient, reply)`` tuple. ``reply`` is the content of the
        first choice's message, or ``None`` when the request fails, the
        status code is not 200, or the body does not have the expected
        ``choices[0].message.content`` structure.
    """
    # Adjacent string literals are concatenated verbatim, so every sentence
    # has to carry its own trailing separator.
    prompt = (
        f"Tell me the type, context, flavor, smell of {ingredient}. "
        "with knowledge that must be true to reality and must not be fabricated. "
        "The result will be in the following format: "
        "\"type : your answer, context : your answer, flavor : your answer, smell : your answer\"; "
        "where type can only belong to one or more of the following factors ['Spice', 'Fruit', 'Vegetable', 'Dairy', 'Meat', 'Grain', 'Condiment', "
        "'Seafood', 'Herb', 'Nut', 'Sweetener', 'Oil', 'Beverage', 'Fermenting Agent', "
        "'Legume', 'Mushroom', 'Pasta', 'Bread', 'Sauce'], "
        "flavor can only belong to one or more of the following factors ['Sweet', 'Salty', 'Spicy', 'Bitter', 'Sour', 'Umami'], "
        "context can only belong to one or more of the following factors ['Binding', 'Thickening', 'Flavoring', 'Sweetening', 'Preserving', 'Topping', "
        "'Fermentation', 'Main Ingredient', 'Garnishing', 'Base', 'Tenderizing', 'Emulsifying', 'Coating', "
        "'Moisturizing', 'Coloring', 'Binding Agent'], "
        "smell can only belong to one or more of the following factors ['Sweet', 'Sour', 'Spicy', 'Bitter', 'Umami', 'Fruity', 'Nutty', 'Smoky', "
        "'Herbal', 'Earthy', 'Fishy', 'Yeasty', 'Citrusy', 'Milky', 'Pungent', 'Floral', 'Fresh', 'Savory', 'Neutral']. "
        "No need to add any other information such as notes or cautions."
    )
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        return ingredient, None

    if response.status_code != 200:
        return ingredient, None

    # A 200 response can still carry a body that is not JSON or that lacks
    # the expected keys; report that as "no reply" instead of raising.
    try:
        reply = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        return ingredient, None
    return ingredient, reply


def fetch_and_log_responses_batch(ingredients, url, headers, output_file="all_replies.txt", batch_size=32, delay=2):
    """Fetch a reply for every ingredient and log the replies to a text file.

    One entry is written per ingredient, in input order, as ``repr(reply)``
    followed by a blank line. Single quotes are removed from the reply
    before it is written. Ingredients without a usable reply are written as
    an empty-string entry, so the n-th entry in the file always belongs to
    the n-th ingredient; consumers that pair entries with ingredients by
    position stay aligned.

    Args:
        ingredients: Sequence of ingredient names to query.
        url: Endpoint forwarded to ``fetch_response_for_ingredient``.
        headers: HTTP headers forwarded to ``fetch_response_for_ingredient``.
        output_file: Path of the text file to (over)write.
        batch_size: Step used to slice ``ingredients`` into consecutive
            batches; batches are processed one after another.
        delay: Seconds to sleep after each ingredient.

    Returns:
        None. A summary of failed ingredients is printed at the end.
    """
    failed_ingredients = []

    with open(output_file, "w", encoding="utf-8") as reply_file:
        with tqdm(total=len(ingredients), desc="Fetching Responses", unit="ingredient") as progress_bar:
            for i in range(0, len(ingredients), batch_size):
                batch = ingredients[i:i + batch_size]

                for ingredient in batch:
                    cleaned_reply = ''
                    try:
                        _, reply = fetch_response_for_ingredient(ingredient, url, headers)
                        # Check for a missing reply before touching it.
                        if reply:
                            cleaned_reply = reply.replace('\'', '')
                    except Exception:
                        # Best effort: a single bad ingredient must not abort
                        # the run; it is reported in the summary below.
                        cleaned_reply = ''

                    if not cleaned_reply:
                        failed_ingredients.append(ingredient)

                    # Always write an entry (empty for failures) to keep the
                    # file positionally aligned with `ingredients`.
                    reply_file.write(repr(cleaned_reply) + "\n\n")

                    progress_bar.update(1)

                    time.sleep(delay)

    # Report failed ingredients
    if failed_ingredients:
        print("\nFailed to fetch responses for the following ingredients:")
        for ingredient in failed_ingredients:
            print(f" - {ingredient}")
    else:
        print("\nAll ingredients processed successfully!")

In [6]:
def process_responses(input_file="all_replies.txt", ingredients_list = None):
    """Parse logged replies into a DataFrame of ingredient attributes.

    The file is split on blank lines. Every non-empty entry is searched for
    ``type``, ``context``, ``flavor`` and ``smell`` fields (case-insensitive,
    ``label : value[, value...]``). Entries are paired with
    ``ingredients_list`` by their position in the file.

    Args:
        input_file: Path of the text file holding the logged replies.
        ingredients_list: Optional list of ingredient names; when it is
            missing or shorter than the number of entries, the name
            "Unknown" is used.

    Returns:
        A DataFrame with the columns "Ingredient", "Type", "Context",
        "Flavor" and "Smell". A field that cannot be found is "N/A".
    """
    fields = ("type", "context", "flavor", "smell")
    field_patterns = {
        field: re.compile(field + r"\s*:\s*([^:,]+(?:,[^:,]+)*)", re.IGNORECASE)
        for field in fields
    }
    # The value pattern only stops at the next ':', so the capture ends with
    # the label of the following field (e.g. "Spice, context"); this pattern
    # trims that trailing label.
    trailing_label = re.compile(r"(?:,\s*)?\b(?:type|context|flavor|smell)\b\s*$", re.IGNORECASE)

    def extract_response(response_text):
        """Return the (type, context, flavor, smell) values of one entry."""
        # Entries are stored as repr() strings: turn escaped newlines into
        # spaces and drop the quote characters.
        response_text = response_text.replace('\\n', ' ')
        response_text = response_text.replace('\'', '')

        values = []
        for field in fields:
            match = field_patterns[field].search(response_text)
            value = trailing_label.sub("", match.group(1)).strip() if match else ""
            values.append(value or "N/A")
        return tuple(values)

    data = []
    with open(input_file, "r", encoding="utf-8") as file:
        responses = file.read().split("\n\n")

    for idx, response in enumerate(responses):
        if not response.strip():
            continue
        type_value, context_value, flavor_value, smell_value = extract_response(response)
        if ingredients_list and idx < len(ingredients_list):
            ingredient_name = ingredients_list[idx]
        else:
            ingredient_name = "Unknown"
        data.append({
            "Ingredient": ingredient_name,
            "Type": type_value,
            "Context": context_value,
            "Flavor": flavor_value,
            "Smell": smell_value
        })

    return pd.DataFrame(data)

In [7]:
# # Send request and write response to file
# fetch_and_log_responses_batch(ingredients, url, headers, output_file="all_replies.txt", batch_size=32, delay=2)

# # Process responses from files
# df = process_responses("all_replies.txt", ingredients)

# # Save results to CSV
# df.to_csv("ingredients_analysis.csv", index=False, encoding="utf-8-sig")
# print("Done! CSV file has been saved.")

**4. Pre-processing the ingredient context data (ingredients_analysis.csv)**

In [None]:
# Load the extraction results saved as CSV into a DataFrame.
file_path = "ingredients_analysis.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

def clean_column_values(value):
    """Strip stray field labels from one cell value.

    Removes every whole-word occurrence of "type", "context", "flavor" or
    "smell" (case-insensitive), together with a comma and whitespace that
    directly precede it, then trims surrounding whitespace. Null values are
    returned unchanged.
    """
    if not pd.isnull(value):
        label_pattern = r'(,\s*)?\b(type|context|flavor|smell)\b'
        without_labels = re.sub(label_pattern, '', value, flags=re.IGNORECASE)
        return without_labels.strip()
    return value

# Clean every column except the first one.
attribute_columns = df.columns[1:]
for column_name in attribute_columns:
    cleaned_column = df[column_name].apply(clean_column_values)
    df[column_name] = cleaned_column

# Persist the cleaned table, then show it.
df.to_csv("cleaned_ingredients_data.csv", index=False, encoding="utf-8-sig")

print(df)


                          Ingredient  \
0                       active yeast   
1                       agave nectar   
2                  all-purpose flour   
3                             almond   
4                     almond extract   
5                       almond flour   
6                        almond milk   
7                          aloe vera   
8                           amaretto   
9                american beef belly   
10                           anchovy   
11                    annatto powder   
12                     annatto seeds   
13                             apple   
14               apple cider vinegar   
15                       apple juice   
16                           apricot   
17                       apricot jam   
18                  arrowroot powder   
19                         artichoke   
20                         asparagus   
21                           avocado   
22                    back ribs tips   
23                          back-fat   


In [15]:
# Build a 0/1 indicator table: one row per ingredient, one column per label.
context_df = pd.read_csv('cleaned_ingredients_data.csv')

# NOTE(review): 'Sweet', 'Spicy', 'Bitter', 'Sour' and 'Umami' are listed for
# both flavor and smell, so the frame below has repeated column labels.
# Confirm that this is intended.
numerical_df_columns = [
    # type
    'Spice', 'Fruit', 'Vegetable', 'Dairy', 'Meat', 'Grain', 'Condiment',
    'Seafood', 'Herb', 'Nut', 'Sweetener', 'Oil', 'Beverage', 'Fermenting Agent',
    'Legume', 'Mushroom', 'Pasta', 'Bread', 'Sauce',
    # flavor
    'Sweet', 'Salty', 'Spicy', 'Bitter', 'Sour', 'Umami',
    # context
    'Binding', 'Thickening', 'Flavoring', 'Sweetening', 'Preserving', 'Topping',
    'Fermentation', 'Main Ingredient', 'Garnishing', 'Base', 'Tenderizing', 'Emulsifying', 'Coating',
    'Moisturizing', 'Coloring', 'Binding Agent',
    # smell
    'Sweet', 'Sour', 'Spicy', 'Bitter', 'Umami', 'Fruity', 'Nutty', 'Smoky',
    'Herbal', 'Earthy', 'Fishy', 'Yeasty', 'Citrusy', 'Milky', 'Pungent', 'Floral', 'Fresh', 'Savory', 'Neutral',
]

# Start from an all-zero frame that shares the row index of the input.
numerical_df = pd.DataFrame(0, index=context_df.index, columns=numerical_df_columns)

known_labels = set(numerical_df_columns)
for idx, row in context_df.iterrows():
    # Collect the distinct, non-empty comma-separated entries of the four
    # attribute columns of this row.
    row_labels = {
        piece.strip()
        for column_name in ['Type', 'Context', 'Flavor', 'Smell']
        for piece in str(row[column_name]).split(',')
        if piece.strip()
    }

    # Flag only the entries that match one of the known labels.
    for label in row_labels & known_labels:
        numerical_df.at[idx, label] = 1

print(numerical_df)
numerical_df.to_csv('numerical_context_data.csv', index=False, encoding='utf-8-sig')

     Spice  Fruit  Vegetable  Dairy  Meat  Grain  Condiment  Seafood  Herb  \
0        0      0          0      0     0      0          0        0     0   
1        0      0          0      0     0      0          0        0     0   
2        0      0          0      0     0      1          0        0     0   
3        0      0          0      0     0      0          0        0     0   
4        0      0          0      0     0      0          0        0     0   
5        0      0          0      0     0      0          0        0     0   
6        0      0          0      0     0      0          0        0     0   
7        0      0          1      0     0      0          0        0     0   
8        0      0          0      0     0      0          0        0     0   
9        0      0          0      0     1      0          0        0     0   
10       0      0          0      0     0      0          0        1     0   
11       1      0          0      0     0      0          0     