In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import chardet

In [63]:
features = {
    "Datasets/CoffeeMachineData.csv":{
            "numerical":['Capacity (Liters)','Power Output (kW)','Price (USD)','Brew Time (Minutes)','Weight (kg)'],
            "categorical":['Machine Name','Type','Ease of Use','Brew Quality'],
        },
    "Datasets/Fruit_Veg_Processing_Machines.csv": {
            "numerical":['Speed (kg/hr)', 'Power Input (kW)', 'Efficiency (%)','Price (USD)'],
            "categorical":['Machine Name', 'MType', 'MachineMaterial','Manufacturer'],
        },
    "Datasets/grain_machinery_data.csv":{
            "numerical":['Capacity (tons/hour)', 'Power Output (kW)', 'Price (USD)'],
            "categorical":['Grain', 'Machine Name', 'Grain Manufacturer'],
        },
    "Datasets/Ice_Cream_Makers.csv":{
            "numerical":['Noise Levels','Power(W)','Price (USD)'],
            "categorical":['Machine Name', 'Capacity', 'Batch Output'],
        },
    "Datasets/juice_makers.csv":
        {
            "numerical":[
                        'Motor Power (W)', 'Juicing Speed (RPM)', 'Noise Level (dB)',
                        'Customer Rating', 'Number of Reviews', 'Price (USD)',
                        ],
            "categorical":['Machine Name', 'Material','Type of Juicer'],
        },    
}

In [64]:
datasets = ["Datasets/CoffeeMachineData.csv",
            "Datasets/Fruit_Veg_Processing_Machines.csv",
            "Datasets/grain_machinery_data.csv",
            "Datasets/Ice_Cream_Makers.csv",
            "Datasets/juice_makers.csv",]

In [65]:
def encoding_detect(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

def calc_combined_feature_matrix(file_path, user_input):
    df = pd.read_csv(file_path, encoding=encoding_detect(file_path))   
    categorical_cols = features[file_path]["categorical"]
    numerical_cols = features[file_path]["numerical"] 
    
    df[categorical_cols] = df[categorical_cols].fillna('')
    for col in numerical_cols:
        if col not in df.columns:
            raise ValueError(f"Numerical column '{col}' is missing in the dataset.")
    
    scaler = MinMaxScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    df['combined_features'] = df[categorical_cols].apply(lambda x: ' '.join(x).lower(), axis=1)
    tfidf = TfidfVectorizer(stop_words='english')
    text_features = tfidf.fit_transform(df['combined_features'])  
    combined_features_matrix = np.hstack([
        text_features.toarray(), 
        df[numerical_cols].values
    ])
    return combined_features_matrix

In [66]:
def findr(file_path, user_input, findr_col, n_recom = 20): 
    df = pd.read_csv(file_path, encoding=encoding_detect(file_path))
    combined_features_matrix = calc_combined_feature_matrix(file_path, user_input)
    cosine_sim = cosine_similarity(combined_features_matrix)
    # print("Cosine Shape: ",cosine_sim.shape)
    # print("Cosine Similarity: \n",cosine_sim)
    column = findr_col
    matching_rows = df[df[column].str.contains(user_input, case=False, na=False, regex=False)]
    if matching_rows.empty:
        return f"Machine '{user_input}' not found in the dataset."
    
    idx = matching_rows.index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    print("sim score shape: ",len(sim_scores))
    print("sim scores: \n",sim_scores)
    sim_indices = [i[0] for i in sim_scores[1:n_recom+1]]
    
    df = pd.read_csv(file_path,encoding=encoding_detect(file_path))
    data = df.iloc[sim_indices]
    return data    

In [67]:
# user_input = 'drip'
# findr_col = 'Type'
# num_recommendations = 50
# recommendations = findr(datasets[0], user_input, findr_col,n_recom=num_recommendations) # incomplete - take datasets index from 
# recommendations.head()

In [68]:
# drip_recommendations = recommendations[recommendations['Type'].str.contains('Drip', case = False, na = False)] 

In [69]:
def precision(recommendations, drip_recommendations):
    precision = len(drip_recommendations) / len(recommendations)
    return precision

def coverage(recommendations, dataset_path): 
    df = pd.read_csv(dataset_path, encoding=encoding_detect(dataset_path))
    total_unique_items = len(df['Machine Name'].unique())
    unique_items_in_recommendations = len(recommendations['Machine Name'].unique())
    coverage = unique_items_in_recommendations / total_unique_items
    return coverage

# def divesity_score(recommendations, file_path, user_input):
#     n = len(recommendations)
#     combined_features_matrix = calc_combined_feature_matrix(file_path, user_input)
#     recommended_indices = recommendations.index
#     recommended_features = combined_features_matrix[recommended_indices]
#     pairwise_similarities = cosine_similarity(recommended_features)
#     total_pairs = n * (n - 1) / 2
#     average_similarity = (np.sum(pairwise_similarities) / total_pairs) if total_pairs > 0 else 0
#     diversity_score = 1 - average_similarity
#     return diversity_score    

In [70]:
def evaluate_recommendations(dataset_path, user_input, findr_col, n_recom=20):
    if dataset_path not in features:
        raise ValueError(f"Dataset {dataset_path} not found in features dictionary")
    
    categorical_cols = features[dataset_path]["categorical"]
    if findr_col not in categorical_cols:
        raise ValueError(f"Search column {findr_col} not found in categorical columns for {dataset_path}")
    
    # Get recommendations
    recommendations = findr(dataset_path, user_input, findr_col, n_recom)
    
    if isinstance(recommendations, str):  # Error message returned
        return {
            "error": recommendations,
            "metrics": None
        }
    
    # 1. Precision
    matching_recommendations = recommendations[recommendations[findr_col].str.contains(user_input, case=False, na=False, regex=False)]
    prec = precision(recommendations, matching_recommendations)
    
    # 2. Coverage
    cov = coverage(recommendations, dataset_path)
    
    # 3. Diversity
    # div = divesity_score(recommendations, dataset_path, user_input)
    
    results = {
        "metrics": {
            "precision": prec,
            "coverage": cov,
            # "diversity": div
        },
        "details": {
            "dataset": dataset_path,
            "search_column": findr_col,
            "user_input": user_input,
            "total_recommendations": len(recommendations),
            "matching_recommendations": len(matching_recommendations),
            "unique_recommendations": len(recommendations['Machine Name'].unique()),
            "total_catalog_items": len(pd.read_csv(dataset_path, encoding=encoding_detect(dataset_path))['Machine Name'].unique())
        }
    }
    return results

In [71]:
def test_recommendations():
    test_cases = [
        # Coffee machines
        {"dataset": datasets[0], "input": "drip", "col": "Type"},
        {"dataset": datasets[0], "input": "espresso", "col": "Type"},
        
        # Fruit/Veg equipment
        {"dataset": datasets[1], "input": "juicer", "col": "MType"},
        
        # Grain processing
        {"dataset": datasets[2], "input": "wheat", "col": "Grain"},
        
        # Ice cream machines
        {"dataset": datasets[3], "input": "commercial", "col": "Machine Name"},
        
        # Juice machines
        {"dataset": datasets[4], "input": "centrifugal", "col": "Type of Juicer"}
    ]
    
    for case in test_cases:
        print(f"\nTesting {case['dataset']} with input '{case['input']}' in column '{case['col']}'")
        try:
            result = evaluate_recommendations(case['dataset'], case['input'], case['col'])
            if result['metrics']:
                print(f"Precision: {result['metrics']['precision']:.4f}")
                print(f"Coverage: {result['metrics']['coverage']:.4f}")
                # print(f"Diversity: {result['metrics']['diversity']:.4f}")
                print(f"Total recommendations: {result['details']['total_recommendations']}")
            else:
                print(f"Error: {result['error']}")
        except Exception as e:
            print(f"Error: {str(e)}")

In [72]:
result = evaluate_recommendations(datasets[0], "drip", "Type", n_recom=30)
print("Results for Fruit/Veg Equipment - 'cutter':")
if result['metrics']:
    print(f"Precision: {result['metrics']['precision']:.4f}")
    print(f"Coverage: {result['metrics']['coverage']:.4f}")
    print("\nDetails:")
    for key, value in result['details'].items():
        print(f"{key}: {value}")

sim score shape:  109
sim scores: 
 [(2, np.float64(0.9999999999999998)), (87, np.float64(0.43349882610276314)), (6, np.float64(0.3976334958777079)), (106, np.float64(0.39106761446240224)), (30, np.float64(0.36381577816501975)), (102, np.float64(0.35684076186800007)), (103, np.float64(0.3534838240887347)), (105, np.float64(0.34806855673007686)), (63, np.float64(0.34456088178495026)), (66, np.float64(0.33306432103885236)), (101, np.float64(0.33226157154543234)), (86, np.float64(0.32481420222576096)), (53, np.float64(0.3223295514860177)), (20, np.float64(0.32211166327852725)), (19, np.float64(0.314454336568684)), (82, np.float64(0.3124425012433271)), (81, np.float64(0.31107303592094926)), (96, np.float64(0.2978507846063059)), (90, np.float64(0.2942908184737521)), (21, np.float64(0.2932148511872739)), (95, np.float64(0.2914756717630736)), (69, np.float64(0.28956856359440775)), (92, np.float64(0.2890230358258039)), (93, np.float64(0.2885356169876134)), (60, np.float64(0.28699894421560246))

In [73]:
# pre = precision(recommendations, drip_recommendations)
# cov = coverage(recommendations, datasets[0]) 
# div = divesity_score(recommendations, datasets[0], user_input)  


# print(f"Precision: {pre:.4f}")
# print(f"Coverage: {cov:.4f}")
# print(f"Diversity: {div:.4f}")