<a href="https://colab.research.google.com/github/santhoshkumaroff/Angular-E-commerce-website/blob/master/proj1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the reference table from CSV into the notebook-wide `df`.
# The matching functions in this notebook read the 'description' and
# 'technical_code' columns of the frame they are given.
df = pd.read_csv('Standard File.csv')

# Function to find matching descriptions and percentage
def find_best_matches(new_description, df, top_n=5):
    """Rank the rows of ``df`` by TF-IDF cosine similarity to a query text.

    Parameters
    ----------
    new_description : str
        Free-text description to look up.
    df : pandas.DataFrame
        Reference table with ``'description'`` and ``'technical_code'``
        columns.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``description``, ``technical_code`` and
        ``similarity_percentage`` (cosine similarity scaled to 0-100),
        sorted from best to worst match and keeping the index labels of
        ``df``. Empty (but with the same columns) when ``df`` has no rows.
    """
    # Nothing to compare against: scikit-learn's input validation rejects a
    # zero-row matrix, so return an empty result with the usual columns.
    if len(df) == 0:
        return (
            df[['description', 'technical_code']]
            .assign(similarity_percentage=pd.Series(index=df.index, dtype=float))
            .head(top_n)
        )

    # TfidfVectorizer refuses NaN documents, so missing text becomes ''.
    corpus = df['description'].fillna('').astype(str).tolist()

    # The query is vectorized together with the corpus (as the last row) so
    # that both share one vocabulary and one set of IDF weights.
    corpus.append(new_description)
    tfidf = TfidfVectorizer().fit_transform(corpus)

    # Keep the matrix sparse: cosine_similarity accepts sparse input, so
    # there is no need to materialize a dense (rows x vocabulary) array.
    cosine_similarities = cosine_similarity(tfidf[-1:], tfidf[:-1]).flatten()

    similarity_df = pd.DataFrame({
        'description': df['description'],
        'technical_code': df['technical_code'],
        'similarity_percentage': cosine_similarities * 100  # Convert to percentage
    })

    # Best matches first, then truncate to the requested number of rows.
    similarity_df = similarity_df.sort_values(by='similarity_percentage', ascending=False)
    return similarity_df.head(top_n)

# Demo: look up one free-text description with the TF-IDF matcher and
# display the ranked result as the cell output.
new_description = "Anchoring-Anchoring "
matches = find_best_matches(new_description=new_description, df=df)
matches

Unnamed: 0,description,technical_code,similarity_percentage
2024,Yard & Marine Operations-Yard Facilitating-Anc...,YMF20,53.134444
10,Anchoring-Anchor Line-Anchoring Chain-Chain-Chain,ANL01,49.776645
0,Anchoring-Anchoring Ancillary-Anchoring Spring...,ANA01,49.383888
11,Anchoring-Anchor Line-Anchoring Polyester Rope...,ANL02,34.704171
1,Anchoring-Anchoring Ancillary-Anchor Line Acce...,ANA03,33.74251


In [None]:
def jaccard_similarity(desc1, desc2):
    """Return the Jaccard similarity of two texts as a percentage (0-100).

    Each text is lower-cased and split on whitespace; the score is the size
    of the intersection of the two token sets divided by the size of their
    union, times 100.

    Parameters
    ----------
    desc1, desc2 : str
        Texts to compare.

    Returns
    -------
    float
        Similarity percentage. When neither text contains a token (both are
        empty or whitespace-only) the texts are treated as identical and
        100.0 is returned instead of dividing by zero.
    """
    tokens1 = set(desc1.lower().split())
    tokens2 = set(desc2.lower().split())

    union = tokens1 | tokens2
    if not union:
        # Both token sets are empty: the ratio is 0/0, so define it as a
        # perfect match (identical inputs) rather than raising.
        return 100.0

    return len(tokens1 & tokens2) / len(union) * 100

def find_best_matches_jaccard(new_description, df, top_n=5):
    """Rank the rows of ``df`` by Jaccard similarity to ``new_description``.

    Reads the ``'description'`` and ``'technical_code'`` columns of ``df``
    and returns a new DataFrame with columns ``description``,
    ``technical_code`` and ``similarity_percentage``, sorted from highest to
    lowest score and limited to the first ``top_n`` rows.
    """
    def scored_rows():
        # One (description, code, score) record per row of the input frame.
        for _, record in df.iterrows():
            score = jaccard_similarity(new_description, record['description'])
            yield record['description'], record['technical_code'], score

    ranked = pd.DataFrame(
        list(scored_rows()),
        columns=['description', 'technical_code', 'similarity_percentage'],
    ).sort_values(by='similarity_percentage', ascending=False)

    return ranked.head(top_n)

# Demo: rank the rows of `df` against one description with the Jaccard
# matcher and display the result as the cell output.
new_description = "Anchoring-Anchoring Ancillary-Anchoring Spring"
matches_jaccard = find_best_matches_jaccard(new_description=new_description, df=df)
matches_jaccard


Unnamed: 0,description,technical_code,similarity_percentage
0,Anchoring-Anchoring Ancillary-Anchoring Spring...,ANA01,60.0
2,Anchoring-Anchoring Ancillary-Anchor Clump Wei...,ANA04,14.285714
1,Anchoring-Anchoring Ancillary-Anchor Line Acce...,ANA03,14.285714
2222,#NAME?,BBC02,0.0
1484,Structural-Structural Steel-Secondary Steel - ...,STA05,0.0


In [None]:
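# The next cell needs the third-party Levenshtein package. If the import
# fails with ModuleNotFoundError, install it first from a notebook cell:
#   %pip install Levenshtein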

In [None]:
import Levenshtein

# Function to calculate Levenshtein similarity percentage
def levenshtein_similarity(desc1, desc2):
    """Return a Levenshtein-based similarity of two strings as a percentage.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, then scaled to 0-100.

    Parameters
    ----------
    desc1, desc2 : str
        Strings to compare.

    Returns
    -------
    float
        Similarity percentage. Two empty strings are identical, so 100.0 is
        returned for them instead of dividing by a zero length.
    """
    max_len = max(len(desc1), len(desc2))
    if max_len == 0:
        # Both strings are empty: identical by definition, and the
        # normalization below would otherwise divide by zero.
        return 100.0

    distance = Levenshtein.distance(desc1, desc2)
    return (1 - distance / max_len) * 100  # Convert to percentage

# Function to find best matching descriptions using Levenshtein distance
def find_best_matches_levenshtein(new_description, df, top_n=5):
    """Rank the rows of ``df`` by Levenshtein similarity to ``new_description``.

    Reads the ``'description'`` and ``'technical_code'`` columns of ``df``
    and returns a new DataFrame with columns ``description``,
    ``technical_code`` and ``similarity_percentage``, ordered from highest
    to lowest score and cut to the first ``top_n`` rows.
    """
    def score_row(row):
        # Build the (description, code, score) record for a single row.
        pct = levenshtein_similarity(new_description, row['description'])
        return (row['description'], row['technical_code'], pct)

    scored = [score_row(row) for _, row in df.iterrows()]

    result_columns = ['description', 'technical_code', 'similarity_percentage']
    ranked = pd.DataFrame(scored, columns=result_columns)
    ranked = ranked.sort_values(by='similarity_percentage', ascending=False)

    return ranked.head(top_n)

# Demo: rank the rows of `df` with the Levenshtein matcher.
# `new_description` is not assigned in this cell; it must already exist in
# the kernel from an earlier cell.
matches_levenshtein = find_best_matches_levenshtein(new_description=new_description, df=df)
matches_levenshtein


ModuleNotFoundError: No module named 'Levenshtein'

In [None]:
import pandas as pd

# Load the Excel file
file_path = 'MASTER FILE.xlsx'  # Replace with your actual file path

# Open the workbook in a context manager so its file handle is released as
# soon as the sheet has been read, then load the sheet holding the data.
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name='FINAL')

# Keep only the description and technical-class columns.
# NOTE(review): 'Technical  Class' is written with two spaces here; confirm
# it matches the sheet header exactly, otherwise this lookup raises KeyError.
df_filtered = df[['Description', 'Technical  Class']]  # Adjust column names if necessary

# Save the filtered data in both formats (row index omitted).
df_filtered.to_excel('filtered_data.xlsx', index=False)  # For Excel
df_filtered.to_csv('filtered_data.csv', index=False)  # For CSV

print("Filtered dataset saved successfully!")


In [None]:
# Re-read the filtered CSV into `df` (replacing whatever `df` held before)
# and display it as the cell output.
df = pd.read_csv('filtered_data.csv')
df

In [None]:
import pandas as pd
import re

file_path = 'filtered_data.csv'
df = pd.read_csv(file_path)

# Tokens to strip from every description.
words_to_remove = ['FOR', ',']

# Alphanumeric tokens are wrapped in word boundaries so that only the
# standalone word is removed; a bare 'FOR' pattern would also cut the letters
# out of longer words such as "REINFORCED". Other tokens (punctuation) are
# matched literally wherever they occur.
removal_pattern = '|'.join(
    rf'\b{re.escape(token)}\b' if token.isalnum() else re.escape(token)
    for token in words_to_remove
)

df['Description'] = (
    df['Description']
    .str.replace(removal_pattern, '', regex=True)
    # Removing a word leaves a doubled space behind; collapse whitespace runs.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Persist the cleaned table in both formats (row index omitted).
df.to_csv('cleaned_filtered_data.csv', index=False)
df.to_excel('cleaned_filtered_data.xlsx', index=False)

In [None]:
# Display the current contents of `df` as the cell output.
df