<a href="https://colab.research.google.com/github/someshjoyguru/EcoFind/blob/main/Google_Colab_Notebooks/merged.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive for access
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to '/content/drive/MyDrive/EcoTech'
%cd /content/drive/MyDrive/EcoTech

Mounted at /content/drive
/content/drive/MyDrive/EcoTech


In [2]:
# Import the pandas library for data manipulation
import pandas as pd

# Import the numpy library for numerical operations
import numpy as np

In [3]:
# Load the first .pkl dataset (assuming it's a dictionary)
data1 = pd.read_pickle('products1.pkl')

# Load the second .pkl dataset (assuming it's a dictionary)
data2 = pd.read_pickle('products2.pkl')

# Convert the dictionaries to DataFrames
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

In [4]:
# Rename the 'Image' column to 'image' in DataFrame df1
df1.rename(columns={'Image': 'image'}, inplace=True)

In [5]:
# Define a function to merge DataFrames vertically (by rows) by default
def merge(df1, df2, axis=0):
    # axis=0 for vertical stacking (along rows), axis=1 for horizontal (along columns)
    return pd.concat([df1, df2], axis=axis)

# Merge df1 and df2 vertically (along rows)
merged_df = merge(df1, df2, axis=0)

# Display information about the merged DataFrame
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21552 entries, 0 to 21485
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            21552 non-null  object
 1   url             21552 non-null  object
 2   Classification  21552 non-null  object
 3   tags            21552 non-null  object
 4   ratings         21552 non-null  object
 5   no_of_ratings   21552 non-null  object
 6   image           21552 non-null  object
dtypes: object(7)
memory usage: 1.3+ MB


In [6]:
# Import the CountVectorizer class from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer instance with specific settings
cv = CountVectorizer(max_features=5000, stop_words='english')

In [7]:
# Transform the 'tags' column of merged_df into a feature vector
vector = cv.fit_transform(merged_df['tags']).toarray()

In [8]:
len(cv.get_feature_names_out())

5000

In [9]:
# Import the nltk library
import nltk

# Import the PorterStemmer from nltk
from nltk.stem.porter import PorterStemmer

# Create a PorterStemmer instance
ps = PorterStemmer()

In [10]:
# Define a text stemming function
def stem(text):
    y = []
    # Split the text into words and apply stemming to each word
    for word in text.split():
        y.append(ps.stem(word))
    # Join the stemmed words into a single string and return it
    return " ".join(y)

In [11]:
merged_df['tags'].apply(stem)

0        walnut great grip and extra breathabl make the...
1        green thi lightweight fli rod deliv outstand p...
2        green these tough men heavyduti suspend are ma...
3        antiqu pine antiqu white flint blue slate blue...
4        black dark flint gray natur for strong lightwe...
                               ...                        
21481    charg energy-004 interlock knit hexagon emboss...
21482    cotton yug sadhana cotton yoga mat (brown)- 8m...
21483    matwallah 100%eva eco friendli yoga mat and ex...
21484    evntra anti-slip yoga mat for gym workout, hom...
21485    panchtatava yoga mat cover with adjust shoulde...
Name: tags, Length: 21552, dtype: object

In [12]:
# Import the cosine_similarity function from scikit-learn
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between feature vectors and store it in 'similarity'
similarity = cosine_similarity(vector)

# Display the similarity matrix
similarity

array([[1.        , 0.08829642, 0.05443557, ..., 0.01437993, 0.        ,
        0.        ],
       [0.08829642, 1.        , 0.08569243, ..., 0.01320484, 0.01533028,
        0.01686698],
       [0.05443557, 0.08569243, 1.        , ..., 0.01744481, 0.02025271,
        0.06684848],
       ...,
       [0.01437993, 0.01320484, 0.01744481, ..., 1.        , 0.56175441,
        0.7004728 ],
       [0.        , 0.01533028, 0.02025271, ..., 0.56175441, 1.        ,
        0.62187434],
       [0.        , 0.01686698, 0.06684848, ..., 0.7004728 , 0.62187434,
        1.        ]])

In [13]:
similarity.shape

(21552, 21552)

In [14]:
# Find the index of the product in 'merged_df'
product_index = merged_df[merged_df['name'] == 'USHA RapidMix 500-Watt Copper Motor Mixer Grinder with 3 Jars and 5 Years Warranty(Sea Green/White)'].index[0]

# Calculate the cosine similarity distances for the specified product
distances = similarity[product_index]

# Sort and retrieve the top 5 similar products as a list of tuples
product_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

# Display the list of similar products
product_list

# Get the name of the product at index 44 in 'merged_df'
merged_df.get('name')[44]

44    Organic Flannel Comforter Cover Collection, St...
44    Room Air Purifier and Humidifier Aroma Diffuse...
Name: name, dtype: object

In [15]:
# Define a function to recommend similar products based on a given product name
def recommend(product):
    # Find the index of the given product in 'merged_df'
    product_index = merged_df[merged_df['name'] == product].index[0]

    # Calculate the cosine similarity distances for the specified product
    distances = similarity[product_index]

    # Sort and retrieve the top 5 similar products as a list of tuples
    product_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    # Print the names of the recommended products
    for i in product_list:
        print(merged_df.get('name')[i[0]])


In [16]:
recommend('USHA RapidMix 500-Watt Copper Motor Mixer Grinder with 3 Jars and 5 Years Warranty(Sea Green/White)')

44    Organic Flannel Comforter Cover Collection, St...
44    Room Air Purifier and Humidifier Aroma Diffuse...
Name: name, dtype: object
Kalt Men's Half Sleeves Mandarin Collar Cotton Blend T-Shirt (Dark Grey, White, Green)
B&W Organic- Premium Cotton Polo T-Shirt - Red
24                L.L.Bean Organic Flannel Tunic, Plaid
24    Stylista Fully Automatic Top Load Washing Mach...
Name: name, dtype: object
8       Lakewashed® Organic Cotton Oxford Shirt, Plaid
8    Black + Decker BD BXIR2201IN 2200-Watt Cord & ...
Name: name, dtype: object


In [17]:
merged_df.shape

(21552, 7)

In [18]:
# Import the pickle module
import pickle

# Serialize and save the merged DataFrame as a dictionary to a binary file
pickle.dump(merged_df.to_dict(), open('products.pkl', 'wb'))

# Serialize and save the similarity matrix to a binary file
pickle.dump(similarity, open('similarity.pkl', 'wb'))
