import re
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
sns.set(font_scale=1.4)

In [None]:
# IPython shell escape: asks pip to install `plotly.express` into the kernel's environment.
# NOTE(review): plotly.express is already imported at the top of the file, so on
# a fresh environment this cell has to run before the import cell can succeed.
!pip install plotly.express

In [None]:
# Load the BigBasket product catalogue from a hard-coded local Windows path and
# discard the CSV's own 'index' column.
# NOTE(review): the absolute path is machine-specific — adjust it before running elsewhere.
data = pd.read_csv(r'C:\python projects\amazonproductrecom\BigBasket Products.csv').drop('index', axis=1)

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# (rows, columns) of the freshly loaded table.
data.shape

In [None]:
# Build a summary table with one row per column of `data`: dtype, null count,
# number of distinct values and a hand-written description.
info = pd.DataFrame(index=data.columns, columns=['Dtype', 'Unique Numbers'])
info['Dtype'] = data.dtypes
info['Null values'] = data.isna().sum()
info['Unique Numbers'] = data.nunique()
# The descriptions are assigned positionally, so this list must contain exactly
# one entry per column of `data`, in column order (nine entries here).
info['Description'] = ['Products bought', 'Products categories', 'Products sub-categories', 'Products brand', 
                       'The price paid', 'The overall price', 'Products type', 'Users rating', 'Description']
info

In [None]:
# How many columns there are of each dtype.
info['Dtype'].value_counts()

In [None]:
# Number of fully duplicated rows.
print('Duplicated Values:', data.duplicated().sum())

In [None]:
# The free-text 'description' column is removed in place and not used afterwards.
data.drop('description', axis=1, inplace=True)

In [None]:
# Remaining null count per column.
data.isna().sum()

In [None]:
# Summary statistics for the 'rating' column only.
data.describe().T.loc['rating']

In [None]:
# Histogram of the 'rating' column before any imputation.
# NOTE(review): matplotlib.pyplot is already imported as plt at the top of the file.
import matplotlib.pyplot as plt
data['rating'].hist()
plt.title('Rating Distribution')
plt.show()

In [None]:
# Fill missing ratings with the column median.
median_value = data['rating'].median()
data['rating'] = data['rating'].fillna(median_value)

In [None]:
# Drop every row that still has a missing value in any column, then print the
# total number of nulls left.
# NOTE(review): dropna keeps the surviving rows' original index labels and no
# reset_index follows, so index labels may no longer equal row positions.
data.dropna(inplace=True)
print('Null Values:', data.isna().sum().sum())

In [None]:
# Preview and size of the cleaned table.
data.head()

In [None]:
data.shape

In [None]:
# Discount as a percentage of the market price.
# NOTE(review): assumes market_price is never 0 — a zero would produce inf/NaN here; confirm.
data['discount'] = (data['market_price'] - data['sale_price']) / data['market_price'] * 100
data.head(10)

In [None]:
# Column count per dtype after adding 'discount'.
data.dtypes.value_counts()

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().T

In [None]:
# Distributions of rating and discount.
data[['rating', 'discount']].hist(bins=15)
plt.show()

In [None]:
# Distributions of the two price columns on their raw scale.
data[['sale_price', 'market_price']].hist(bins=15)
plt.show()

In [None]:
# Same price columns after a log(1 + x) transform.
data[['sale_price', 'market_price']].apply(np.log1p).hist()
plt.show()

In [None]:
def top7(col):
    """Show a Plotly bar chart of the seven most frequent values of ``data[col]``.

    Each bar is the value's share, in percent, of the combined count of those
    seven values (not of the whole column).

    Args:
        col: Name of a column in the module-level ``data`` frame.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # head(7) is always positional, whereas ``[:7]`` may be treated as a label
    # slice when the counted values form a float-valued index.
    counts = data[col].value_counts().head(7)
    shares = counts / counts.sum() * 100
    # Same label clean-up as category_by_target: 'sub_category' -> 'Sub Category'.
    label = col.title().replace('_', ' ')
    fig = px.bar(shares, title='Top 7 Selling ' + label + 's', width=600, height=500)
    # The bars are percentages, so the axis must not be titled 'Counts'.
    fig.update_layout(xaxis_title=label, yaxis_title='Share of Top 7 (%)')
    fig.update_traces(texttemplate='%{y:.1f}%', textposition='outside')
    fig.show()

In [None]:
# Frequency chart (top seven values) for each categorical column in turn.
# NOTE(review): plotly.express is already imported as px at the top of the file;
# the per-cell re-imports below are redundant on a clean top-to-bottom run.
import plotly.express as px
top7('product')

In [None]:
import plotly.express as px
top7('category')

In [None]:
import plotly.express as px
top7('sub_category')

In [None]:
import plotly.express as px
top7('brand')

In [None]:
top7('type')

In [None]:
def category_by_target(category, target):
    """Draw a bar chart of the seven groups with the highest mean ``target``.

    Rows of the module-level ``data`` frame are grouped by ``category``; the
    mean of ``target`` is taken per group and the seven largest means are
    plotted with matplotlib.

    Args:
        category: Column of ``data`` to group by.
        target: Numeric column of ``data`` to average within each group.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Human-readable axis/title labels: 'sub_category' -> 'Sub Category'.
    category_label = category.title().replace('_', ' ')
    target_label = target.title().replace('_', ' ')

    group_means = data.groupby(category)[target].mean()
    leaders = group_means.sort_values(ascending=False).head(7)

    plt.figure(figsize=(10, 6))
    leaders.plot(kind='bar', color='skyblue')
    plt.title('Top 7 Selling {}s by Mean {}'.format(category_label, target_label))
    plt.xlabel(category_label)
    plt.ylabel('Mean {}'.format(target_label))
    plt.xticks(rotation=45, ha='right')
    plt.show()

In [None]:
# Mean sale price: top seven groups for each categorical column.
category_by_target('product', 'sale_price')

In [None]:
category_by_target('category', 'sale_price')

In [None]:
category_by_target('sub_category', 'sale_price')

In [None]:
category_by_target('brand', 'sale_price')

In [None]:
category_by_target('type', 'sale_price')

In [None]:
# Temporary helper column: absolute discount (market price minus sale price).
data['dis_value'] = data['market_price'] - data['sale_price']

In [None]:
# Mean absolute discount: top seven groups for each categorical column.
category_by_target('product', 'dis_value')

In [None]:
category_by_target('category', 'dis_value')

In [None]:
category_by_target('sub_category', 'dis_value')

In [None]:
category_by_target('type', 'dis_value')

In [None]:
category_by_target('brand', 'dis_value')

In [None]:
# The helper column is only needed for the charts above.
data.drop('dis_value', axis=1, inplace=True)

In [None]:
# Rating divided by sale price. Unlike 'dis_value', this column is never
# dropped, so it stays in `data` for everything that follows.
# NOTE(review): assumes sale_price is never 0 — a zero would produce inf here; confirm.
data['popularity_score'] = data['rating'] * (1 / data['sale_price'])

In [None]:
# Mean popularity score: top seven groups for each categorical column.
category_by_target('product', 'popularity_score')

In [None]:
category_by_target('category', 'popularity_score')

In [None]:
category_by_target('sub_category', 'popularity_score')

In [None]:
category_by_target('brand', 'popularity_score')

In [None]:
category_by_target('type', 'popularity_score')

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the numeric columns only.
correlation_matrix = data.select_dtypes(include=[np.number]).corr()

# Annotated heatmap of the correlation matrix, values shown with two decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.xticks(rotation=45, ha='right') 
plt.yticks(rotation=45, ha='right') 
plt.show()


In [None]:
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two categorical series.

    The statistic is derived from the chi-squared test on the contingency table
    of ``x`` and ``y``, with the bias correction applied to both phi-squared
    and the table dimensions.

    Args:
        x: First categorical series.
        y: Second categorical series, aligned with ``x``.

    Returns:
        A float that is 0.0 when no association is measurable. This includes
        the degenerate cases where either variable has fewer than two observed
        levels or the corrected table dimension collapses to zero, which
        would otherwise divide by zero and yield NaN.
    """
    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape
    # A variable with a single (or no) observed level cannot be associated with anything.
    if r < 2 or k < 2:
        return 0.0

    chi2, _, _, _ = stats.chi2_contingency(confusion_matrix)
    n = confusion_matrix.sum().sum()  # r, k >= 2 guarantees n >= 2, so n - 1 > 0
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The corrected dimension reaches zero when a variable has as many levels
    # as there are observations; phi2corr is 0 there as well.
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denominator))

In [None]:
# Split the columns by dtype. Only `categorical_features` is used below;
# `numerical_features` is not referenced again in this file.
numerical_features = data.select_dtypes(include=['float64', 'int64', 'int32'])
categorical_features = data.select_dtypes(include=['object'])

In [None]:
# Despite its name, `num_features` is the number of *categorical* columns.
# The matrix starts as all ones, so the diagonal (self-association) stays 1.
num_features = len(categorical_features.columns)
corr_matrix = pd.DataFrame(np.ones((num_features, num_features)), columns=categorical_features.columns,
                           index=categorical_features.columns)

In [None]:
# Compute Cramér's V once per unordered column pair (upper triangle) and
# mirror it into the lower triangle.
for i in range(num_features):
    for j in range(i + 1, num_features):
        corr_matrix.iloc[i, j] = cramers_v(categorical_features.iloc[:, i], categorical_features.iloc[:, j])
        corr_matrix.iloc[j, i] = corr_matrix.iloc[i, j]

In [None]:
# Annotated heatmap of the categorical association matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Association Matrix')
plt.xticks(rotation=45, ha='right') 
plt.yticks(rotation=45, ha='right') 
plt.show()

In [None]:
# Quick look at the table before building the recommender features.
data.head(10)

In [None]:
data.dtypes.value_counts()

In [None]:
# One space-separated text field per product, combining all five text columns;
# this is the input to the TF-IDF vectoriser.
data['tags'] = data['product'] + ' ' + data['category'] + ' ' + \
               data['sub_category'] + ' ' + data['brand'] + ' ' + data['type']

In [None]:
# Work on a copy so that `data` keeps its text columns (the recommender reads
# data['product'] later).
df = data.copy()

In [None]:
# After this drop, `df` holds only 'tags' plus the remaining non-text columns.
df.drop(['product', 'category', 'sub_category', 'brand', 'type'], axis=1, inplace=True)

In [None]:
df.head(10)

In [None]:
# TF-IDF vectors of the tag strings, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['tags'])
tfidf_matrix.shape

In [None]:
# Standardise every column of `df` except 'tags'.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(['tags'], axis=1))
scaled_features.shape

In [None]:
dense_tfidf_matrix = tfidf_matrix.toarray()
combined_data = np.concatenate((dense_tfidf_matrix, scaled_features), axis=1)
combined_data.shape

In [None]:
cosine_sim = cosine_similarity(combined_data)

In [None]:
cosine_sim.shape

In [None]:
# Map each product name to its row *position* in `data`: `cosine_sim` rows and
# the `.iloc` look-ups in the recommender are positional, whereas `data.index`
# still carries the original labels, which have gaps after the earlier dropna.
# int64 is forced so a look-up yields np.int64 on every platform.
indices = pd.Series(np.arange(len(data), dtype=np.int64), index=data['product'])
# Keep only the first occurrence of a repeated product name so that a look-up
# returns a single integer. (drop_duplicates() compared the values, which are
# all unique, and therefore never removed anything.)
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(name):
    """Return the ten products most similar to ``name``.

    The product is looked up in the module-level ``indices`` series, its row of
    ``cosine_sim`` is ranked by descending similarity, and the names of the ten
    best-scoring other rows are read from ``data['product']``.

    Args:
        name: Product name; must be a key of ``indices``.

    Returns:
        A pandas Series with up to ten product names, most similar first.

    Raises:
        KeyError: If ``name`` is not present in ``indices``.
    """
    match = indices[name]
    # A name that occurs several times yields a Series; use its first entry,
    # as the list-based variants of this function do.
    if isinstance(match, pd.Series):
        match = match.iloc[0]
    idx = int(match)

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried row explicitly instead of assuming it is ranked
    # first: an identical product stored earlier can tie with it and push the
    # query itself into the results.
    product_indices = [row for row, _ in ranked if row != idx][:10]
    return data['product'].iloc[product_indices]


def recommend(name):
    """Print every product recommended for ``name``, one per line."""
    suggestions = get_recommendations(name)
    for product_name in suggestions:
        print(product_name)

In [None]:
# Single-product query: print the recommendations for one product name.
item = 'Foochka'
recommend(item)

In [None]:
def get_recommendations(name):
    """Return up to ten products similar to any of the given products.

    For every queried product the ten most similar other rows of
    ``cosine_sim`` are collected. Queried products themselves are never
    recommended, and a product suggested by several queries is listed once,
    with its highest similarity.

    Args:
        name: List of product names (keys of the module-level ``indices``
            series). A single name given as a plain string is also accepted.

    Returns:
        A pandas Series with up to ten product names from ``data['product']``,
        highest similarity first.

    Raises:
        KeyError: If a name is not present in ``indices``.
    """
    # Iterating a bare string would look up its individual characters.
    if isinstance(name, str):
        name = [name]

    idx = []
    for m in name:
        match = indices[m]
        # A repeated product name yields a Series; take its first entry
        # positionally (``[0]`` would be a label look-up on a name-keyed Series).
        if isinstance(match, pd.Series):
            match = match.iloc[0]
        idx.append(int(match))
    seeds = set(idx)

    best_score = {}
    for i in idx:
        ranked = sorted(enumerate(cosine_sim[i]), key=lambda pair: pair[1], reverse=True)
        # Drop the seed row explicitly rather than assuming it ranks first.
        neighbours = [pair for pair in ranked if pair[0] != i][:10]
        for candidate, score in neighbours:
            if candidate in seeds:
                continue
            # Keep one entry per candidate so no product is recommended twice.
            if candidate not in best_score or score > best_score[candidate]:
                best_score[candidate] = score

    product_indices = sorted(best_score, key=best_score.get, reverse=True)[:10]
    return data['product'].iloc[product_indices]


def recommend(name):
    """Print the recommendations for the products in ``name``, one per line."""
    suggestions = get_recommendations(name)
    for product_name in suggestions:
        print(product_name)

In [None]:
# List-based queries with one, two and three products.
recommend(['Foochka'])

In [None]:
recommend(['Foochka', 'Papad - Sago'])

In [None]:
recommend(['Foochka', 'Papad - Sago', 'Papad - Fryum'])

In [None]:
def get_recommendations(name, rate):
    """Return up to ten products similar to the given products, weighted by rating.

    For every queried product the ten most similar other rows of
    ``cosine_sim`` are collected and each similarity is multiplied by that
    product's rating divided by 5. Queried products themselves are never
    recommended, and a product suggested by several queries is listed once,
    with its highest weighted score.

    Args:
        name: List of product names (keys of the module-level ``indices``
            series). A single name given as a plain string is also accepted,
            in which case ``rate`` is its single rating.
        rate: Ratings out of 5, one per entry of ``name`` and in the same order.

    Returns:
        A pandas Series with up to ten product names from ``data['product']``,
        highest weighted score first.

    Raises:
        ValueError: If ``name`` and ``rate`` differ in length.
        KeyError: If a name is not present in ``indices``.
    """
    # Iterating a bare string would look up its individual characters.
    if isinstance(name, str):
        name, rate = [name], [rate]
    # A shorter `rate` used to fail midway with IndexError and a longer one was
    # silently truncated; reject the mismatch up front instead.
    if len(name) != len(rate):
        raise ValueError('name and rate must have the same length')

    idx = []
    for m in name:
        match = indices[m]
        # A repeated product name yields a Series; take its first entry
        # positionally (``[0]`` would be a label look-up on a name-keyed Series).
        if isinstance(match, pd.Series):
            match = match.iloc[0]
        idx.append(int(match))
    seeds = set(idx)

    best_score = {}
    for i, rating in zip(idx, rate):
        ranked = sorted(enumerate(cosine_sim[i]), key=lambda pair: pair[1], reverse=True)
        # Drop the seed row explicitly rather than assuming it ranks first.
        neighbours = [pair for pair in ranked if pair[0] != i][:10]
        for candidate, similarity in neighbours:
            if candidate in seeds:
                continue
            # rating out of 5
            score = similarity * rating / 5
            # Keep one entry per candidate so no product is recommended twice.
            if candidate not in best_score or score > best_score[candidate]:
                best_score[candidate] = score

    product_indices = sorted(best_score, key=best_score.get, reverse=True)[:10]
    return data['product'].iloc[product_indices]


def recommend(name, rate):
    """Print the rating-weighted recommendations for ``name``, one per line."""
    suggestions = get_recommendations(name, rate)
    for product_name in suggestions:
        print(product_name)

In [None]:
# Two example products for the rating-weighted recommender.
item1 = 'Ashwagandha Tablets'
item2 = 'Foochka'

In [None]:
# Each product on its own, with the same rating (ratings are out of 5).
recommend([item1], [4.5])

In [None]:
recommend([item2], [4.5])

In [None]:
# Both products with equal, maximal ratings.
recommend([item1, item2], [5, 5])

In [None]:
# item1 rated 1, item2 rated 5.
recommend([item1, item2], [1, 5])

In [None]:
# Order swapped: item2 rated 1, item1 rated 5.
recommend([item2, item1], [1, 5])

In [None]:
# Mid-range ratings: item1 rated 2, item2 rated 3.
recommend([item1, item2], [2, 3])

For clarification, see:
- https://github.com/Mehrab-Kalantari/Amazon-Products-Recommender-System/blob/main/Amazon%20Products%20Recommender%20System.ipynb
- https://chatgpt.com/share/67adb031-1b18-8003-b3d7-59de9559208e
               