In [1]:
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/sample30.csv")

In [2]:
import re
# we define a set of common stopwords
stop_words = set([
    "ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once",
    "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for",
    "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is",
    "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until",
    "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were",
    "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above",
    "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before",
    "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves",
    "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now",
    "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself",
    "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my",
    "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"
])


def preprocess_text(text):

    text = text.lower()


    words = re.findall(r'\b[a-z]+\b', text)


    words = [word for word in words if word not in stop_words]

    return ' '.join(words)


data['processed_reviews_text'] = data['reviews_text'].apply(preprocess_text)


data[['reviews_text', 'processed_reviews_text']].head()


Unnamed: 0,reviews_text,processed_reviews_text
0,i love this album. it's very good. more to the...,love album good hip hop side current pop sound...
1,Good flavor. This review was collected as part...,good flavor review collected part promotion
2,Good flavor.,good flavor
3,I read through the reviews on here before look...,read reviews looking buying one couples lubric...
4,My husband bought this gel for us. The gel cau...,husband bought gel us gel caused irritation fe...


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

data = data.dropna(subset=['user_sentiment'])
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_reviews_text'], data['user_sentiment'], test_size=0.2, random_state=42)


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train_tfidf, y_train)

y_pred = lr_classifier.predict(X_test_tfidf)


accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy, classification_rep


(0.924,
 '              precision    recall  f1-score   support\n\n    Negative       0.91      0.33      0.49       653\n    Positive       0.92      1.00      0.96      5347\n\n    accuracy                           0.92      6000\n   macro avg       0.92      0.66      0.72      6000\nweighted avg       0.92      0.92      0.91      6000\n')

In [4]:
from collections import Counter
counter = Counter(data['user_sentiment'])
counter

Counter({'Positive': 26632, 'Negative': 3367})

In [5]:
lr_classifier_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_classifier_balanced.fit(X_train_tfidf, y_train)


y_pred_balanced = lr_classifier_balanced.predict(X_test_tfidf)


accuracy_balanced = accuracy_score(y_test, y_pred_balanced)
classification_rep_balanced = classification_report(y_test, y_pred_balanced)

accuracy_balanced, classification_rep_balanced

(0.9083333333333333,
 '              precision    recall  f1-score   support\n\n    Negative       0.55      0.93      0.69       653\n    Positive       0.99      0.91      0.95      5347\n\n    accuracy                           0.91      6000\n   macro avg       0.77      0.92      0.82      6000\nweighted avg       0.94      0.91      0.92      6000\n')

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1]
}


svm_rbf = SVC(kernel='rbf', class_weight='balanced', max_iter=10000)


grid_search = GridSearchCV(svm_rbf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)


grid_search.fit(X_train_tfidf, y_train)


best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


In [21]:

svm_rbf_classifier = SVC(kernel='rbf', class_weight='balanced', max_iter=10000,C=10,gamma = 0.1)


svm_rbf_classifier.fit(X_train_tfidf, y_train)


y_pred_svm_rbf = svm_rbf_classifier.predict(X_test_tfidf)


accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
classification_rep_svm_rbf = classification_report(y_test, y_pred_svm_rbf)

accuracy_svm_rbf, classification_rep_svm_rbf




(0.939,
 '              precision    recall  f1-score   support\n\n    Negative       0.68      0.84      0.75       653\n    Positive       0.98      0.95      0.97      5347\n\n    accuracy                           0.94      6000\n   macro avg       0.83      0.90      0.86      6000\nweighted avg       0.95      0.94      0.94      6000\n')

In [7]:

interaction_matrix = data.pivot_table(index='reviews_username', columns='id', values='reviews_rating')


interaction_matrix.head()


id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfs0tUilAPnD_xgqN2,AVpfsQoeilAPnD_xgfx5,AVpfshNsLJeJML43CB8q,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpftymALJeJML43CZ6y,AVpfv4TlilAPnD_xhjNS,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,,,,,,,,,,,...,,,,,,,,,,
00sab00,,,,,,,,,,,...,,,,,,,,,,
01impala,,,,,,,,,,,...,,,,,,,,,,
02dakota,,,,,,,,,,,...,,,,,,,,,,
02deuce,,,,,,,,,,,...,,,,,,,,,,


In [8]:
!pip install surprise
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy


interaction_data = interaction_matrix.reset_index().melt(id_vars='reviews_username', var_name='product_id', value_name='rating')

interaction_data.head()



Unnamed: 0,reviews_username,product_id,rating
0,00dog3,AV13O1A8GV-KLJ3akUyj,
1,00sab00,AV13O1A8GV-KLJ3akUyj,
2,01impala,AV13O1A8GV-KLJ3akUyj,
3,02dakota,AV13O1A8GV-KLJ3akUyj,
4,02deuce,AV13O1A8GV-KLJ3akUyj,


In [9]:

interaction_data = interaction_data.dropna()


reader = Reader(rating_scale=(1, 5))
df = Dataset.load_from_df(interaction_data[['reviews_username', 'product_id', 'rating']], reader)

df

<surprise.dataset.DatasetAutoFolds at 0x7e88aaf5c400>

In [10]:

trainset, testset = train_test_split(df, test_size=0.2)


model = SVD()
model.fit(trainset)
predictions = model.test(testset)


rmse = accuracy.rmse(predictions)

rmse

RMSE: 0.7869


0.7869178875417389

In [11]:

data['product_profile'] = data['brand'].astype(str) + ' ' + data['categories'].astype(str) + ' ' + data['manufacturer'].astype(str)


data['product_profile'].head()


0    Universal Music Movies, Music & Books,Music,R&...
1    Lundberg Food,Packaged Foods,Snacks,Crackers,S...
2    Lundberg Food,Packaged Foods,Snacks,Crackers,S...
3    K-Y Personal Care,Medicine Cabinet,Lubricant/S...
4    K-Y Personal Care,Medicine Cabinet,Lubricant/S...
Name: product_profile, dtype: object

In [14]:

from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
product_profiles_matrix = tfidf_vectorizer.fit_transform(data['product_profile'])

def get_similar_products(product_id, top_n=5):

    idx = data[data['id'] == product_id].index[0]


    cosine_similarities = cosine_similarity(product_profiles_matrix[idx], product_profiles_matrix).flatten()


    similar_indices = cosine_similarities.argsort()[-top_n-1:-1][::-1]


    similar_products = [(data['id'].iloc[i], cosine_similarities[i]) for i in similar_indices]

    return similar_products


sample_product_id = data['id'].iloc[0]
similar_products = get_similar_products(sample_product_id)

similar_products


[('AVpfR5m0LJeJML436K3W', 0.2564545230677715),
 ('AVpfR5m0LJeJML436K3W', 0.2564545230677715),
 ('AVpfR5m0LJeJML436K3W', 0.2564545230677715),
 ('AVpfR5m0LJeJML436K3W', 0.2564545230677715),
 ('AVpfR5m0LJeJML436K3W', 0.2564545230677715)]

In [16]:



sentiment_mapping = {'Positive': 1, 'Negative': 0}
data['sentiment_value'] = data['user_sentiment'].map(sentiment_mapping)


product_sentiment_scores = data.groupby('id')['sentiment_value'].mean().sort_values(ascending=False)

product_sentiment_scores.head()


id
AV13O1A8GV-KLJ3akUyj    1.0
AVpe_y-I1cnluZ0-bmvh    1.0
AVpe_N91LJeJML43z4jD    1.0
AVpe_5U_ilAPnD_xSrxG    1.0
AVpfbpzd1cnluZ0-kqJV    1.0
Name: sentiment_value, dtype: float64

In [19]:
def recommend_products_with_sentiment(product_id, top_n=5):

    idx = data[data['id'] == product_id].index[0]


    cosine_similarities = cosine_similarity(product_profiles_matrix[idx], product_profiles_matrix).flatten()


    similar_indices = cosine_similarities.argsort()[-top_n*2-1:-1][::-1]

    similar_products = [(data['id'].iloc[i], product_sentiment_scores[data['id'].iloc[i]]) for i in similar_indices]

    sorted_similar_products = sorted(similar_products, key=lambda x: x[1], reverse=True)[:top_n]

    return sorted_similar_products

sample_product_id = data['id'].iloc[0]
recommended_products = recommend_products_with_sentiment(sample_product_id)

recommended_products



[('AVpfR5m0LJeJML436K3W', 0.8970588235294118),
 ('AVpfR5m0LJeJML436K3W', 0.8970588235294118),
 ('AVpfR5m0LJeJML436K3W', 0.8970588235294118),
 ('AVpfR5m0LJeJML436K3W', 0.8970588235294118),
 ('AVpfR5m0LJeJML436K3W', 0.8970588235294118)]