In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
import string
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Simay\AppData\Roaming\nltk_data...


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

In [66]:
df = pd.read_csv('fashionnova_reviews.csv')
df = df.iloc[:25000,1:]

In [67]:
df.head()

Unnamed: 0,Profile Link,Country,Review Count,Review Date,Rating,Review Title,Review Text,Date of Experience
0,/users/66c78240087b6269ffbcb5fb,US,1 review,2024-08-22T20:24:02.000Z,Rated 5 out of 5 stars,I love ordering from fashion nova,I love ordering from fashion nova. The clothes...,"August 22, 2024"
1,/users/6618fdb53d4198001210cbe7,VG,3 reviews,2024-08-21T05:43:11.000Z,Rated 5 out of 5 stars,Top tier content for fashion nova,Always amazing clothes and the fast shipping i...,"August 18, 2024"
2,/users/64e9595206be1a001244ff73,US,3 reviews,2024-08-21T17:09:14.000Z,Rated 5 out of 5 stars,Prices and quality of products are…,Prices and quality of products are GREAT Would...,"August 21, 2024"
3,/users/66c58ad1c6ab36352a08f57a,US,1 review,2024-08-21T08:36:03.000Z,Rated 5 out of 5 stars,Great customer service,Great customer service. I was helped until the...,"August 20, 2024"
4,/users/60ad4b6ef3788e001adbb8e3,US,5 reviews,2024-08-22T00:46:16.000Z,Rated 3 out of 5 stars,False advertising,Disappointing experience. You don’t live up to...,"August 21, 2024"


In [69]:
stopwords_list = stopwords.words('english')

df = df.dropna()

df['review_cleaned'] = df['Review Text'].str.replace('[^a-zA-Z0-9 ]', '', regex=True)  # regex=True, eski versiyonlarda gerekebilir
df['review_cleaned'] = df['review_cleaned'].str.lower()
df['review_cleaned'] = df['review_cleaned'].str.split(' ')
df['review_cleaned'] = df['review_cleaned'].apply(lambda x: [item for item in x if item not in stopwords_list])

In [70]:
df['review_cleaned'].head()

0    [love, ordering, fashion, nova, clothes, good,...
1    [always, amazing, clothes, fast, shipping, rea...
2    [prices, quality, products, great, would, love...
3    [great, customer, service, helped, fashion, no...
4    [disappointing, experience, dont, live, advert...
Name: review_cleaned, dtype: object

In [71]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

df['stemmed'] = df['review_cleaned'].apply(lambda x: [stemmer.stem(y) for y in x])
df.stemmed = df.stemmed.apply(' '.join)

In [78]:
import re

# Rating verilerini sayısal formata dönüştürme fonksiyonu
def extract_star_rating(rating_str):
    match = re.search(r'Rated (\d) out of 5 stars', rating_str)
    if match:
        return int(match.group(1))
    else:
        return None

# Rating sütununu dönüştürme
df['Star Count'] = df['Rating'].apply(extract_star_rating)

# Boş değerleri kaldırma
df = df.dropna(subset=['Star Count'])

# İsterseniz Rating sütununu kaldırabilir ve yeni Star Count sütununu kullanabilirsiniz
df = df.drop(columns=['Rating'])

# Star Count sütununun veri tipini kontrol etme
print(df['Star Count'].dtype)
print(df['Star Count'].head(10))

int64
0    5
1    5
2    5
3    5
4    3
5    5
6    5
7    5
8    5
9    3
Name: Star Count, dtype: int64


In [80]:
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(df.stemmed).toarray()
y = df['Star Count']

In [82]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [81]:
y

0        5
1        5
2        5
3        5
4        3
        ..
24995    5
24996    5
24997    3
24998    5
24999    5
Name: Star Count, Length: 24991, dtype: int64

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [84]:
#Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)

In [85]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.745635305528613
[[ 467   24   93   12  108]
 [ 122   15   65   12   60]
 [ 140   23  166   73  169]
 [  61    6   59  132  684]
 [ 101   13   71  202 5370]]
              precision    recall  f1-score   support

           1       0.52      0.66      0.59       704
           2       0.19      0.05      0.08       274
           3       0.37      0.29      0.32       571
           4       0.31      0.14      0.19       942
           5       0.84      0.93      0.88      5757

    accuracy                           0.75      8248
   macro avg       0.44      0.42      0.41      8248
weighted avg       0.70      0.75      0.71      8248



In [86]:
#Logistik Regression
lgr = LogisticRegression()
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)

In [87]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7547284190106692
[[ 401   50   73   18  162]
 [  94   25   58   10   87]
 [ 119   35  142   57  218]
 [  32   14   43   92  761]
 [  25    8   45  114 5565]]
              precision    recall  f1-score   support

           1       0.60      0.57      0.58       704
           2       0.19      0.09      0.12       274
           3       0.39      0.25      0.30       571
           4       0.32      0.10      0.15       942
           5       0.82      0.97      0.89      5757

    accuracy                           0.75      8248
   macro avg       0.46      0.39      0.41      8248
weighted avg       0.69      0.75      0.71      8248



Yüksek precision, modelin düşük yanlış pozitif oranına sahip olduğunu gösterir. <br><br>
Recall (Duyarlılık): Gerçek pozitiflerin ne kadarının doğru tahmin edildiğini gösterir. Yüksek recall, modelin düşük yanlış negatif oranına sahip olduğunu gösterir. <br><br>
F1-Score (F1-Skoru): Precision ve recall'in harmonik ortalamasıdır. Dengeli bir performans ölçüsüdür ve her iki metriği de dikkate alır. <br><br>
Support (Destek): Her sınıftaki gerçek örnek sayısını belirtir.

In [88]:
#SGD Classifier
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

In [89]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7546071774975752
[[ 411   40   65    9  179]
 [ 105   18   49    9   93]
 [ 131   31  134   28  247]
 [  41   12   55   52  782]
 [  35   11   41   61 5609]]
              precision    recall  f1-score   support

           1       0.57      0.58      0.58       704
           2       0.16      0.07      0.09       274
           3       0.39      0.23      0.29       571
           4       0.33      0.06      0.09       942
           5       0.81      0.97      0.89      5757

    accuracy                           0.75      8248
   macro avg       0.45      0.38      0.39      8248
weighted avg       0.68      0.75      0.70      8248



En yüksek accuracy değeri Logistic Regression’a ait.
