In [15]:
import pandas as pd
import nltk
import re
from bs4 import BeautifulSoup
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Download the NLTK resources this notebook relies on.
# "punkt_tab" is required by the tokenizer (it provides the model files).
# nltk.download() signals failure through its boolean return value rather than
# an exception, so failures are collected and reported here instead of showing
# up later as a lookup error inside the preprocessing cell.
failed_downloads = []
for package in ("stopwords", "punkt", "punkt_tab", "wordnet"):
  if not nltk.download(package):
    failed_downloads.append(package)

if failed_downloads:
  # Only a warning: the resources may already be installed locally (e.g. offline run).
  print(f"WARNING: NLTK download failed for: {', '.join(failed_downloads)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Load the review dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="duygu_analizi_amazon_veri_seti.csv")

In [17]:
# Text preprocessing / cleaning
lemmatizer = WordNetLemmatizer()

# Built once instead of on every clean_text() call: the stop-word list and the
# regular expressions do not depend on the input text.
_STOP_WORDS = frozenset(stopwords.words('english'))
_URL_RE = re.compile(r"http\S+")      # anything starting with "http" up to the next whitespace
_DIGITS_RE = re.compile(r"\d+")       # runs of digits
_PUNCT_RE = re.compile(r"[^\w\s]")    # every character that is neither a word character nor whitespace


def clean_text(text):
  """Normalize one raw review into a space-separated string of lemmatized tokens.

  Steps, in order: strip HTML markup, lowercase, remove URLs, remove digits,
  remove punctuation/special characters, tokenize, drop English stop words and
  tokens of length <= 2, then lemmatize each remaining token (the lemmatizer is
  called without a part-of-speech argument).

  Args:
    text: The raw review text. Missing values (None, NaN, pd.NA) are accepted.

  Returns:
    The cleaned text as a single string; "" for a missing value.
  """
  # Missing cells would otherwise make BeautifulSoup raise a TypeError.
  if text is None or text is pd.NA or (isinstance(text, float) and pd.isna(text)):
    return ""

  text = BeautifulSoup(text, "html.parser").get_text()  # strip HTML tags
  text = text.lower()                                   # case-fold
  text = _URL_RE.sub("", text)                          # remove URLs
  text = _DIGITS_RE.sub("", text)                       # remove digits
  text = _PUNCT_RE.sub("", text)                        # remove special characters

  tokens = word_tokenize(text)  # split into word tokens

  # NOTE(review): punctuation is stripped before this filter, so any stop-word
  # entry that contains an apostrophe can never match a token — confirm this is intended.
  filtered = [t for t in tokens if t not in _STOP_WORDS and len(t) > 2]

  return " ".join(lemmatizer.lemmatize(t) for t in filtered)


df["clean"] = df["reviewText"].apply(clean_text)


In [18]:
# Print the column summary (dtypes, non-null counts), then preview the first five rows.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
 2   clean       20000 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.9+ KB


Unnamed: 0,reviewText,Positive,clean
0,This is a one of the best apps acording to a b...,1,one best apps acording bunch people agree bomb...
1,This is a pretty good version of the game for ...,1,pretty good version game free lot different le...
2,this is a really cool game. there are a bunch ...,1,really cool game bunch level find golden egg s...
3,"This is a silly game and can be frustrating, b...",1,silly game frustrating lot fun definitely reco...
4,This is a terrific game on any pad. Hrs of fun...,1,terrific game pad hr fun grandkids love great ...


In [19]:
# Train/test split: the cleaned text is the feature, "Positive" holds the true labels.
X, y = df["clean"], df["Positive"]

# 80% train / 20% test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [20]:
# TF-IDF vectorization, capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the training texts only; the test texts are transformed with the fitted vectorizer.
X_train_tf = tfidf.fit_transform(raw_documents=X_train)
X_test_tf = tfidf.transform(raw_documents=X_test)

In [29]:
# Decision tree: model training.
# random_state is pinned to the same seed used by the train/test split and the
# random forest cell, so the metrics printed below are reproducible on re-run.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train_tf, y_train)

# Predict labels for the held-out test set
y_pred = model.predict(X_test_tf)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

# Evaluation: confusion matrix followed by the per-class report
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.8115
Confusion Matrix:
[[ 604  343]
 [ 411 2642]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.64      0.62       947
           1       0.89      0.87      0.88      3053

    accuracy                           0.81      4000
   macro avg       0.74      0.75      0.75      4000
weighted avg       0.82      0.81      0.81      4000



In [28]:
# Random forest: build a 200-tree, seeded ensemble and fit it on the TF-IDF training matrix.
model = RandomForestClassifier(n_estimators=200, random_state=1).fit(X_train_tf, y_train)

# Predict labels for the held-out test set
y_pred = model.predict(X_test_tf)

# Overall accuracy
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

# Evaluation: confusion matrix followed by the per-class report
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")

print("\nClassification Report:", classification_report(y_true=y_test, y_pred=y_pred), sep="\n")

Accuracy: 0.8765
Confusion Matrix:
[[ 568  379]
 [ 115 2938]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.60      0.70       947
           1       0.89      0.96      0.92      3053

    accuracy                           0.88      4000
   macro avg       0.86      0.78      0.81      4000
weighted avg       0.87      0.88      0.87      4000

