In [15]:
import os
import re
import zipfile
import pandas as pd
import numpy as np
import kagglehub
import nltk
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
df = pd.read_csv('IMDB Dataset.csv')
print("Veri Setinin İlk 5 Satırı:")
print(df.head())

print("\nVeri Seti Bilgisi:")
df.info()

print("\nDuygu Dağılımı:")
print(df['sentiment'].value_counts())

Veri Setinin İlk 5 Satırı:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Veri Seti Bilgisi:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Duygu Dağılımı:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [17]:
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def clean_text(text):

    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Convert to lowercase
    text = text.lower()

    # 4. Split into words and remove stopwords
    words = text.split()
    words = [w for w in words if w not in stop_words]

    # 5. Join words back into a single string
    return ' '.join(words)

df['cleaned_review'] = df['review'].apply(clean_text)
print(df[['review', 'cleaned_review']].iloc[0].values)

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [18]:
X = df['cleaned_review']
Y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

Data split into 40000 training samples and 10000 testing samples.


In [24]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

# fit the vectorizer on the training data and transform it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# transform the test data using the already-fitted vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [25]:
# initialize and train the model
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_train_tfidf, y_train)

# make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Accuracy: 0.8899

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Confusion Matrix:
[[4330  631]
 [ 470 4569]]


In [26]:
def predict_sentiment(new_review):
  cleaned_review = clean_text(new_review)
  vectorized_review = tfidf_vectorizer.transform([cleaned_review])

  # Predict the sentiment using the trained model
  prediction = model.predict(vectorized_review)

  # Predict the probabilities
  probability = model.predict_proba(vectorized_review)

  print(f"\nReview: '{new_review}'")
  if prediction[0] == 'positive':
        print(f"--> Predicted Sentiment: POSITIVE (Confidence: {probability[0][1]:.2%})")
  else:
        print(f"--> Predicted Sentiment: NEGATIVE (Confidence: {probability[0][0]:.2%})")

In [27]:
# Example
new_review = "This movie was absolutely fantastic! I loved every moment of it."
predict_sentiment(new_review)

new_review = "This movie was terrible and boring."
predict_sentiment(new_review)


Review: 'This movie was absolutely fantastic! I loved every moment of it.'
--> Predicted Sentiment: POSITIVE (Confidence: 95.82%)

Review: 'This movie was terrible and boring.'
--> Predicted Sentiment: NEGATIVE (Confidence: 99.87%)


In [28]:
print("\n--- Testing with new reviews ---")
predict_sentiment("This movie was absolutely brilliant! The storyline was engaging and the actors were perfect.")
predict_sentiment("A complete waste of money and time. The plot was predictable and the acting was horrendous.")
predict_sentiment("It was an okay movie, not great but not bad either.")


--- Testing with new reviews ---

Review: 'This movie was absolutely brilliant! The storyline was engaging and the actors were perfect.'
--> Predicted Sentiment: POSITIVE (Confidence: 95.40%)

Review: 'A complete waste of money and time. The plot was predictable and the acting was horrendous.'
--> Predicted Sentiment: NEGATIVE (Confidence: 99.79%)

Review: 'It was an okay movie, not great but not bad either.'
--> Predicted Sentiment: NEGATIVE (Confidence: 83.97%)
