In [1]:
import requests
import tarfile
import pandas as pd
import os

def get_imdb_dataframe():
    """
    Downloads and extracts the IMDB dataset from a given URL and returns it as a pandas DataFrame.

    The IMDB dataset contains 50,000 movie reviews for natural language processing or text analytics. 
    This dataset is used for binary sentiment classification and includes 25,000 highly polar movie reviews 
    for training and 25,000 for testing. The goal is to predict the number of positive and negative reviews 
    using classification or deep learning algorithms.

    Returns:
        pd.DataFrame: DataFrame containing the IMDB dataset.
    """
    # URL of the dataset
    url = 'https://github.com/pruhlo/data_ML/raw/master/IMDB_Dataset.tar.xz'
    
    # Download the tar.xz file
    response = requests.get(url, stream=True)
    tar_xz_path = 'IMDB_Dataset.tar.xz'
    
    with open(tar_xz_path, 'wb') as file:
        file.write(response.content)
    
    # Extract the tar.xz file
    with tarfile.open(tar_xz_path, 'r:xz') as tar:
        tar.extractall()

    # Assuming the CSV file is named 'IMDB Dataset.csv' inside the tar.xz archive
    csv_file_path = 'IMDB Dataset.csv'
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)
    
    # Clean up the downloaded and extracted files
    os.remove(tar_xz_path)
    os.remove(csv_file_path)
    
    return df

# Usage example
df = get_imdb_dataframe()

  from pandas.core import (


In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
def preprocess_text(text):
    """
    Preprocess the text using spaCy for tokenization and lemmatization.
    """
    doc = nlp(text)
    # Remove stop words and punctuation, and lemmatize the text 
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])


In [9]:
import spacy
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

df['processed_review'] = df['review'].apply(preprocess_text)

In [15]:
import re

def remove_html_tags(text: str) -> str:
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

df['edited_review'] = df['processed_review'].apply(remove_html_tags)

In [19]:
df

Unnamed: 0,review,sentiment,processed_review,edited_review
0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch 1 oz episode hook right...,reviewer mention watch 1 oz episode hook right...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production < br /><br />the f...,wonderful little production the film technique...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically family little boy Jake think zombie ...,basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei love Time money visually stunnin...,Petter Mattei love Time money visually stunnin...
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,think movie right good job creative original e...,think movie right good job creative original e...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad act idiotic directin...,bad plot bad dialogue bad act idiotic directin...
49997,I am a Catholic taught in parochial elementary...,negative,Catholic teach parochial elementary school nun...,Catholic teach parochial elementary school nun...
49998,I'm going to have to disagree with the previou...,negative,go disagree previous comment Maltin second rat...,go disagree previous comment Maltin second rat...


In [17]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin

def classify_reviews(df):
    # Apply spaCy preprocessing through the custom transformer in the pipeline
        # Split data into training and testing sets
    X = df['edited_review']  # Original reviews without manual preprocessing
    y = df['sentiment']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Bag of Words (BOW) model
    bow_pipeline = Pipeline([
        ('vect', CountVectorizer()),  # Convert text to a matrix of token counts
        ('clf', MultinomialNB())      # Multinomial Naive Bayes classifier
    ])

    # TF-IDF model
    tfidf_pipeline = Pipeline([
        ('vect', TfidfVectorizer()),  # Convert text to a matrix of TF-IDF features
        ('clf', MultinomialNB())      # Multinomial Naive Bayes classifier
    ])

    # Train and evaluate Bag of Words model
    bow_pipeline.fit(X_train, y_train)
    bow_predictions = bow_pipeline.predict(X_test)
    bow_accuracy = metrics.accuracy_score(y_test, bow_predictions)
    bow_conf_matrix = metrics.confusion_matrix(y_test, bow_predictions)
    print("Bag of Words Model")
    print(f"Accuracy: {bow_accuracy:.4f}")
    print("Confusion Matrix:")
    print(bow_conf_matrix)

    # Train and evaluate TF-IDF model
    tfidf_pipeline.fit(X_train, y_train)
    tfidf_predictions = tfidf_pipeline.predict(X_test)
    tfidf_accuracy = metrics.accuracy_score(y_test, tfidf_predictions)
    tfidf_conf_matrix = metrics.confusion_matrix(y_test, tfidf_predictions)
    print("\nTF-IDF Model")
    print(f"Accuracy: {tfidf_accuracy:.4f}")
    print("Confusion Matrix:")
    print(tfidf_conf_matrix)



classify_reviews(df)

# HW13:
""""Bag of Words Model
Accuracy: 0.8485
Confusion Matrix:
[[6522  889]
 [1383 6206]]

TF-IDF Model
Accuracy: 0.8609
Confusion Matrix:
[[6620  791]
 [1296 6293]]"""

Bag of Words Model
Accuracy: 0.8531
Confusion Matrix:
[[6453  958]
 [1245 6344]]

TF-IDF Model
Accuracy: 0.8596
Confusion Matrix:
[[6499  912]
 [1194 6395]]


'"Bag of Words Model\nAccuracy: 0.8485\nConfusion Matrix:\n[[6522  889]\n [1383 6206]]\n\nTF-IDF Model\nAccuracy: 0.8609\nConfusion Matrix:\n[[6620  791]\n [1296 6293]]'

In [21]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

def classify_reviews(df):
    # Split data into training and testing sets
    X = df['edited_review']  # Original reviews without manual preprocessing
    y = df['sentiment']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Bag of Words (BOW) model with Logistic Regression
    bow_pipeline = Pipeline([
        ('vect', CountVectorizer()),  # Convert text to a matrix of token counts
        ('clf', LogisticRegression(max_iter=1000, random_state=42))  # Logistic Regression classifier
    ])

    # TF-IDF model with Logistic Regression
    tfidf_pipeline = Pipeline([
        ('vect', TfidfVectorizer()),  # Convert text to a matrix of TF-IDF features
        ('clf', LogisticRegression(max_iter=1000, random_state=42))  # Logistic Regression classifier
    ])

    # Train and evaluate Bag of Words model
    bow_pipeline.fit(X_train, y_train)
    bow_predictions = bow_pipeline.predict(X_test)
    bow_accuracy = metrics.accuracy_score(y_test, bow_predictions)
    bow_conf_matrix = metrics.confusion_matrix(y_test, bow_predictions)
    print("Bag of Words Model with Logistic Regression")
    print(f"Accuracy: {bow_accuracy:.4f}")
    print("Confusion Matrix:")
    print(bow_conf_matrix)

    # Train and evaluate TF-IDF model
    tfidf_pipeline.fit(X_train, y_train)
    tfidf_predictions = tfidf_pipeline.predict(X_test)
    tfidf_accuracy = metrics.accuracy_score(y_test, tfidf_predictions)
    tfidf_conf_matrix = metrics.confusion_matrix(y_test, tfidf_predictions)
    print("\nTF-IDF Model with Logistic Regression")
    print(f"Accuracy: {tfidf_accuracy:.4f}")
    print("Confusion Matrix:")
    print(tfidf_conf_matrix)

# Пример вызова функции (предполагается, что df уже определен)
classify_reviews(df)


Bag of Words Model with Logistic Regression
Accuracy: 0.8771
Confusion Matrix:
[[6439  972]
 [ 871 6718]]

TF-IDF Model with Logistic Regression
Accuracy: 0.8905
Confusion Matrix:
[[6490  921]
 [ 722 6867]]
