In [9]:
import os
import tarfile
import tempfile

import pandas as pd
import requests

def get_imdb_dataframe():
    """
    Download the IMDB review archive and return its CSV as a pandas DataFrame.

    The IMDB dataset contains 50,000 movie reviews for natural language
    processing or text analytics. Each row holds the review text and a
    binary sentiment label, so the task is to classify a review as
    positive or negative.

    The archive is downloaded into a temporary directory that is removed
    when the function returns or raises, so nothing is left in the current
    working directory. Only the expected CSV member is read from the
    archive; no other member is written to disk.

    Returns:
        pd.DataFrame: DataFrame containing the IMDB dataset.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        FileNotFoundError: If the archive does not contain the expected CSV.
    """
    # URL of the dataset
    url = 'https://github.com/pruhlo/data_ML/raw/master/IMDB_Dataset.tar.xz'
    archive_name = 'IMDB_Dataset.tar.xz'
    # Assuming the CSV file is named 'IMDB Dataset.csv' inside the tar.xz archive
    csv_name = 'IMDB Dataset.csv'

    with tempfile.TemporaryDirectory() as work_dir:
        archive_path = os.path.join(work_dir, archive_name)

        # Stream the download to disk in chunks instead of buffering the
        # whole body in memory, and fail fast on an HTTP error so an error
        # page is never mistaken for the archive.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(archive_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Read the CSV member straight from the archive. Avoiding
        # extractall() means a crafted member path cannot write outside
        # the intended directory.
        with tarfile.open(archive_path, 'r:xz') as tar:
            for member in tar:
                if member.isfile() and os.path.normpath(member.name) == csv_name:
                    with tar.extractfile(member) as csv_file:
                        return pd.read_csv(csv_file)

    raise FileNotFoundError(
        f"'{csv_name}' was not found inside the downloaded archive '{archive_name}'"
    )

# Usage example: download the dataset and keep it in `df` for the cells below
df = get_imdb_dataframe()

In [10]:
# Display the loaded DataFrame (review text and sentiment label per row)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

def classify_reviews(df, text_column='review', label_column='sentiment',
                     test_size=0.3, random_state=42):
    """
    Train and evaluate two Multinomial Naive Bayes text classifiers.

    The data is split once into training and test sets. A bag-of-words
    (CountVectorizer) pipeline and a TF-IDF (TfidfVectorizer) pipeline are
    then fitted on the same training split, and each one's accuracy and
    confusion matrix on the test split are printed.

    Args:
        df (pd.DataFrame): Data containing the text and label columns.
        text_column (str): Name of the column holding the review text.
        label_column (str): Name of the column holding the sentiment label.
        test_size (float): Fraction of rows held out for testing.
        random_state (int): Seed passed to train_test_split for a
            reproducible split.

    Returns:
        None. Results are printed.
    """
    # Split data into training and testing sets
    X = df[text_column]
    y = df[label_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Both models share the classifier and differ only in the vectorizer.
    models = [
        ("Bag of Words Model", Pipeline([
            ('vect', CountVectorizer()),  # Convert text to a matrix of token counts
            ('clf', MultinomialNB()),     # Multinomial Naive Bayes classifier
        ])),
        ("TF-IDF Model", Pipeline([
            ('vect', TfidfVectorizer()),  # Convert text to a matrix of TF-IDF features
            ('clf', MultinomialNB()),     # Multinomial Naive Bayes classifier
        ])),
    ]

    # Train and evaluate each model with identical steps
    for position, (title, pipeline) in enumerate(models):
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, predictions)
        conf_matrix = metrics.confusion_matrix(y_test, predictions)

        # Separate consecutive reports with a blank line
        print(title if position == 0 else f"\n{title}")
        print(f"Accuracy: {accuracy:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)

# Reuse the `df` loaded earlier in the notebook instead of downloading
# the dataset a second time.
classify_reviews(df)


Bag of Words Model
Accuracy: 0.8485
Confusion Matrix:
[[6522  889]
 [1383 6206]]

TF-IDF Model
Accuracy: 0.8609
Confusion Matrix:
[[6620  791]
 [1296 6293]]


In [21]:
# One-time setup: download the spaCy model that spacy.load('en_core_web_sm') needs
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 222.6 kB/s eta 0:00:58
     --------------------------------------- 0.0/12.8 MB 281.8 kB/s eta 0:00:46
     --------------------------------------- 0.1/12.8 MB 479.1 kB/s eta 0:00:27
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.6/12.8 MB 2.2 MB/s eta 0:00:06
     -- ------------------------------------- 0.7/12.8 MB 2.4 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/12.8 MB 2.8 MB/s eta 0:00:05
     ----- ---------------------------------- 

In [25]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# The 'processed_review' column is built after preprocess_text is defined
# below; applying it here would raise NameError on a fresh kernel.

def preprocess_text(text):
    """
    Lemmatize a text with spaCy, dropping stop words and punctuation.

    The text is run through the module-level `nlp` pipeline, and the lemmas
    of the remaining tokens are joined with single spaces.
    """
    # NOTE(review): HTML markup in the raw reviews (e.g. `<br />`) is not
    # stripped here and appears to survive into the result; confirm whether
    # that is intended.
    kept_lemmas = []
    for token in nlp(text):
        # Skip stop words and punctuation
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

# Add a 'processed_review' column holding the lemmatized form of each review.
# NOTE(review): this calls the spaCy pipeline once per row via .apply, which is
# presumably slow on the full dataset; confirm the runtime is acceptable.
df['processed_review'] = df['review'].apply(preprocess_text)

def classify_reviews(df, text_column='processed_review', label_column='sentiment',
                     test_size=0.3, random_state=42):
    """
    Train and evaluate two Multinomial Naive Bayes text classifiers.

    The data is split once into training and test sets. A bag-of-words
    (CountVectorizer) pipeline and a TF-IDF (TfidfVectorizer) pipeline are
    then fitted on the same training split, and each one's accuracy and
    confusion matrix on the test split are printed.

    Args:
        df (pd.DataFrame): Data containing the text and label columns.
        text_column (str): Name of the column holding the text to classify.
            It defaults to the preprocessed reviews.
        label_column (str): Name of the column holding the sentiment label.
        test_size (float): Fraction of rows held out for testing.
        random_state (int): Seed passed to train_test_split for a
            reproducible split.

    Returns:
        None. Results are printed.
    """
    # Split data into training and testing sets
    X = df[text_column]
    y = df[label_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Both models share the classifier and differ only in the vectorizer.
    models = [
        ("Bag of Words Model", Pipeline([
            ('vect', CountVectorizer()),  # Convert text to a matrix of token counts
            ('clf', MultinomialNB()),     # Multinomial Naive Bayes classifier
        ])),
        ("TF-IDF Model", Pipeline([
            ('vect', TfidfVectorizer()),  # Convert text to a matrix of TF-IDF features
            ('clf', MultinomialNB()),     # Multinomial Naive Bayes classifier
        ])),
    ]

    # Train and evaluate each model with identical steps
    for position, (title, pipeline) in enumerate(models):
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, predictions)
        conf_matrix = metrics.confusion_matrix(y_test, predictions)

        # Separate consecutive reports with a blank line
        print(title if position == 0 else f"\n{title}")
        print(f"Accuracy: {accuracy:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)

# Evaluate both models on the spaCy-preprocessed reviews
classify_reviews(df)

Bag of Words Model
Accuracy: 0.8527
Confusion Matrix:
[[6460  951]
 [1259 6330]]

TF-IDF Model
Accuracy: 0.8593
Confusion Matrix:
[[6500  911]
 [1199 6390]]


In [27]:
# Display the DataFrame, which now includes the 'processed_review' column
df

Unnamed: 0,review,sentiment,processed_review
0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch 1 oz episode hook right...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production < br /><br />the f...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei love Time money visually stunnin...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,think movie right good job creative original e...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad act idiotic directin...
49997,I am a Catholic taught in parochial elementary...,negative,Catholic teach parochial elementary school nun...
49998,I'm going to have to disagree with the previou...,negative,go disagree previous comment Maltin second rat...
