#### Step 1: Download the IMDB dataset and take a look at it. To keep the notebool lightweight, we only grab a part of the dataset.

In [53]:
from datasets import load_dataset
#load the IMDP dataset
imdb_dataset = load_dataset("imdb")

#take a look at the data structure
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

To ensure the noteook remains small, we can work with a subset of the data for training and testing:

In [54]:
#take a small subset for training and testing
train_data = imdb_dataset['train'].shuffle(seed=42).select(range(1000))
test_data = imdb_dataset['test'].shuffle(seed=42).select(range(500))

#preview the first example
print(train_data[0])                                                        

{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}


#### Step 2: Preprocessing the traina and test data. This step includes tokenization, stemming, lemmatization, and removing stop words. 

In [55]:
# import necessary libraries for NLP processing
import nltk
from nltk.tokenize import TreebankWordTokenizer #for breaking text into words
from nltk.stem import PorterStemmer, WordNetLemmatizer #for stemming and lemmatization
from nltk.corpus import wordnet, stopwords #for accessing word meaning and stopwords
from nltk import pos_tag #for part of speech tagging

#download required resources for tokenization, lemmatization, and stop words removal
nltk.data.path.append('/Users/smirghor/nltk_data')
nltk.download('wordnet') 
nltk.download('averaged_perceptron_tagger_eng') #POS tagger for lemmatization
nltk.download('stopwords') #stop words list

#innitialize the stemmer and lemmatizer
tokenizer = TreebankWordTokenizer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) #define a set of common english stop words


# Function to convert POS tags to a format suitable for WordNet lemmatization
def get_wordnet_pos(tag):
    # Map POS tag to WordNet format for accurate lemmatization
    if tag.startswith('J'):
        return wordnet.ADJ  # Adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # Verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # Noun
    elif tag.startswith('R'):
        return wordnet.ADV  # Adverb
    else:
        return wordnet.NOUN  # Default to noun if unknown


# Function to preprocess a given text 
def preprocess_text(text):
    # 1. tokenize the text into words
    tokens = tokenizer.tokenize(text)
    # 2. apply stemming
    tokens = [stemmer.stem(token) for token in tokens if token.isalpha()]
    #3. apply lemmatization
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag([token])[0][1])) for token in tokens]
    #4. remove stop words
    tokens = [token for token in tokens if token.lower() not in stop_words]
    # join the token back into a single string and return it
    return ' '.join(tokens)

#Example: apply preprocessing to a few samples from training data
preprocessed_train_data = [preprocess_text(review) for review in train_data['text'][:10]]

#display the first two preprocessed reviews to see the results
print(preprocessed_train_data[:2])

['relat fortier profil fact polic seri violent profil look crispi fortier look profil plot quit fortier plot far complic fortier look like prime suspect spot similar main charact weak weirdo clairvoy peopl like compar judg enjoy funni thing peopl write fortier look american hand argu prefer american seri mayb languag spirit think thi seri english way actor realli good act superfici', 'thi movi plot veri true book classic write mark movi start scene hank sing song bunch kid call stub toe moon remind sinatra song high hope fun music great throughout favorit song sung king hank bing crosbi sir saggi overal great famili movi even great date thi movi watch princess play rhonda fleme love thi movi like danni kay court jester definit like thi movi']


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Apply preprocessing to the training data (subset of 1000)
preprocessed_train_data = [preprocess_text(review['text']) for review in train_data]

# Apply preprocessing to the test data (subset of 500)
preprocessed_test_data = [preprocess_text(review['text']) for review in test_data]

#### Step 3: Transforming Text Data into TF-IDF Features (to convert the preprocessed text data into numerical features suitable for modeling)

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

#initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  #limit to 5000 features to keep it manageable

# fit and transform the preprocessed training data
X_train_tfidf = tfidf_vectorizer.fit_transform(preprocessed_train_data) 

# transform the test data 
X_test_tfidf = tfidf_vectorizer.transform(preprocessed_test_data)

#extract the labels (0=negative, 1=positive)
y_train = [review for review in train_data['label'][:1000]]
y_test = [review for review in test_data['label'][:500]]

#display the shape of the TF-IDF feature matrix
print(f"TF-IDF matrix shape(train): {X_train_tfidf.shape}")

TF-IDF matrix shape(train): (1000, 5000)


#### Step 4: Building a logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#initialize the logistic regression model 
logistic_model = 