In [43]:
import kaggle
import pandas as pd
import os
import spacy
from spacy.lang.en import English
from collections import Counter

In [112]:
if not os.path.exists(r"C:\Users\seifa\Desktop\Python\Beginner_natural_language_processing\iMDB-50k\IMDB Dataset.csv"):
    os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser('~/.kaggle')
    kaggle.api.dataset_download_files('lakshmi25npathi/imdb-dataset-of-50k-movie-reviews', path='iMDB-50k', unzip=True)
    print("Dataset downloaded and unzipped to: iMDB-50k/")

In [113]:
df = pd.read_csv(r"C:\Users\seifa\Desktop\Python\Beginner_natural_language_processing\iMDB-50k\IMDB Dataset.csv")

In [114]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [44]:
# Initialize blank spaCy English language model for tokenization
nlp_for_tokenisation = spacy.blank("en")
tokenizer = nlp_for_tokenisation.tokenizer

# Initialize spaCy English language model for lemmatization
nlp_for_lemmatization = spacy.load("en_core_web_sm",
     disable = [
        "parser",        # dependency parsing
        "ner",           # named entity recognition
        "sentencizer",   # sentence boundaries
        "textcat",       # pretrained text classifier
        "entity_ruler"   # rule-based entities
    ])

In [46]:
counter = Counter()

html_patterns = ['<br', '<div', '<p', '</', '/>', 'br']
punctuation = {',', '.', '"', "'", '-', ':', ';', '(', ')', '[', ']', '{', '}', '...', '--', '—', '“', '”', '‘', '’', '<', '>', '!', '?', '@', '#', '$', '%', '^', '&', '*', '+', '=', '\\', '|', '~', '`', '–', '•', '…', '‹', '›', '..', '....'}
for review in df['review']:   
    tokens = [token.text.lower() for token in tokenizer(review)
              if token.text.lower() not in nlp_for_tokenisation.Defaults.stop_words and token.text not in punctuation
                and not any(pattern in token.text for pattern in html_patterns)
              ]
    counter.update(tokens)

In [117]:
# Most common tokens
most_common = counter.most_common(20)
print(most_common)

[('movie', 86007), ('film', 78000), ('like', 39833), ('good', 29262), ('time', 24338), ('story', 22601), ('bad', 18078), ('people', 17904), ('great', 17883), ('way', 15248), ('movies', 14977), ('think', 14240), ('characters', 14179), ('character', 13691), ('watch', 13552), ('films', 13474), ('seen', 13146), ('love', 12840), ('plot', 12708), ('acting', 12631)]


In [118]:
df['token'] = df['review'].apply(
    lambda review: [
        token.text.lower() for token in tokenizer(review)
        if token.text.lower() not in nlp_for_tokenisation.Defaults.stop_words
        and token.text not in punctuation
        and not any(pattern in token.text for pattern in html_patterns)
    ]
)

df['lemma'] = df['review'].apply(
    lambda review: [
        token.lemma_.lower() for token in nlp_for_lemmatization(review)
        if token.text.lower() not in nlp_for_tokenisation.Defaults.stop_words
        and token.text not in punctuation
        and not any(pattern in token.text for pattern in html_patterns)
    ]
)

In [119]:
print(len(df['review'][1].split()))
print(len(df['token'][1]))

162
76


In [120]:
print(f"original review : {tokenizer(df['review'][1]).text}") 
print('VS')
print(f"tokenized review : {df['token'][1]}") 
print('VS')
print(f"lemmetized review : {df['lemma'][1]}")


original review : A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are ter

In [121]:
df.head()   

Unnamed: 0,review,sentiment,token,lemma
0,One of the other reviewers has mentioned that ...,positive,"[reviewers, mentioned, watching, 1, oz, episod...","[reviewer, mention, watch, 1, oz, episode, hoo..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[think, wonderful, way, spend, time, hot, summ..."
3,Basically there's a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visually, ..."


In [35]:
df.to_csv(r"C:\Users\seifa\Desktop\Python\Beginner_natural_language_processing\iMDB-50k\preprocessed_imdb_50k.csv", index=False)

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import ast
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df=pd.read_csv(r"C:\Users\seifa\Desktop\Python\Beginner_natural_language_processing\iMDB-50k\preprocessed_imdb_50k.csv")

In [4]:
print(f"type of the 'lemma' column before conversion: {type(df['lemma'].iloc[0])}")

# Convert stringified lists back into actual lists
df['lemma'] = df['lemma'].apply(ast.literal_eval)

print(f"type of the 'lemma' column after conversion: {type(df['lemma'].iloc[0])}")

# TfidfVectorizer expects strings, not lists of tokens.
df['clean_text'] = df['lemma'].apply(lambda tokens: " ".join(tokens))

type of the 'lemma' column before conversion: <class 'str'>
type of the 'lemma' column after conversion: <class 'list'>


In [5]:
def data_split(df):
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'],  
        df['sentiment'].map({'positive': 1, 'negative': 0}),
        test_size=0.2,
        random_state=42
    )
    return X_train, X_test, y_train, y_test

In [6]:
def tfidf_vectorization(X_train, X_test, max_features):
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = vectorizer.fit_transform(X_train)  # fit only on train
    X_test_tfidf = vectorizer.transform(X_test)        # transform test
    return X_train_tfidf, X_test_tfidf, vectorizer

In [8]:
features = [2000, 5000, 10000, 50000] 
model = ['LogisticRegression', 'LinearSVC' ]
results = {}

def train(df):
    X_train, X_test, y_train, y_test = data_split(df)

    for feature in features:
        X_train_tfidf, X_test_tfidf, vectorizer = tfidf_vectorization(X_train, X_test, max_features=feature)

        for m in model:
            if m == 'LogisticRegression':
                clf = LogisticRegression(max_iter=1000)
            elif m == 'LinearSVC':
                clf = LinearSVC()

            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)
            acc = accuracy_score(y_test, y_pred)
            results[(m, feature)] = {
                                    "accuracy": acc,
                                    "report": classification_report(y_test, y_pred),
                                    "vectorizer": vectorizer,
                                    "model": clf
                        }
            print(f"Training with Model: {m}, {feature} features...")
            print("Accuracy:", acc)
train(df)

Training with Model: LogisticRegression, 2000 features...
Accuracy: 0.8745
Training with Model: LinearSVC, 2000 features...
Accuracy: 0.8698
Training with Model: LogisticRegression, 5000 features...
Accuracy: 0.8798
Training with Model: LinearSVC, 5000 features...
Accuracy: 0.8737
Training with Model: LogisticRegression, 10000 features...
Accuracy: 0.8855
Training with Model: LinearSVC, 10000 features...
Accuracy: 0.8777
Training with Model: LogisticRegression, 50000 features...
Accuracy: 0.8888
Training with Model: LinearSVC, 50000 features...
Accuracy: 0.8876


In [9]:
best_key = max(results, key=lambda k: results[k]["accuracy"])
best_acc = results[best_key]["accuracy"]
best_report = results[best_key]["report"]
print(f"Best Model: {best_key[0]}, max_features={best_key[1]}")
print("Accuracy:", best_acc)
print(best_report)

Best Model: LogisticRegression, max_features=50000
Accuracy: 0.8888
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
# Example new review
new_text = "This movie was thrilling and full of suspense."


vectorizer = results[best_key]["vectorizer"]
clf = results[best_key]["model"]

# Preprocess: tokenize, remove stopwords/punctuation, lemmatize
tokens = [token.lemma_.lower() for token in nlp_for_lemmatization(new_text)
          if token.text.lower() not in nlp_for_tokenisation.Defaults.stop_words
          and token.text not in punctuation
          and not any(pattern in token.text for pattern in html_patterns)]

clean_text = " ".join(tokens)

# Transform with TF-IDF vectorizer (same as trained)
X_new = vectorizer.transform([clean_text])

# Predict with best classifier
pred = clf.predict(X_new)  # best_clf = trained classifier
print("Predicted sentiment:", "positive" if pred[0] == 1 else "negative")


Predicted sentiment: positive


In [16]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
from collections import Counter
import ast
import time



In [2]:
df = pd.read_csv(r"C:\Users\seifa\Desktop\Python\Beginner_natural_language_processing\iMDB-50k\preprocessed_imdb_50k.csv")

df['token'] = df['token'].apply(ast.literal_eval)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

df.head()

Unnamed: 0,review,sentiment,token,lemma
0,One of the other reviewers has mentioned that ...,1,"[reviewers, mentioned, watching, 1, oz, episod...","['reviewer', 'mention', 'watch', '1', 'oz', 'e..."
1,A wonderful little production. <br /><br />The...,1,"[wonderful, little, production, filming, techn...","['wonderful', 'little', 'production', 'filming..."
2,I thought this was a wonderful way to spend ti...,1,"[thought, wonderful, way, spend, time, hot, su...","['think', 'wonderful', 'way', 'spend', 'time',..."
3,Basically there's a family where a little boy ...,0,"[basically, family, little, boy, jake, thinks,...","['basically', 'family', 'little', 'boy', 'jake..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, love, time, money, visually, ...","['petter', 'mattei', 'love', 'time', 'money', ..."


In [3]:
# 1. Build vocabulary
token_counts = Counter(token for review in df['token'] for token in review)
vocab = {token: idx + 2 for idx, (token, _) in enumerate(token_counts.items())}  # +2 for PAD/UNK
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

In [4]:
# 2. Numericalize
def numericalize(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

numericalized_texts = [numericalize(tokens) for tokens in df['token']]

In [5]:
# 3. Pad / truncate
lengths = [len(tokens) for tokens in df['token']]
max_len = int(np.percentile(lengths, 90))
def pad_sequence(seq, max_len):
    return seq[:max_len] + [vocab['<PAD>']] * max(0, max_len - len(seq))

padded_texts = [pad_sequence(seq, max_len) for seq in numericalized_texts]

In [7]:
# 4. Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

dataset = TextDataset(padded_texts, df['sentiment'])

In [11]:
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [22]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, dropout_p=0.5):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, x):
        x = self.embedding(x)           # [batch, seq_len, embed_dim]
        x = x.mean(dim=1)               # average pooling
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [23]:
model = TextClassifier(vocab_size=len(vocab), embed_dim=128, hidden_dim=64, num_classes=2)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

device

device(type='cuda')

In [25]:
def train():
    total_loss = 0
    total_correct = 0
    total_samples = 0

    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        output = model(x)
        optimizer.zero_grad()
        batch_loss = criterion(output, y)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        total_loss += batch_loss.item() * x.size(0)  # sum weighted by batch size
        pred = output.argmax(dim=1)
        total_correct += pred.eq(y).sum().item()
        total_samples += x.size(0)

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    print(f'Train - Loss: {avg_loss:.4f} Accuracy: {accuracy:.4f} \n')

def valid():
    total_loss = 0
    total_correct = 0
    total_samples = 0

    model.eval()
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            batch_loss = criterion(output, y)

            total_loss += batch_loss.item() * x.size(0)
            pred = output.argmax(dim=1)
            total_correct += pred.eq(y).sum().item()
            total_samples += x.size(0)

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    print(f'Valid - Loss: {avg_loss:.4f} Accuracy: {accuracy:.4f}\n')

In [26]:
epochs = 10

for epoch in range(1, epochs+1):
    start_time=time.time()
    print(f"Epoch nb {epoch} Starting...")
    train()
    valid()
    end_time=time.time()
    print(f"epoch duration is {end_time-start_time:.2f}s")

print(f"training complete!")


Epoch nb 1 Starting...
Train - Loss: 0.6637 Accuracy: 0.5763 

Valid - Loss: 0.5002 Accuracy: 0.7818

epoch duration is 8.45s
Epoch nb 2 Starting...
Train - Loss: 0.3931 Accuracy: 0.8308 

Valid - Loss: 0.3223 Accuracy: 0.8692

epoch duration is 7.48s
Epoch nb 3 Starting...
Train - Loss: 0.2878 Accuracy: 0.8854 

Valid - Loss: 0.2990 Accuracy: 0.8784

epoch duration is 7.55s
Epoch nb 4 Starting...
Train - Loss: 0.2439 Accuracy: 0.9072 

Valid - Loss: 0.2840 Accuracy: 0.8864

epoch duration is 7.63s
Epoch nb 5 Starting...
Train - Loss: 0.2120 Accuracy: 0.9218 

Valid - Loss: 0.3209 Accuracy: 0.8638

epoch duration is 7.57s
Epoch nb 6 Starting...
Train - Loss: 0.1862 Accuracy: 0.9342 

Valid - Loss: 0.3076 Accuracy: 0.8846

epoch duration is 7.58s
Epoch nb 7 Starting...
Train - Loss: 0.1613 Accuracy: 0.9442 

Valid - Loss: 0.3238 Accuracy: 0.8844

epoch duration is 7.85s
Epoch nb 8 Starting...
Train - Loss: 0.1452 Accuracy: 0.9499 

Valid - Loss: 0.3246 Accuracy: 0.8860

epoch duration i

In [21]:
def test():
    total_loss = 0
    total_correct = 0
    total_samples = 0

    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            preds = output.argmax(dim=1)

            batch_loss = criterion(output, y)
            total_loss += batch_loss.item() * x.size(0)
            total_correct += preds.eq(y).sum().item()
            total_samples += x.size(0)
            
            y_true.extend(y.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    print(f'Test - Loss: {avg_loss:.4f} Accuracy: {accuracy:.4f}\n')
    return y_true, y_pred

y_true, y_pred = test()

Test - Loss: 0.8863 Accuracy: 0.8766

