In [2]:
# Import the necessary libraries
import os
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist

# Download stopwords if not already downloaded
nltk.download("stopwords")

# Define the path to your downloaded movie reviews dataset
dataset_directory = "txt_sentoken"  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:

# Check if the dataset directory is available in the specified path
if not os.path.exists(dataset_directory):
    print("Dataset directory not found. Please make sure you provide the correct path.")
else:
    # Load the movie reviews from the 'pos' and 'neg' folders
    pos_reviews = []  # List to store positive reviews
    neg_reviews = []  # List to store negative reviews

    # Loop through the 'pos' folder
    pos_directory = os.path.join(dataset_directory, "pos")
    for filename in os.listdir(pos_directory):
        with open(os.path.join(pos_directory, filename), "r", encoding="utf-8") as file:
            review = file.read()
            pos_reviews.append((review, 'positive'))

    # Loop through the 'neg' folder
    neg_directory = os.path.join(dataset_directory, "neg")
    for filename in os.listdir(neg_directory):
        with open(os.path.join(neg_directory, filename), "r", encoding="utf-8") as file:
            review = file.read()
            neg_reviews.append((review, 'negative'))

# Print the first few words of a positive and a negative review to check the preprocessing
print("Positive Review Example:", pos_reviews[0][:10])
print("Negative Review Example:", neg_reviews[0][:10])


Positive Review Example: ('films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes br

First,to identify positivity or negativity you will pos-tag the reviews and identify adverbes.

In [72]:
from nltk import pos_tag
import random

all_reviews = pos_reviews + neg_reviews

random.shuffle(all_reviews)

tagged_reviews = [(word_tokenize(review), label) for review, label in all_reviews]
tagged_reviews = [(pos_tag(tokens), label) for tokens, label in tagged_reviews]


def find_adverbs(tagged_review):
    adverbs = [word for word, tag in tagged_review if tag.startswith('RB')]
    return adverbs

# Identify adverbs in all the reviews
adverbs_in_reviews = [(find_adverbs(tagged_review), label) for tagged_review, label in tagged_reviews]

# Print the identified adverbs in a positive and a negative review
print("Adverbs in a Review:")
print(adverbs_in_reviews[0][0])




Adverbs in a Review:
['so', 'very', 'not', 'quite', 'not', 'still', 'only', 'later', 'actually', "n't", 'really', 'not', 'finally', 'once', 'even', 'there', 'only', 'finally', 'much', 'kelly', 'ultimately', 'not', 'never', 'visually', 'before', 'only', 'indeed', 'also', 'easily', 'however', 'usually', 'immediately', 'more', 'noticably', 'well', 'north', 'virgil', 'strangely', 'maybe', 'even', 'more', 'so', 'quickly', 'there', 'clearly', 'early', 'well', 'seriously', 'always', 'completely', 'nearly', 'usually', 'always', 'probably', 'little', 'more', 'up', 'not', 'here', 'then', 'always', 'finally', 'most', 'kelly', 'easily', 'instead', 'also', 'unexpectedly', 'overly', 'simply', 'finally', 'usually', 'too', 'appropriately', 'very', 'ever', 'so', 'perhaps', 'not', 'i', 'personally', 'even', 'simply', 'far', 'satisfyingly', 'still', 'together', 'even', 'usually']


Second you will use a lexical resource to classify the adverbes found.

In [73]:
from nltk.corpus import sentiwordnet as swn

def get_sentiment(adverb):
    
    synsets = list(swn.senti_synsets(adverb, 'r'))  # 'r' pour adverbes
    if not synsets:
        return 0  #if not found by SentiWordNet
    
    return synsets[0].pos_score() - synsets[0].neg_score()

sentiments_in_reviews = [(sum(get_sentiment(adverb) for adverb in adverbs), label) for adverbs, label in adverbs_in_reviews]

In [74]:
def classify_review(sum_score):
    return "pos" if sum_score > 0 else "neg"

predicted_labels = [classify_review(score) for score, _ in sentiments_in_reviews]

# Calculer la précision de la classification
actual_labels = [label for _, label in sentiments_in_reviews]
correctly_classified = sum(1 for predicted, actual in zip(predicted_labels, actual_labels) if predicted == actual)

accuracy = correctly_classified / len(predicted_labels)

print(f"Accuracy of classification: {accuracy * 100:.2f}%")

Accuracy of classification: 0.00%


In [75]:
X = [sum_ for sum_, _ in sentiments_in_reviews]
y = [label for _, label in sentiments_in_reviews]

In [76]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [77]:
from sklearn.linear_model import LogisticRegression

# Reshape les données car nous avons une seule caractéristique
X_train = [[x] for x in X_train]
X_test = [[x] for x in X_test]

# Entraîner le modèle
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [78]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Accuracy: 48.75%

Classification Report:
              precision    recall  f1-score   support

    negative       0.51      0.21      0.30       206
    positive       0.48      0.78      0.60       194

    accuracy                           0.49       400
   macro avg       0.49      0.50      0.45       400
weighted avg       0.49      0.49      0.44       400



In [80]:

all_reviews = [review[0] for review in all_reviews]

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(all_reviews)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [84]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Accuracy: 82.25%

Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.91      0.84       205
    positive       0.89      0.73      0.80       195

    accuracy                           0.82       400
   macro avg       0.83      0.82      0.82       400
weighted avg       0.83      0.82      0.82       400

