In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Download the NLTK resources used below (no-op when they are already up to date).
#   punkt / punkt_tab : tokenizer models loaded by word_tokenize; recent NLTK
#                       releases look for 'punkt_tab', older ones for 'punkt'
#   wordnet           : dictionary backing WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CPN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CPN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Read the dataset from the Excel workbook (path is relative to the working directory)
dataset_path = './project_dataset.xlsx'
data = pd.read_excel(dataset_path)

In [5]:
# Encode the sentiment labels as integers: 1 = positive, 0 = negative.
sentiment_codes = {'positive': 1, 'negative': 0}

# Only map while the column still holds the raw strings. Mapping labels that are
# already 0/1 (e.g. when this cell is re-run) would silently turn every value
# into NaN, because 0 and 1 are not keys of the dictionary.
already_encoded = data['Sentiments'].isin(list(sentiment_codes.values())).all()
if not already_encoded:
    data['Sentiments'] = data['Sentiments'].map(sentiment_codes)


In [6]:
# Preview the first five rows after label encoding
data.head(n=5)

Unnamed: 0,Text,Sentiments
0,nice hotel expensive parking got good deal sta...,1
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not positive* experience hotel mona...,0
3,"unique, great stay, wonderful time hotel monac...",1
4,"great stay great stay, went seahawk game aweso...",1


In [7]:
# Summary statistics of the numeric columns, printed as plain text
summary_stats = data.describe()
print(summary_stats)

         Sentiments
count  20491.000000
mean       0.736567
std        0.440506
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000


In [8]:
# Preprocess the data
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize a document with NLTK and lemmatize each token.

    Parameters
    ----------
    text : str
        Raw document. A missing scalar value (``None``/NaN) is treated as an
        empty document; any other non-string scalar is converted with ``str``.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings, but spreadsheet cells can come
        # back as NaN (empty cell) or as numbers.
        text = '' if pd.isna(text) else str(text)

    # No POS tag is passed to lemmatize(), so the lemmatizer uses its default
    # part of speech for every token.
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


In [9]:
# Run every document through preprocess_text and overwrite the 'Text' column
# with the lemmatized version
lemmatized_text = data['Text'].apply(preprocess_text)
data['Text'] = lemmatized_text

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_data, test_data, train_labels, test_labels = train_test_split(
    data['Text'],
    data['Sentiments'],
    test_size=0.2,
    random_state=42,
)

In [11]:
# Build the TF-IDF vectorizer; its vocabulary and IDF weights are learned from
# the training documents in the same call that produces their vectors
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(raw_documents=train_data)

In [12]:
# Vectorize the test documents with the already-fitted vectorizer (no refitting)
test_vectors = vectorizer.transform(raw_documents=test_data)

In [13]:
# Report the (rows, columns) shape of both TF-IDF matrices
train_shape, test_shape = train_vectors.shape, test_vectors.shape
print("Train vector shape:", train_shape)
print("Test vector shape:", test_shape)

Train vector shape: (16392, 43660)
Test vector shape: (4099, 43660)


In [14]:
# Save the fitted vectorizer so the same vocabulary and IDF weights can be
# reloaded later to transform new text.
# (pickle is already imported in the notebook's first cell, so it is not
# re-imported here.)
vectorizer_path = './vectorizer.pkl'
with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)

In [15]:
# Create a Random Forest classifier.
# The seed is fixed (same value as the train/test split) because the forest is
# built with randomness; without it the trained model and the reported accuracy
# change on every run.
classifier = RandomForestClassifier(random_state=42)

In [16]:
# Fit the forest on the TF-IDF training matrix and its sentiment labels
classifier.fit(X=train_vectors, y=train_labels)

In [17]:
# Predict a sentiment label for every held-out test document
predictions = classifier.predict(X=test_vectors)

In [18]:
# Score the held-out predictions against the true test labels
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8428885093925348


['sentiment_model.pkl']

In [19]:
# Persist the trained classifier to disk with pickle
model_path = 'sentiment_model.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(classifier, model_file)