## Step-1

**Import the packages**

In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omkar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Step-2
**Read the data**

In [2]:
df = pd.read_excel('Sentiment_analaysis_word_2_vec_finetune\IMDB Dataset_sample.xlsx')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Step-3

**Preprocess the text data**

In [3]:
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    text = text.strip()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = stopwords.words("english")
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens


# Apply preprocessing to the text data
df['tokens'] = df['review'].apply(preprocess_text)
df

Unnamed: 0,review,sentiment,tokens
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[basically, theres, family, little, boy, jake,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, matteis, love, time, money, visually,..."
...,...,...,...
1109,I have not read the book that this was based u...,positive,"[read, book, based, uponinspired, ofthe, other..."
1110,"Brilliant thriller, deserving far more fame, M...",positive,"[brilliant, thriller, deserving, far, fame, mi..."
1111,I saw this movie a fews years ago and was lite...,positive,"[saw, movie, fews, years, ago, literally, swep..."
1112,I just watched this movie on Bravo! and it was...,negative,"[watched, movie, bravo, absolutely, horrible, ..."


$Step-4$

**Train the Word2Vec Model**

In [4]:
# Train Word2Vec model
model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, sg=1)

In [5]:
model

<gensim.models.word2vec.Word2Vec at 0x229c8771650>

**model vocabulary**

In [6]:
# Get the list of words in the vocabulary
vocabulary_list = list(model.wv.index_to_key)

# Print the vocabulary
print(len(vocabulary_list))

21781


**word vector**

In [7]:
df['tokens'][0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 'oz',
 'episode',
 'youll',
 'hooked',
 'right',
 'exactly',
 'happened',
 'mebr',
 'br',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'wordbr',
 'br',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'manyaryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'moreso',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'awaybr',
 'br',
 'would',
 'say',
 'main',
 'appeal',
 'sho

In [8]:
len(model.wv['one'])

100

## Step-5

**Create Sentence Vectors**

In [9]:
def get_sentence_vector(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(100)
    return np.mean(word_vectors, axis=0)

# Create sentence vectors
df['sentence_vector'] = df['tokens'].apply(lambda tokens: get_sentence_vector(tokens, model))
X = np.vstack(df['sentence_vector'])
y = df['sentiment']


In [10]:
X

array([[-0.0960805 ,  0.22906654,  0.07162032, ..., -0.21624544,
         0.03138896,  0.067889  ],
       [-0.0995374 ,  0.24531554,  0.06818158, ..., -0.22688092,
         0.03846664,  0.08647707],
       [-0.10225654,  0.25646853,  0.06438303, ..., -0.237278  ,
         0.02870193,  0.07385214],
       ...,
       [-0.10755026,  0.24893445,  0.07027991, ..., -0.23195483,
         0.02891569,  0.07173766],
       [-0.10833995,  0.27410564,  0.04659814, ..., -0.26424256,
         0.02913951,  0.07571834],
       [-0.10312305,  0.25518674,  0.06371851, ..., -0.23492046,
         0.02639818,  0.06758828]], dtype=float32)

## Step-6

**Train a Classifier**

In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)


## Step-7
**Evaluate the Model**

In [12]:
# Predict sentiment
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 61.43%


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Support Vector Classifier': SVC(kernel='linear', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)
}

# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

# Print the results
for name, accuracy in results.items():
    print(f"{name} Accuracy: {accuracy * 100:.2f}%")





Random Forest Accuracy: 61.43%
Gradient Boosting Accuracy: 68.16%
AdaBoost Accuracy: 69.51%
Support Vector Classifier Accuracy: 57.40%
Logistic Regression Accuracy: 60.99%
Neural Network Accuracy: 66.37%


In [17]:
best_classifier=AdaBoostClassifier(n_estimators=100, random_state=42)
best_classifier
joblib.dump(classifier, 'ada_boost_model.pkl')

['ada_boost_model.pkl']

In [19]:
best_classifier=MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000)
joblib.dump(classifier, 'mlp_model.pkl')

['mlp_model.pkl']

# Step-8
**Save the models**

In [20]:
model

<gensim.models.word2vec.Word2Vec at 0x229c8771650>

In [21]:
model.save('Word_2_Vec.model')

In [15]:
import joblib

# Save the model
joblib.dump(classifier, 'random_forest_model.pkl')


['random_forest_model.pkl']