In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/shubhsudan/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/shubhsudan/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/shubhsudan/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/shubhsudan/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/shubhsudan/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    |

True

In [3]:
# Load CSV data into a DataFrame
data = pd.read_csv('/Users/shubhsudan/Downloads/IMDB Dataset.csv')

In [4]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [17]:


# Preprocessing function
def preprocess_text(text):
    # Remove symbols
    cleaned_text = re.sub(r'[^\w\s]', '', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in cleaned_text.split() if token.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return ' '.join(lemmatized_tokens)

# Apply preprocessing to the 'review' column
data['cleaned_text'] = data['review'].apply(preprocess_text)

In [18]:
# Vectorize cleaned text using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_text'])

In [19]:
# Define X and y
X = tfidf_matrix
y = data['sentiment']


In [7]:
from gensim.models import Word2Vec

# Create Word2Vec model
word2vec_model = Word2Vec(sentences=data['cleaned_text'].apply(lambda x: x.split()), vector_size=100, window=2, min_count=1, sg=0)



In [8]:
# Function to convert text to Word2Vec embeddings
def get_word2vec_embeddings(text):
    embeddings = []
    for word in text.split():
        if word in word2vec_model.wv:
            embeddings.append(word2vec_model.wv[word])
    return embeddings

data['word2vec_embeddings'] = data['cleaned_text'].apply(get_word2vec_embeddings)

In [9]:
google_word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('/Users/shubhsudan/Downloads/archive-3/GoogleNews-vectors-negative300.bin', binary=True)

def get_google_word2vec_embeddings(text):
    embeddings = []
    for word in text.split():
        if word in google_word2vec_model:
            embeddings.append(google_word2vec_model[word])
    return embeddings

data['google_word2vec_embeddings'] = data['cleaned_text'].apply(get_google_word2vec_embeddings)

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardize the Word2Vec embeddings
scaler = StandardScaler()
data['word2vec_scaled'] = data['word2vec_embeddings'].apply(lambda x: scaler.fit_transform(x))

# Standardize the GoogleNews Word2Vec embeddings
data['google_word2vec_scaled'] = data['google_word2vec_embeddings'].apply(lambda x: scaler.fit_transform(x))

# Vectorization methods and their corresponding data
vectorization_methods = ['CountVectorizer', 'TF-IDF', 'Word2Vec', 'GoogleNews Word2Vec']
vectorization_data = [tfidf_matrix, data['word2vec_scaled'], data['google_word2vec_scaled']]



In [11]:
# ML algorithms
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier()
}


In [14]:
X = data['review']  # Features (input data)
y = data['sentiment']  # Target labels

# Convert 'y' to numeric values if needed
# For example, if 'y' contains string labels like 'positive', 'negative', 'neutral'
# You can encode them using label encoding or one-hot encoding
# For label encoding:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [21]:

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X[:1000], y[:1000], test_size=0.2, random_state=42)

# ML algorithms
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Iterate through ML algorithms
for algo_name, algo in algorithms.items():
    print(f"{algo_name}:\n")
    algo_params = {}
    
    if algo_name == 'Logistic Regression':
        algo_params = {'solver': ['liblinear', 'lbfgs']}
    elif algo_name == 'SVC':
        algo_params = {'C': [1, 10, 100], 'kernel': ['linear', 'rbf']}
    elif algo_name == 'Random Forest':
        algo_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]}
    
    algo_grid = GridSearchCV(algo, algo_params)
    algo_grid.fit(X_train, y_train)
    algo_pred = algo_grid.predict(X_test)
    algo_report = classification_report(y_test, algo_pred)
    print(algo_report)
    print("="*50)

Logistic Regression:

              precision    recall  f1-score   support

    negative       0.85      0.78      0.81       104
    positive       0.78      0.85      0.82        96

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200

SVC:

              precision    recall  f1-score   support

    negative       0.88      0.83      0.85       104
    positive       0.82      0.88      0.85        96

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200

Random Forest:

              precision    recall  f1-score   support

    negative       0.89      0.72      0.80       104
    positive       0.75      0.91      0.82        96

    accuracy                           0.81       200
   macro avg       0.82      0.81      0.81       200
weighted avg       0.82      0