In [1]:
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()

In [2]:
import pickle
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
import os
import string

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


In [3]:
data = pd.read_csv("data/train.csv")

In [4]:
data['text'] = data['review']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    data,  # Your feature vectors
    data["label"],  # The true labels
    test_size=0.2,  # Specifies the proportion of data to include in the validation set
    random_state=42  # Ensures reproducibility
)

In [6]:
# A function used to build a vocabulary based on descending word frequencies
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [7]:
def preprocess_df(df):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [8]:
X_train["text"] = X_train["review"]
X_val["text"] = X_val["review"]
df_train = preprocess_df(X_train)
df_test = preprocess_df(X_val)

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [10]:
X_train.shape, y_train_encoded.shape

((10515, 63), (10515,))

In [11]:
df_train.shape, df_test.shape

((10515, 63), (2629, 63))

## TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 
import re

In [14]:
ps = PorterStemmer() 

# return a list of tokens
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # remove non-words
    doc = re.sub(r'[^\w\s]', '', doc)
    # get sentences
    sentences = sent_tokenize(doc)
    # get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens]

In [15]:
#Initialize TfidfVectorizer with your custom tokenizer
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # Assuming preprocessing is already done
                        tokenizer=pre_processing_by_nltk,  # Use your custom tokenizer
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True,
                        min_df = 2,
                        max_df = 0.98,
                        ngram_range=(1, 3)
                        )

# Fit and transform the training data to create the training vectors
train_vec = tfidf.fit_transform(df_train["text"])
test_vec = tfidf.transform(df_test["text"])





In [16]:
train_vec.shape

(10515, 515814)

In [17]:
y_train_encoded.shape

(10515,)

In [18]:
clf1 = LogisticRegression(max_iter=1000000000, 
                           random_state=42, 
                           multi_class= "auto",
                            C = 10 ,
                            warm_start= True)
# Fit the model on the new training set
clf1.fit(train_vec, y_train_encoded)

# Predict on the validation set
val_preds = clf1.predict(test_vec)

In [19]:
macro_f1 = f1_score(y_val_encoded, val_preds, average='macro')
micro_f1 = f1_score(y_val_encoded, val_preds, average='micro')
weighted_f1 = f1_score(y_val_encoded, val_preds, average='weighted')
print(f'Macro-average F1 score: {macro_f1}')
print(f'Micro-average F1 score: {micro_f1}')
print(f'Weighted-average F1 score: {weighted_f1}')

Macro-average F1 score: 0.7181248242859979
Micro-average F1 score: 0.799923925446938
Weighted-average F1 score: 0.7833277133508931


In [41]:
from sklearn.metrics import accuracy_score

In [42]:
accuracy = accuracy_score(y_val_encoded, val_preds)

print(f'Accuracy: {accuracy}')

Accuracy: 0.7649296310384176


## Bert

In [20]:
import random
import torch
from transformers import BertTokenizer, BertModel

In [21]:
from transformers import DistilBertTokenizer, DistilBertModel

In [22]:
# Set a random seed
random_seed = 42
random.seed(random_seed)

In [23]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


In [24]:
encoding = tokenizer.batch_encode_plus(
    X_train['text'],                    # List of input texts
    padding=True,              # Pad to the maximum sequence length
    truncation=True,           # Truncate to the maximum sequence length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True    # Add special tokens CLS and SEP
)

In [25]:
input_ids = encoding['input_ids']  # Token IDs
# print input IDs
print(f"Input ID: {input_ids}")
attention_mask = encoding['attention_mask']
print(f"Attention mask: {attention_mask}")

Input ID: tensor([[  101,  7842, 16475,  ...,     0,     0,     0],
        [  101,  2057,  2253,  ...,     0,     0,     0],
        [  101,  2204, 21122,  ...,     0,     0,     0],
        ...,
        [  101,  2026,  2564,  ...,     0,     0,     0],
        [  101,  2023,  2173,  ...,     0,     0,     0],
        [  101, 12090,  4840,  ...,     0,     0,     0]])
Attention mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [26]:
input_ids.shape

torch.Size([10515, 512])

In [27]:
# Determine the batch size
batch_size = 5 

# Initialize an empty list to hold the embeddings
word_embeddings = []

# Process in batches
for i in range(0, len(input_ids), batch_size):
    # Get the batch
    batch_input_ids = input_ids[i:i+batch_size]
    batch_attention_mask = attention_mask[i:i+batch_size]
    
    # Perform the forward pass and get the embeddings
    with torch.no_grad():
        batch_outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        batch_word_embeddings = batch_outputs.last_hidden_state
        word_embeddings.append(batch_word_embeddings)


word_embeddings = torch.cat(word_embeddings, dim=0)


In [28]:
# Encode the test data
test_encoding = tokenizer.batch_encode_plus(
    X_val['text'],                  # List of input texts from the test set
    padding=True,                    # Pad to the maximum sequence length
    truncation=True,                 # Truncate to the maximum sequence length if necessary
    return_tensors='pt',             # Return PyTorch tensors
    add_special_tokens=True          # Add special tokens CLS and SEP
)

test_input_ids = test_encoding['input_ids']
test_attention_mask = test_encoding['attention_mask']

# Generate embeddings for the test set
test_word_embeddings = []
with torch.no_grad():
    for i in range(0, len(test_input_ids), batch_size):
        batch_input_ids = test_input_ids[i:i+batch_size]
        batch_attention_mask = test_attention_mask[i:i+batch_size]
        batch_outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        batch_word_embeddings = batch_outputs.last_hidden_state
        test_word_embeddings.append(batch_word_embeddings)

# Concatenate all batches into one tensor for the test set
test_word_embeddings = torch.cat(test_word_embeddings, dim=0)


NN

In [36]:
! pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-16.0.6-py2.py3-none-macosx_11_0_a

In [37]:
num_classes = len(set(y_train_encoded))

In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


# Assuming X_train is your input features matrix with shape [n_samples, n_features]
input_dim = 768  # For BERT embeddings, this would be 768. Adjust according to your feature size

model = Sequential([
    # First hidden layer
    Dense(512, activation='relu', input_shape=(input_dim,)),  # Increased to 512 units
    Dropout(0.5),
    # Second hidden layer
    Dense(256, activation='relu'),
    Dropout(0.5),
    # Third hidden layer
    # Dense(256, activation='relu'),
    # Dropout(0.5),
    # Fourth hidden layer
    # Dense(, activation='relu'),
    # Dropout(0.5),
    # Output layer
    Dense(num_classes, activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [85]:
from tensorflow.keras.optimizers import Adamax
# Define your custom learning rate
custom_learning_rate = 0.001
# Initialize the Adamax optimizer with your custom learning rate
optimizer = Adamax(learning_rate=custom_learning_rate)

In [86]:
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [87]:
history = model.fit(train_vec, y_train_encoded,
                    batch_size=10,
                    epochs=20,
                    validation_split=0.2)

Epoch 1/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2391 - loss: 2.1266 - val_accuracy: 0.5563 - val_loss: 1.5057
Epoch 2/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4914 - loss: 1.5378 - val_accuracy: 0.6334 - val_loss: 1.1374
Epoch 3/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5800 - loss: 1.2490 - val_accuracy: 0.6695 - val_loss: 0.9851
Epoch 4/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6288 - loss: 1.1195 - val_accuracy: 0.6795 - val_loss: 0.9460
Epoch 5/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6424 - loss: 1.0536 - val_accuracy: 0.6971 - val_loss: 0.9009
Epoch 6/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6603 - loss: 1.0008 - val_accuracy: 0.6966 - val_loss: 0.8707
Epoch 7/20
[1m842/842[0m 

In [90]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Generate predictions
predictions = model.predict(test_vec)
predictions = np.argmax(predictions, axis=1)  # Convert probabilities to class labels

# Calculate F1 score
macro_f1 = f1_score(y_val_encoded, predictions, average='macro')
micro_f1 = f1_score(y_val_encoded, predictions, average='micro')
weighted_f1 = f1_score(y_val_encoded, predictions, average='weighted')

print(f'Macro-average F1 score: {macro_f1}')
print(f'Micro-average F1 score: {micro_f1}')
print(f'Weighted-average F1 score: {weighted_f1}')

# Calculate accuracy
accuracy = accuracy_score(y_val_encoded, predictions)
print(f'Accuracy: {accuracy}')


[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 907us/step
Macro-average F1 score: 0.615314399931103
Micro-average F1 score: 0.7402054012932675
Weighted-average F1 score: 0.7135849007048611
Accuracy: 0.7402054012932674


LOGISTIC

In [29]:
# Average the embeddings across the sequence length for training set
train_vec = torch.mean(word_embeddings, dim=1).cpu().numpy()

# Average the embeddings across the sequence length for test set
test_vec = torch.mean(test_word_embeddings, dim=1).cpu().numpy()


In [30]:
train_vec.shape

(10515, 768)

In [31]:
test_vec.shape

(2629, 768)

In [32]:
clf2 = LogisticRegression(max_iter=1000000000, 
                           random_state=42, 
                           multi_class= "auto",
                            C = 10 ,
                            warm_start= True)
# Fit the model on the new training set
clf2.fit(train_vec, y_train_encoded)

# Predict on the validation set
val_preds = clf2.predict(test_vec)

In [33]:
macro_f1 = f1_score(y_val_encoded, val_preds, average='macro')
micro_f1 = f1_score(y_val_encoded, val_preds, average='micro')
weighted_f1 = f1_score(y_val_encoded, val_preds, average='weighted')
print(f'Macro-average F1 score: {macro_f1}')
print(f'Micro-average F1 score: {micro_f1}')
print(f'Weighted-average F1 score: {weighted_f1}')

Macro-average F1 score: 0.6910020482664272
Micro-average F1 score: 0.7649296310384177
Weighted-average F1 score: 0.7563900028673805
