In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [None]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

df = pd.read_csv('/content/laptop_train.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=4)

train_encodings = tokenizer(list(train_df['Sentence']), list(train_df['Aspect Term']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_df['Sentence']), list(test_df['Aspect Term']), truncation=True, padding=True)

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_encodings['token_type_ids']), torch.tensor(train_df['polarity'].values))
test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), torch.tensor(test_encodings['token_type_ids']), torch.tensor(test_df['polarity'].values))

batch_size = 16
epochs = 3
learning_rate = 2e-5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

optimizer = AdamW(model.parameters(), lr=learning_rate)

epoch_acc = []
f1_scores = []

for epoch in range(epochs):
    model.train()
    predictions = []  # clear predictions list at the start of each epoch
    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        loss = outputs[0]
        loss.backward()

        optimizer.step()

    model.eval()
    predictions = []
    labels_list = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            labels = batch[3].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            _, predicted = torch.max(outputs[0], 1)
            predictions.extend(predicted.tolist())
            labels_list.extend(labels.tolist())

    accuracy = (sum([1 if predictions[i]==labels_list[i] else 0 for i in range(len(predictions))]) / len(predictions)) * 100
    print(f"Epoch {epoch+1} - Overall Accuracy: {accuracy:.2f}%")

    report = classification_report(labels_list, predictions, digits=4, output_dict=True)
    f1_score = report['macro avg']['f1-score']
    print(f"Epoch {epoch+1} - Overall F1 Score: {f1_score:.4f}")


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Epoch 1 - Overall Accuracy: 76.69%
Epoch 1 - Overall F1 Score: 0.5047


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2 - Overall Accuracy: 81.78%
Epoch 2 - Overall F1 Score: 0.5901


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3 - Overall Accuracy: 81.99%
Epoch 3 - Overall F1 Score: 0.5884


In [None]:
import numpy as np

# Convert predictions to a Numpy array
predictions_np = np.array(predictions)

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv('/content/laptop_train.csv')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Sentence'], data['polarity'], test_size=0.2, random_state=42)

# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the SVM model
svm = SVC(kernel='linear')
svm.fit(X_train_vec, y_train)

# Predict the sentiment on the testing set
y_pred = svm.predict(X_test_vec)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.60      0.36      0.45        92
           2       0.74      0.81      0.77       190
           3       0.72      0.80      0.76       186

    accuracy                           0.71       472
   macro avg       0.51      0.49      0.49       472
weighted avg       0.70      0.71      0.70       472



In [None]:
y_pred.shape

(472,)

In [None]:
predictions_np.shape

(472,)

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# Combine predictions using LightGBM
lgbm = LGBMClassifier()
lgbm.fit(np.column_stack((y_pred, predictions_np)), y_test)
combined_preds = lgbm.predict(np.column_stack((y_pred, predictions_np)))

# Evaluate the combined model
from sklearn.metrics import classification_report
print(classification_report(y_test, combined_preds,digits=4))

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         4
           1     0.7164    0.5217    0.6038        92
           2     0.7719    0.9263    0.8421       190
           3     0.8870    0.8441    0.8650       186

    accuracy                         0.8072       472
   macro avg     0.5938    0.5730    0.5777       472
weighted avg     0.7999    0.8072    0.7975       472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

df = pd.read_csv('/content/laptop_train.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=4)

train_encodings = tokenizer(list(train_df['Sentence']), list(train_df['Aspect Term']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_df['Sentence']), list(test_df['Aspect Term']), truncation=True, padding=True)

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_df['polarity'].values))
test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), torch.tensor(test_df['polarity'].values))

batch_size = 16
epochs = 5
learning_rate = 5e-5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

optimizer = AdamW(model.parameters(), lr=learning_rate)

epoch_acc = []
f1_scores = []

for epoch in range(epochs):
    model.train()
    predictions_bert = []  # clear predictions list at the start of each epoch
    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        loss.backward()

        optimizer.step()

    model.eval()
    predictions = []
    labels_list = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            _, predicted = torch.max(outputs[0], 1)
            predictions_bert.extend(predicted.tolist())
            labels_list.extend(labels.tolist())

    accuracy = (sum([1 if predictions_bert[i]==labels_list[i] else 0 for i in range(len(predictions_bert))]) / len(predictions_bert)) * 100
    print(f"Epoch {epoch+1} - Overall Accuracy: {accuracy:.2f}%")

    report = classification_report(labels_list, predictions_bert, digits=4, output_dict=True)
    f1_score = report['macro avg']['f1-score']
    print(f"Epoch {epoch+1} - Overall F1 Score: {f1_score:.4f}")


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch 1 - Overall Accuracy: 74.79%
Epoch 1 - Overall F1 Score: 0.4997


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2 - Overall Accuracy: 75.85%
Epoch 2 - Overall F1 Score: 0.5347


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3 - Overall Accuracy: 75.85%
Epoch 3 - Overall F1 Score: 0.5287
Epoch 4 - Overall Accuracy: 74.79%
Epoch 4 - Overall F1 Score: 0.5323


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5 - Overall Accuracy: 77.97%
Epoch 5 - Overall F1 Score: 0.6109


In [None]:
import numpy as np

# Convert predictions to a Numpy array
predictions_bert = np.array(predictions_bert)

In [None]:
predictions_bert.shape

(472,)

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# Combine predictions using LightGBM
lgbm = LGBMClassifier()
lgbm.fit(np.column_stack((predictions_bert, predictions_np)), y_test)
combined_preds = lgbm.predict(np.column_stack((predictions_bert, predictions_np)))

# Evaluate the combined model
from sklearn.metrics import classification_report
print(classification_report(y_test, combined_preds,digits=4))

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         4
           1     0.7143    0.5435    0.6173        92
           2     0.8469    0.8737    0.8601       190
           3     0.8301    0.9194    0.8724       186

    accuracy                         0.8199       472
   macro avg     0.5978    0.5841    0.5875       472
weighted avg     0.8073    0.8199    0.8104       472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
