In [6]:
# Mount Google Drive at /content/drive so the Colab runtime can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd /content/drive/MyDrive/pytorch

/content/drive/MyDrive/pytorch


In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the Google Scholar sample; the relative path resolves against the
# working directory selected by the %cd cell above.
data = pd.read_csv("./sample_google_scholar.csv")

In [10]:
# Discard every row that has a missing value in any column, then preview the result.
# NOTE: `data` is rebound here, so the unfiltered frame is no longer available afterwards.
data = data.dropna()
data.head()

Unnamed: 0,author_name,email,affiliation,coauthors_names,research_interest
0,Lawrence Holder,wsu.edu,Washington State University,Diane J Cook##William Eberle,artificial_intelligence##machine_learning##dat...
3,Diane J Cook,eecs.wsu.edu,Washington State University,Lawrence Holder##Parisa Rashidi##Sajal K. Das#...,artificial_intelligence##machine_learning##sma...
4,Sumi Helal IEEE Fellow AAAS Fellow IET Fellow ...,cise.ufl.edu,University of Florida,Raja Bose##Darrell Woelk##Diane J Cook##Yousse...,digital_health##smart_homes##internet_of_thing...
5,Hani Hagras,essex.ac.uk,University of Essex,Christian Wagner,explainable_artificial_intelligence##ambient_i...
6,Anupam Joshi,umbc.edu,UMBC,Tim Finin##Yelena Yesha##Lalana Kagal##Dipanja...,data_management##mobile_computing##security##s...


In [11]:
# For features, convert the first 10 characters of the affiliation into a vector of floats
# by dividing each character's code point by 256 (short strings are padded with 0.0).

def convert_first_ten_characters_into_tensor(data, length=10, scale=256):
  """Encode the leading characters of a string as a fixed-length float vector.

  Each of the first ``length`` characters is mapped to ``ord(char) / scale``.
  Strings shorter than ``length`` are right-padded with 0.0 so that every
  result has the same size and can be stacked into a matrix.

  Args:
    data: String to encode (e.g. an affiliation name).
    length: Number of leading characters to encode; also the output size.
    scale: Divisor applied to every code point. With the default of 256,
      code points below 256 map into [0, 1); larger code points exceed 1.

  Returns:
    A 1-D ``np.ndarray`` of dtype float64 with exactly ``length`` entries.
    Despite the function name, this is a NumPy array, not a torch tensor.

  Raises:
    ValueError: If ``length`` is negative or ``scale`` is zero.
  """
  if length < 0:
    raise ValueError(f"length must be non-negative, got {length}")
  if scale == 0:
    raise ValueError("scale must be non-zero")

  encoded = [ord(char) / scale for char in data[:length]]
  # Pad on the right so short (or empty) strings still yield `length` values.
  encoded.extend([0.0] * (length - len(encoded)))
  return np.array(encoded, dtype=np.float64)

# Encode each affiliation string into its fixed-length vector, then stack the
# per-row vectors into a single 2-D feature matrix (one row per record).
converted_affiliation = data['affiliation'].map(
    convert_first_ten_characters_into_tensor
)
affiliation = np.vstack(converted_affiliation.to_numpy())
print(affiliation[:5])

[[0.33984375 0.37890625 0.44921875 0.40625    0.41015625 0.4296875
  0.40234375 0.453125   0.43359375 0.4296875 ]
 [0.33984375 0.37890625 0.44921875 0.40625    0.41015625 0.4296875
  0.40234375 0.453125   0.43359375 0.4296875 ]
 [0.33203125 0.4296875  0.41015625 0.4609375  0.39453125 0.4453125
  0.44921875 0.41015625 0.453125   0.47265625]
 [0.33203125 0.4296875  0.41015625 0.4609375  0.39453125 0.4453125
  0.44921875 0.41015625 0.453125   0.47265625]
 [0.33203125 0.30078125 0.2578125  0.26171875 0.         0.
  0.         0.         0.         0.        ]]


In [12]:
# For labels, use a boolean value: True if the email domain contains '.edu', False otherwise.

# regex=False makes '.edu' a literal substring; with the default regex=True the
# leading '.' would match ANY character (e.g. "xedu" would count as a match).
converted_email = data['email'].str.contains('.edu', regex=False)
labels = converted_email.values
labels[:5]

array([ True,  True,  True, False,  True])

In [13]:
pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.3-py3-none-any.whl (777 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.3.0.post0-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.10.1 pytorch_lightning-2.1.3 torchmetrics-1.3.0.post0


In [14]:
from pytorch_lightning import LightningDataModule
from typing import Optional
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [15]:
class SampleDataset(Dataset):
  """Map-style dataset pairing encoded affiliation vectors with binary labels.

  Args:
    features: Array-like of shape (n_samples, n_features). When omitted, the
      notebook-level ``affiliation`` array is used.
    targets: Array-like of n_samples boolean/0-1 labels. When omitted, the
      notebook-level ``labels`` array is used.

  Raises:
    ValueError: If ``features`` and ``targets`` differ in length.
  """

  def __init__(self, features=None, targets=None):
    # Fall back to the notebook globals so existing `SampleDataset()` calls keep working.
    if features is None:
      features = affiliation
    if targets is None:
      targets = labels

    # Explicit dtype instead of the legacy `torch.Tensor(...)` constructor.
    self.affiliation = torch.as_tensor(features, dtype=torch.float32)
    self.labels = torch.as_tensor(targets, dtype=torch.float32)

    if len(self.affiliation) != len(self.labels):
      raise ValueError(
          f"features and targets must have the same length, got "
          f"{len(self.affiliation)} and {len(self.labels)}")

  def __len__(self):
    """ return number of samples """
    return len(self.labels)

  def __getitem__(self, idx):
    """ load and return the (feature vector, integer label) pair at the index """
    return self.affiliation[idx], int(self.labels[idx])

In [83]:
class SampleDataModule(LightningDataModule):
  """Data module that serves disjoint train / validation / test loaders.

  The full ``SampleDataset()`` is split once, with a seeded generator, so the
  three loaders no longer evaluate on the very samples used for training.

  Args:
    batch_size: Number of samples per batch for every loader.
    val_fraction: Fraction of the dataset reserved for validation.
    test_fraction: Fraction of the dataset reserved for testing.
    seed: Seed for the generator that drives the random split.

  Raises:
    ValueError: If a fraction is outside [0, 1) or the two fractions together
      leave no data for training.
  """

  def __init__(self, batch_size: int = 10, val_fraction: float = 0.1,
               test_fraction: float = 0.1, seed: int = 42):
    super().__init__()

    if not (0.0 <= val_fraction < 1.0 and 0.0 <= test_fraction < 1.0):
      raise ValueError("val_fraction and test_fraction must be in [0, 1)")
    if val_fraction + test_fraction >= 1.0:
      raise ValueError("val_fraction + test_fraction must be < 1")

    self.batch_size = batch_size
    self.val_fraction = val_fraction
    self.test_fraction = test_fraction
    self.seed = seed
    # Kept for backward compatibility only: none of the loaders below pass it to DataLoader.
    self.collate_fn = lambda x: x

    # Filled in by setup(); None means the split has not been created yet.
    self._train_set = None
    self._val_set = None
    self._test_set = None

  def prepare_data(self):
    """ Download and preprocess the data; nothing to do for this sample """
    pass

  def setup(self, stage: Optional[str] = None):
    """ Build the train/val/test split once; later calls (any stage) reuse it """
    if self._train_set is not None:
      return

    dataset = SampleDataset()
    n_total = len(dataset)
    n_val = int(n_total * self.val_fraction)
    n_test = int(n_total * self.test_fraction)
    # Remainder goes to training so the three lengths always sum to n_total.
    n_train = n_total - n_val - n_test

    # NOTE: for very small datasets the val/test subsets can end up empty.
    generator = torch.Generator().manual_seed(self.seed)
    self._train_set, self._val_set, self._test_set = torch.utils.data.random_split(
        dataset, [n_train, n_val, n_test], generator=generator)

  def train_dataloader(self):
    """ define train data loader (shuffled every epoch) """
    self.setup()  # no-op when the split already exists
    return DataLoader(
        self._train_set,
        batch_size=self.batch_size,
        shuffle=True)

  def val_dataloader(self):
    """ define validation data loader (fixed order) """
    self.setup()
    return DataLoader(
        self._val_set,
        batch_size=self.batch_size,
        shuffle=False
    )

  def test_dataloader(self):
    """ define test data loader (fixed order) """
    self.setup()
    return DataLoader(
        self._test_set,
        batch_size=self.batch_size,
        shuffle=False
    )

## Model definition & model training

In [42]:
#pip install pytorch-forecasting
#!pip install --upgrade torchmetrics==0.9.1
!pip install pytorch-lightning==1.8.6

Collecting pytorch-lightning==1.8.6
  Downloading pytorch_lightning-1.8.6-py3-none-any.whl (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.3/800.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=2.2 (from pytorch-lightning==1.8.6)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX, pytorch-lightning
  Attempting uninstall: pytorch-lightning
    Found existing installation: pytorch-lightning 2.1.3
    Uninstalling pytorch-lightning-2.1.3:
      Successfully uninstalled pytorch-lightning-2.1.3
Successfully installed pytorch-lightning-1.8.6 tensorboardX-2.6.2.2


In [63]:
from pytorch_lightning.core.module import LightningModule
from torch.optim.adam import Adam
#from torchmetrics.functional import accuracy

from torchmetrics import Accuracy

In [79]:
class SampleModel(LightningModule):
  """Binary classifier over 10-dimensional feature vectors.

  The network applies three linear layers (10 -> 5 -> 3 -> 1) followed by a
  sigmoid, so `forward` returns one probability per sample. Training uses
  BCELoss against float targets of shape (batch, 1).

  NOTE(review): there is no non-linearity between the Linear layers, so the
  stack can only express a single affine map before the sigmoid. The layout
  is left unchanged so parameter names and shapes stay the same.

  Args:
    lr: Learning rate handed to Adam in `configure_optimizers`.
  """

  def __init__(self, lr: float = 0.1):
    super().__init__()
    self.lr = lr

    # define Network
    self.layers = nn.Sequential(
        nn.Linear(10, 5),
        nn.Linear(5, 3),
        nn.Linear(3, 1),
        nn.Sigmoid()
    )
    # define loss function (BCELoss expects probabilities, hence the Sigmoid above)
    self.criterion = torch.nn.BCELoss()

  def forward(self, x):
    """ Feed input tensor to the network and return the predicted probabilities """
    return self.layers(x)

  def configure_optimizers(self):
    """ Define optimizer to use """
    # Optimize this module's own parameters; the previous code reached for a
    # global variable named `model`, which only worked by notebook coincidence.
    return torch.optim.Adam(self.parameters(), lr=self.lr)

  def _compute_loss(self, batch):
    """ Run the forward pass on a (features, labels) batch and compute BCE loss """
    x, y = batch
    # BCELoss needs float targets shaped like the (batch, 1) outputs.
    targets = y.unsqueeze(dim=1).to(torch.float32)
    outputs = self(x)
    loss = self.criterion(outputs, targets)
    return outputs, targets, loss

  def training_step(self, batch, batch_idx):
    """ Define single training iteration """
    _, _, loss = self._compute_loss(batch)
    self.log("train_loss", loss)
    return loss

  def validation_step(self, batch, batch_idx):
    """ Define single validation iteration """
    loss, acc = self._shared_eval_step(batch, batch_idx)
    metrics = {"val_acc": acc, "val_loss": loss}
    self.log_dict(metrics)
    return metrics

  def test_step(self, batch, batch_idx):
    """ Define single test iteration """
    loss, acc = self._shared_eval_step(batch, batch_idx)
    metrics = {"test_acc": acc, "test_loss": loss}
    self.log_dict(metrics)
    return metrics

  def _shared_eval_step(self, batch, batch_idx):
    """ Compute loss and binary accuracy shared by validation and test """
    outputs, targets, loss = self._compute_loss(batch)
    # Binary accuracy: rounded probability vs. 0/1 target. Computed directly on
    # the batch tensors, so it stays on their device and needs no per-step
    # metric object (the old code built a 5-class multiclass metric each call).
    acc = (outputs.round() == targets).float().mean()
    return loss, acc

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    """ Compute prediction for the given batch of data """
    # Accept both (features, labels) batches and bare feature tensors.
    x = batch[0] if isinstance(batch, (list, tuple)) else batch
    return self(x)

In [58]:
from pytorch_lightning import Trainer

In [61]:
import lightning as L

In [84]:
# setup model training
num_epochs = 10  # upper bound on training epochs, passed to Trainer as max_epochs
data_module = SampleDataModule()
model = SampleModel()
trainer = Trainer(max_epochs=num_epochs)

INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [85]:
# Train the model; loaders are supplied by the data module.
trainer.fit(model, datamodule=data_module)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type       | Params
-----------------------------------------
0 | layers    | Sequential | 77    
1 | criterion | BCELoss    | 0     
-----------------------------------------
77        Trainable params
0         Non-trainable params
77        Total params
0.000     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [86]:
# Evaluate the trained model with the data module's test loader.
result = trainer.test(model, datamodule=data_module)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

In [87]:
# prediction using trained model
model.eval()
# Inference only: disable autograd so no computation graph is built for the outputs.
with torch.no_grad():
  sample_inputs = torch.as_tensor(affiliation[:5], dtype=torch.float32)
  preds = model(sample_inputs).round()
for label, pred in zip(labels[:5], preds):
  print(f"label: {label}, prediction: {bool(pred[0])}")

label: True, prediction: True
label: True, prediction: True
label: True, prediction: True
label: False, prediction: True
label: True, prediction: True
