In [None]:
# Colab-only step: expose Google Drive under /content/drive, where a later
# cell reads the training CSV from.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm

In [None]:
# Pick the accelerator when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
#1 Use torch.randn to create two tensors of size (29, 30, 32) and (32, 100) .
# `a` is drawn before `b`, both from the standard normal distribution.
a, b = torch.randn(29, 30, 32), torch.randn(32, 100)
a.shape, b.shape

In [None]:
# 2 Use torch.matmul to matrix multiply the two tensors.
# torch.matmul treats `a` as a batch of 29 matrices and broadcasts the 2-D `b`
# across that batch: (29, 30, 32) @ (32, 100) -> (29, 30, 100).
print("torch.matmul")
c = torch.matmul(a, b)
print(c.size())
# 4 Use torch.sum on the resulting tensor, passing the optional argument of dim=1 to sum across the 1st dimension. Before you run this, can you predict the size?
d = torch.sum(c, dim=1)  # dim=1 collapses the axis of length 30 -> (29, 100)
print(d.size())

# The demos below use their own result names so that `c` and `d` keep holding
# the step-2 / step-4 tensors that the following cells index into.

# torch.mm: strictly 2-D x 2-D, no broadcasting.
print("torch.mm")
mat_1 = torch.tensor([[1, 2, 3],
                      [4, 3, 8],
                      [1, 7, 2]])
mat_2 = torch.tensor([[2, 4, 1],
                      [1, 3, 6],
                      [2, 6, 5]])
mm_out = torch.mm(mat_1, mat_2)
print(mm_out.size())
print(torch.sum(mm_out, dim=1).size())  # 2-D result: dim=1 sums each row -> (3,)

# torch.bmm: both operands must be 3-D with the same batch size.
print("torch.bmm")
mat_1 = torch.randn(2, 3, 3)
mat_2 = torch.randn(2, 3, 4)
bmm_out = torch.bmm(mat_1, mat_2)
print(bmm_out.size())
print(torch.sum(bmm_out, dim=1).size())  # (2, 3, 4) -> (2, 4)

# torch.einsum: the repeated index 'ii' sums the diagonal, i.e. the trace.
print("torch.einsum")
trace = torch.einsum('ii', torch.randn(3, 3))
print(trace)

# Size of the step-4 result, shown as the cell output.
d.size()



**3 What is the difference between torch.matmul , torch.mm , torch.bmm , and torch.einsum , and the @ operator?**

**torch.mm:**
torch.mm computes matrix multiplication by taking an m×n Tensor and an n×p Tensor. It can deal with only two-dimensional matrices and not with single-dimensional ones. This function does not support broadcasting. Broadcasting is nothing but the way the Tensors are treated when their shapes are different. The smaller Tensor is broadcasted to suit the shape of the wider or larger Tensor for operations.

**torch.bmm:**
torch.bmm performs batched matrix multiplication. Both inputs must be 3-D tensors, of shapes (b×n×m) and (b×m×p), and the first (batch) dimension b must be the same for both; the result has shape (b×n×p). It does not support broadcasting. Some older PyTorch releases also documented a boolean `deterministic` argument (for sparse-dense CUDA bmm): `False` selects a faster, non-deterministic calculation, while `True` selects a slower but deterministic one. The current signature is `torch.bmm(input, mat2, *, out=None)`.

**torch.matmul:**
torch.matmul multiplies two tensors of any dimensionality: two 1-D tensors (vectors) give a dot product, two 2-D tensors give an ordinary matrix product, and mixed 1-D/2-D inputs give a matrix-vector product. It also supports broadcasting and batched operation when either input has more than two dimensions. The operation performed is chosen from the dimensions of the inputs. The general syntax is `torch.matmul(input, other, *, out=None)`.

**torch.einsum:**
Sums the product of the elements of the input operands along dimensions specified using a notation based on the Einstein summation convention.

**@operator:**
The `@` operator is Python's matrix-multiplication operator; on tensors it dispatches to `torch.matmul`, so the two behave identically. Applied to two 1-D tensors it returns their dot product (not an element-wise product), and applied to 2-D tensors it performs normal matrix multiplication. When the leading (batch) dimensions of the operands already match, the multiplication is carried out without any broadcasting/prepending. When the operands have different numbers of dimensions or different batch shapes, the appropriate broadcasting is carried out first (a 1-D operand gets a temporary extra dimension) and then the multiplication is performed. The operator applies to N-dimensional tensors as well.


In [None]:
#5 Create a new long tensor of size (3, 10).
x = torch.ones((3, 10), dtype=torch.long)  # every entry is the index 1

#6 Use this new long tensor to index into the tensor from step 2.
# Indexing the first axis of `c` with a (3, 10) index tensor gathers one
# slice of `c` per index entry.
y = c[x]
y.shape
# All indices are 1, so any two gathered slices compare equal.
torch.equal(y[0, 0], y[1, 1])

In [None]:
#7 Use torch.mean to average across the last dimension in the tensor from step 6.
# Gather once and reuse the result; axis 3 is the last axis of the 4-D gathered tensor.
averaged = c[x].mean(dim=3)
averaged, averaged.shape

In [None]:
# Write a pure PyTorch program to compute the value of √2 up to 4 decimal places without using the square root or other math functions from any of the libraries.
def squareroot(x):
    """Approximate the square root of ``x`` with Newton's method.

    The iteration ``r <- (r + x / r) / 2`` runs on float64 torch tensors and
    uses only arithmetic, so no square-root or other math-library function is
    involved. float64 is used because a float16 tensor carries only about
    three decimal digits, which is not enough for a 4-decimal answer.

    Args:
        x: Non-negative real number (int or float).

    Returns:
        The square root of ``x`` as a Python float rounded to 4 decimals.

    Raises:
        ValueError: If ``x`` is negative.
    """
    if x < 0:
        raise ValueError("squareroot() is only defined for non-negative numbers")
    if x == 0:
        # Newton's update divides by the estimate, so zero is answered directly.
        return 0.0

    target = torch.tensor(float(x), dtype=torch.float64)
    estimate = target.clone()
    # Scale the tolerance with the input so large values can still satisfy it
    # within float64 precision.
    tolerance = 1e-10 * max(1.0, float(x))
    # The estimate roughly halves per step until it is close, so ~1100 steps
    # cover the whole float64 range; the cap guarantees termination.
    for _ in range(1100):
        if abs(target - estimate * estimate) <= tolerance:
            break
        estimate = (estimate + target / estimate) / 2
    return round(estimate.item(), 4)


# The exercise asks for the square root of 2.
squareroot(2)

#Fail-fast exercises

In [None]:
#Importing all the libs
import os
import random
import urllib
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import torch.nn as nn
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
#downloading and extracting Glove embeddings
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_dir = os.path.join(os.path.curdir, "glove")
# Single source of truth for the vectors file: the download check and the
# reader below must look at the same location.
glove_path = os.path.join(glove_dir, "glove.6B.300d.txt")
if not os.path.exists(glove_path):
    # With no target filename, urlretrieve stores the archive in a temporary file.
    zip_path, _ = urllib.request.urlretrieve(glove_url)
    try:
        with zipfile.ZipFile(zip_path, "r") as f:
            f.extractall(glove_dir)
    finally:
        # Remove the temporary archive left behind by urlretrieve.
        urllib.request.urlcleanup()

#Creating the Glove embedding dictionary
# Each line holds a token followed by its vector components, separated by whitespace.
embeddings_dict = {}
with open(glove_path, 'r', encoding="utf-8") as f:
    for line in f:
        word, *coefficients = line.split()
        embeddings_dict[word] = np.asarray(coefficients, "float32")

In [None]:
# Hyperparameters for the fail-fast exercises
max_len = 50           # padded sequence length handed to data_embd below
embedding_size = 16
num_features = 5000
batch_size = 32

In [None]:
# IMDB data frame: read the training CSV and keep only its first 512 rows
df = pd.read_csv("/content/drive/MyDrive/NLP243/243_HW1/data/hw1_train-1.csv").head(512)

In [None]:
class data_embd(Dataset):
  """Dataset that turns each text row into a fixed-length list of word vectors.

  Every item is a list of exactly ``max_len`` 1-D tensors: one vector per
  whitespace token (tokens missing from the lookup table fall back to the
  vector of ``'the'``), followed by zero vectors as padding. Texts with more
  than ``max_len`` tokens are truncated so that all items have the same
  length and can be batched together.

  Args:
    data: Frame holding the raw texts in the ``'textstr '`` column.
      NOTE(review): the column name ends with a space -- confirm it matches the CSV header.
    embeddings_dict: Mapping from token to vector; must contain ``'the'``.
    max_len: Number of vectors returned per item.
  """

  def __init__(self, data: pd.DataFrame, embeddings_dict, max_len):
    self.data = data
    self.embds = embeddings_dict
    self.default = self.embds['the']
    # Take the dimensionality from the table itself instead of assuming 300.
    self.glove_dim = len(self.default)
    self.max_len = max_len

  def tokenize(self, text: str):
    """Split ``text`` on whitespace."""
    return text.split()

  def encode_tokens(self, tokens):
    """Map tokens to vectors, truncating or zero-padding to ``max_len`` entries."""
    tokens = tokens[:self.max_len]
    # Look tokens up in the table given to this instance, not in a notebook global.
    encoded = [torch.tensor(self.embds.get(token, self.default)) for token in tokens]
    encoded += [torch.zeros(self.glove_dim) for _ in range(self.max_len - len(tokens))]
    return encoded

  def __getitem__(self, n: int):
    """Return the encoded text of row ``n``."""
    textstr = self.data['textstr '].iloc[n]
    return self.encode_tokens(self.tokenize(textstr))

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.data)


# Wrap the frame in the embedding dataset and serve it in unshuffled batches of 512.
df_a = data_embd(data=df, embeddings_dict=embeddings_dict, max_len=max_len)
train_loader = DataLoader(dataset=df_a, batch_size=512, shuffle=False)

In [None]:
#question1, question3
class DeepAveragingNetwork(nn.Module):
  """Parameter-free module that averages a sequence of equally shaped tensors."""

  def __init__(self, max_len):
    super().__init__()
    # Both attributes are stored for reference; forward() does not read them.
    self.glove_dim = 300
    self.max_len = max_len

  def forward(self,embeddings):
    """Stack the entries of ``embeddings`` on a new leading axis and average over it."""
    stacked = torch.stack(tuple(embeddings[position] for position in range(len(embeddings))))
    return stacked.mean(dim=0)
    
model = DeepAveragingNetwork(max_len).to(device)
model.eval()
pbar = tqdm(train_loader)
batch_outputs = []
with torch.no_grad():
  for embds in pbar:
    # The model has no parameters, so .to(device) on it moves nothing; the
    # inputs themselves must be placed on the device for the averaging to run there.
    on_device = tuple(step.to(device) for step in embds)
    # Bring each result back to the CPU so later NumPy/scikit-learn code can consume it.
    batch_outputs.append(model(on_device).cpu())
# Keep the outputs of every batch, not only the last one.
op = torch.cat(batch_outputs, dim=0)
print(f"\nfinal shape when {op.shape[0]} sentences are inferenced on {device}", op.shape)

In [None]:
#question2
# Dimensionality reduction with scikit-learn's PCA (principal component analysis)
from sklearn.decomposition import PCA

# scikit-learn works on NumPy arrays. Convert explicitly so this cell also works
# when `op` lives on the GPU or tracks gradients, where implicit conversion fails.
features = op.detach().cpu().numpy()
# Initializing the PCA object with 50 components
pca = PCA(n_components=50)
# Fitting the PCA model to the output of the deep averaging network
pca.fit(features)
# Applying the transform method to project the data onto the 50 components
reduced_data = pca.transform(features)

print(f'original dimensions of the data: {op.shape} | After PCA: {reduced_data.shape}')

In [None]:
#question4
class MutliEmbedding(nn.Module):
  """Joins two embedding tensors along axis 1 and adds a leading axis."""

  def __init__(self,num_emb:int, size_emb1:int, size_emb2:int):
    super().__init__()
    # The sizes are only recorded; forward() does not use them.
    self.num_emb = num_emb
    self.indices1 = size_emb1
    self.indices2 = size_emb2

  def forward(self,ind1,ind2):
    """Concatenate ``ind1`` and ``ind2`` on dim 1, then prepend a size-1 axis."""
    joined = torch.cat([ind1, ind2], dim=1)
    return torch.unsqueeze(joined, 0)
    

num_emb = 1
size_emb1 = size_emb2 = 300
multiemb = MutliEmbedding(num_emb, size_emb1, size_emb2)
# Look up the vectors of the two words and give each a leading axis of size 1
# so they can be concatenated along axis 1.
indices1 = torch.unsqueeze(torch.tensor(embeddings_dict['green']), 0)
indices2 = torch.unsqueeze(torch.tensor(embeddings_dict['apple']), 0)
print(indices1.shape)
print(indices2.shape)
final = multiemb(indices1, indices2)
final.shape

In [None]:
#question5
from collections import Counter
class DummySentenceLabelDataset(Dataset):
  """Synthetic dataset of random sentences over a fixed vocabulary with random 0/1 labels.

  Each item is ``(encoded_sentence, label)``: a list of ``max_len`` one-element
  float tensors (token ids followed by zero padding) and a scalar float label.
  Items are drawn at random on every access; the requested index is ignored.
  """

  def __init__(self, num_sentences, max_len):
    self.num_sentences = num_sentences
    self.max_len = max_len
    self.vocab = [
        'the', 'in', 'who', 'cast', 'is', 'of', 'actor', 'movie', 'life', 'beautiful',
        'for', 'played', 'and', 'crew', 'was', 'show', 'campaign', 'on', 'female', 'i', 'me', 'july',
        'plays', 'star', 'find', 'man', 'starred', 'were', 'stars', 'what', 'actors', 'can', 'captain', 'america',
        'credits', 'from', 'lead', 'charlie', 'are', 'see', 'list', 'rocky', 'members', 'apollo',
        'thirteen', 'luke', 'wars', 'new', 'hope', 'godfather',
    ]

    # Rank the vocabulary by frequency and number the tokens in that order.
    self.count = Counter(self.vocab)
    ranked = self.count.most_common(len(self.vocab))
    self.tokens, self.counts = zip(*ranked)
    self.embeddings_dict = dict(zip(self.tokens, range(len(self.tokens))))
    # Id used for tokens that are not in the vocabulary.
    self.default = self.embeddings_dict['the']

  def tokenize(self, text: str):
    """Split ``text`` on whitespace."""
    return text.split()

  def print_this(self):
    """Print the token -> id table."""
    print(self.embeddings_dict)

  def encode_tokens(self, tokens):
    """Turn tokens into one-element float tensors and zero-pad up to ``max_len`` entries."""
    token_ids = [self.embeddings_dict.get(token, self.default) for token in tokens]
    encoded = [torch.tensor([token_id], dtype=torch.float32) for token_id in token_ids]
    missing = self.max_len - len(tokens)
    encoded.extend(torch.zeros(1) for _ in range(missing))
    return encoded

  def __getitem__(self, n:int):
    """Draw a random sentence of 1..max_len vocabulary words and a random binary label."""
    length = random.randint(1, self.max_len)
    sentence = [random.choice(self.vocab) for _ in range(length)]
    label = random.randint(0, 1)
    return self.encode_tokens(sentence), torch.tensor(float(label))

  def __len__(self):
    """Nominal number of sentences in the dataset."""
    return self.num_sentences

max_len = 20
model = DeepAveragingNetwork(max_len)
dataset = DummySentenceLabelDataset(num_sentences=10, max_len=max_len)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
dataset.print_this()

# let's measure the error rate for one epoch
error = 0.0
with torch.no_grad():
  for sentence, label in (dataloader):
    print(sentence)
    prediction = model(sentence)
    # prediction is (batch, 1) while label is (batch,): flatten both so the
    # difference is taken per sample instead of broadcasting to (batch, batch).
    # .item() keeps `error` a plain float so the report prints a number.
    error += (prediction.reshape(-1) - label.reshape(-1)).abs().sum().item()
print(f'error rate: {error/len(dataset)}')