In [61]:
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, random_split, Subset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Tuple

In [55]:
torch.manual_seed(32451365)

<torch._C.Generator at 0x21de4963490>

In [49]:
class SimpleNN(nn.Module):
    def __init__(self, in_values, out_values):
        super().__init__()
        self.dense1 = nn.Linear(in_values, 12673)
        self.drop1 = nn.Dropout()
        self.dense2 = nn.Linear(12673, 4000)
        self.drop2 = nn.Dropout()
        self.dense3 = nn.Linear(4000, 500)
        self.drop3 = nn.Dropout()
        self.last_dense = nn.Linear(500, out_values)
        
    def forward(self, x):
        x = F.relu(self.dense1(x))
        x = self.drop1(x)
        x = F.relu(self.dense2(x))
        x = self.drop2(x)
        x = F.relu(self.dense3(x)) 
        x = self.drop3(x) 
        x = self.last_dense(x)
        return x

In [50]:
class CommentDataset(Dataset):
    def __init__(self, csv_name):
        df = pd.read_csv(csv_name)
        
        self.labels = sorted(df.y.unique().tolist())
        X = TfidfVectorizer().fit_transform(df.x.values)
        y = df.y.apply(lambda x: self.labels.index(x)).values
        
        self.X = torch.from_numpy(X.toarray()).to_sparse()
        self.y = torch.from_numpy(y)
        
    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __repr__(self):
        fmt_str = ["Comment dataset for sentiment analysis."]
        fmt_str.append(f"Number of comments: {self.__len__()}")
        fmt_str.append(f"Number of words: {self.X.shape[1]}")
        fmt_str.append(f"Labels: {self.labels}")
        return '\n'.join(fmt_str)

In [51]:
ds = CommentDataset("../data/processed/comments_clean.csv")
print(ds)

Comment dataset for sentiment analysis.
Number of comments: 1617
Number of words: 5728
Labels: ['Negative', 'Positive']


In [52]:
text, lab = ds[0]
print(lab)
print(text.shape)

tensor(1)
torch.Size([5728])


In [57]:
def split_dataset(ds: Dataset,
                  train_size: float = 0.8) -> Tuple[Subset, Subset]:
    """Function taking an image folder and splitting it in a train and test set.
    To achieve random results, we recommend to set up the seed for
    reproductibility with `torch.manual_seed(seed)`.
    The splitting ratio can be the number of samples (int) or the dataset
    proportion (flaot between 0 and 1).
    Args:
        ds (Dataset): The dataset to be split
        train_size (float, optional): The number of samples or the
        proportion of the dataset. Defaults to 0.9.
    Returns:
        Tuple[Subset, Subset]: The training and validation subsets.
    """

    if isinstance(train_size, float):
        train_size = int(len(ds)*train_size)
    train_ds, val_ds = random_split(ds, [train_size, len(ds)-train_size])
    return train_ds, val_ds

In [62]:
train_ds, test_ds = split_dataset(ds)
len(train_ds), len(test_ds)

(1293, 324)

In [63]:
32 * 2 * 2 * 2

256

In [67]:
train_dl = DataLoader(train_ds, batch_size=256)
test_dl = DataLoader(test_ds, batch_size=64)
len(train_dl),len(test_dl)

(21, 6)

In [42]:
model = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

TypeError: __init__() missing 2 required positional arguments: 'in_values' and 'out_values'

In [None]:
model = Sequential()

model.add(Dense(units=12673,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=4000,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=500,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=3, activation='softmax'))

opt=tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)



In [2]:
12673 / 2

6336.5

In [None]:
model.fit(x=X_train, y=y_train, batch_size=256, epochs=100, validation_data=(X_test, y_test), verbose=1, callbacks=early_stop)