In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np

In [2]:
class Attention(nn.Module):
    def __init__(self, dimension):
        super(Attention, self).__init__()

        self.u = nn.Linear(dimension, dimension)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(1)

    def forward(self, h):
        # h : Batch * timestep * dimension
        #         print('h', h.shape)
        x = self.u(h)
        # u(h) : Batch * timestep * att_dim
        # print('u(h)', x)

        # tan(x) : Batch * timestep * att_dim
        x = self.tanh(x)
        # print('tanh(x)', x)

        # softmax(x) : Batch * timestep * att_dim
        x = self.softmax(x)
        # print(x)
        # print('softmax(h)', x.shape,  h.shape)
        # Batch matrix multiplication
        output = x * h
        #         print('output ', output.shape)
        output = torch.sum(output, dim=1)
        #         print('output ', output.shape)
        return output



In [3]:
class AttentionModel(torch.nn.Module):
    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, embedding_matrix):
        super(AttentionModel, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        et = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.word_embeddings.weights = nn.Parameter(et, requires_grad=False)
        self.lstm = nn.LSTM(embedding_length, hidden_size=hidden_size, batch_first=True,
                            dropout=0.5, num_layers=2, bidirectional=True)
        self.label = nn.Linear(hidden_size * 2, output_size)
        self.attn_module = Attention(hidden_size * 2)



    def forward(self, input_sentences, batch_size=None):

        input = self.word_embeddings(input_sentences)
        output, (final_hidden_state, final_cell_state) = self.lstm(input)  
        attn_output = self.attn_module(output)
        logits = self.label(attn_output)
        return logits

In [4]:
from sklearn import metrics
import os.path


In [5]:
def getResult(y_test, y_pred):

    if torch.is_tensor(y_test) and torch.is_tensor(y_pred):
        y_test, y_pred = cudaTocpu(y_test, y_pred)
    
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    true = report['1']
    fake = report['0']

    overall = {"Accuracy": metrics.accuracy_score(y_test, y_pred), "recall": metrics.recall_score(y_test, y_pred),
               "f1-score": metrics.f1_score(y_test, y_pred), "precision": metrics.precision_score(y_test, y_pred) }

    return true, fake, overall



In [6]:
def printResult(experiment, overall, fake):
    experiment = experiment.ljust(14)
    res = "{}     {:.2f}         {:.2f}        {:.2f}      #  {:.2f}         {:.2f}         {:.2f}".format(experiment,overall['precision'],overall['recall'],overall['f1-score'],fake['precision'],fake['recall'],fake['f1-score'])
    return res


In [8]:
def getReport(y_test,y_pred):

    if torch.is_tensor(y_test) and torch.is_tensor(y_pred):
        y_test, y_pred = cudaTocpu(y_test, y_pred)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Precision:", metrics.precision_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))
    print("F1-Score:", metrics.f1_score(y_test, y_pred))
    print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))


In [9]:
def saveResults(path,res):
    if os.path.isdir('./results') == False:
        os.mkdir('results')
    path = 'results/'+path
    if os.path.exists('./'+path)==False:
        with open(path, 'w', encoding="utf8") as file:
            file.write("                                Overall               #               Fake                \n")
            file.write("                   precision    recall      f1-score  #  precision    recall      f1-score\n")
            file.write(res+"\n")
    else:
        with open(path, 'a', encoding="utf8") as file:
            file.write(res+"\n")


In [10]:
def cudaTocpu(y_test,y_pred):
    y_test = [ y.cpu() if y.is_cuda else y for y in y_test ]
    y_pred = [ y.cpu() if y.is_cuda else y for y in y_pred ]

    return y_test, y_pred

In [11]:

learning_rater = 2e-5
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300
epochs = 3

in_channels = 1
out_channels = 256
kernel_heights = [1, 2, 3, 4]
stride = 1
padding = 0
keep_probab = 0.8

In [12]:
EMBEDDING_300 = '/content/drive/MyDrive/cc.bn.300.vec'

In [13]:
DATA_PATH = "/content/drive/MyDrive/TrainTest_puncremove (1).csv"

In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import sys
from keras_preprocessing import sequence,text

import string, re
from torch.utils.data import TensorDataset, DataLoader

In [15]:
def load_dataset(test_sen=None):


    EMBEDDING_FILE = EMBEDDING_300

    df = pd.read_csv(DATA_PATH)
    X = df["content"].values
    Y = df["label"].values
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

    # data preprocessing
    print(X[0])
    puncList = ["।", "”", "“", "’"]
    x = "".join(puncList)
    filterString = x + '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n০১২৩৪৫৬৭৮৯'
    tokenizer = text.Tokenizer(num_words=50000, filters=filterString, lower=False,)
    tokenizer.fit_on_texts(x_train)
    train_idx = tokenizer.texts_to_sequences(x_train)
    test_idx = tokenizer.texts_to_sequences(x_test)
    word_index = tokenizer.word_index

    embeddings_index = {}
    for i, line in enumerate(open(EMBEDDING_FILE, encoding="utf-8")):
        val = line.split()
        embeddings_index[val[0]] = np.asarray(val[1:], dtype='float32')
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    x_train = sequence.pad_sequences(train_idx, maxlen=32, padding='post', truncating='post')
    x_test = sequence.pad_sequences(test_idx, maxlen=32, padding='post', truncating='post')

    test_size = len(x_test)

    dev_size = (int)(test_size * 0.1)

    x_dev = x_test[:dev_size]
    x_test = x_test[dev_size:]
    y_dev = y_test[:dev_size]
    y_test = y_test[dev_size:]

    x_train = torch.tensor(x_train, dtype=torch.long)
    y_train = torch.tensor(y_train, dtype=torch.float32)

    train = TensorDataset(x_train, y_train)
    train_iter = DataLoader(train, batch_size=32)

    x_test = torch.tensor(x_test, dtype=torch.long)
    y_test = torch.tensor(y_test, dtype=torch.float32)

    test = TensorDataset(x_test, y_test)
    test_iter = DataLoader(test, batch_size=32)

    x_dev = torch.tensor(x_dev, dtype=torch.long)
    y_dev = torch.tensor(y_dev, dtype=torch.float32)

    valid = TensorDataset(x_dev, y_dev)
    valid_iter = DataLoader(valid, batch_size=32)
    word_embeddings = embedding_matrix
    vocab_size = 50000


    return vocab_size, word_embeddings, train_iter, valid_iter, test_iter

In [16]:
import time
from torch.autograd import Variable
import torch.optim as optim
from sklearn import metrics
import argparse

In [17]:
checkpoint_history = []
early_stop_monitor_vals = []
best_score = 0

In [18]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)


In [19]:
def train_model(model, train_iter, epoch, loss_fn):
    total_epoch_loss = 0
    total_epoch_acc = 0
    if torch.cuda.is_available():
        model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text, target = batch
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * (num_corrects/len(target))
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)


In [20]:
def eval_model(model, val_iter, loss_fn):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.eval()
    y_pred = []
    y_test = []
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text, target = batch
            if (text.size()[0] is not 32):
                continue
            # target = batch.label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)

            y_test.extend(target.data)
            y_pred.extend(torch.max(prediction, 1)[1].view(target.size()).data)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(target)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter), y_test, y_pred




In [21]:
def checkpoint_model(model_to_save, path_to_save, current_score, epoch, model_name, mode='max'):
    """
    Checkpoints models state after each epoch.
    :param model_to_save:
    :param optimizer_to_save:
    :param path_to_save:
    :param current_score:
    :param epoch:
    :param n_epoch:
    :param mode:
    :return:
    """


    model_state = {'epoch': epoch + 1,
                   'model_state': model_to_save.state_dict(),
                   'score': current_score,
                   }

    # Save the model as a regular checkpoint
    # torch.save(model_state, path_to_save + 'last.pth'.format(epoch))

    checkpoint_history.append(current_score)
    is_best = False

    # If the model is best so far according to the score, save as the best model state
    if ((np.max(checkpoint_history) == current_score and mode == 'max') or
            (np.min(checkpoint_history) == current_score and mode == 'min')):
        is_best = True
        best_score = current_score
        torch.save(model_state, path_to_save + '{}_best.pth'.format(model_name))

    print('Current best', max(checkpoint_history), 'after epoch {}'.format(epoch))

    return is_best



In [22]:
def load_saved_model(model, path):
    """
    Load a saved model from dump
    :return:
    """
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state'])






In [29]:
def run_model(model_name):

    
    vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()
    learning_rate = learning_rater
    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300
    epochs = 3

    # in_channels = in_channels
    # out_channels = out_channels
    # kernel_heights = kernel_heights
    # stride = stride
    # padding = padding
    # keep_probab = keep_probab


    if model_name == 'CNN':
        model = CNN(batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, word_embeddings)

    elif model_name == 'LSTM':
        model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)



    loss_fn = F.cross_entropy
    path = "/content/drive/MyDrive/HandWritten/"
    for epoch in range(epochs):
        train_loss, train_acc = train_model(model, train_iter, epoch, loss_fn)
        val_loss, val_acc, y_test, y_pred = eval_model(model, valid_iter, loss_fn)
        _, f, o = getResult(y_test, y_pred)
        current_f1 = f['f1-score']
        checkpoint_model(model, path, current_f1, epoch+1, model_name, 'max')
        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')

    
    load_saved_model(model, path + '{}_best.pth'.format(model_name))
    test_loss, test_acc, y_test, y_pred = eval_model(model, test_iter, loss_fn)
    print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

    print("                                Overall               #               Fake                ")
    print("                   precision    recall      f1-score  #  precision    recall      f1-score")
    _, f, o = getResult(y_test, y_pred)
    res = printResult(model_name,o,f)
    print(res)
    path = model_name+"_results.txt"
    saveResults(path, res)



In [30]:
def lstm(args):
    run_model('LSTM')

In [31]:

parser = argparse.ArgumentParser(description='Argparse!')
subparsers = parser.add_subparsers()



parser_q = subparsers.add_parser('LSTM')
parser_q.set_defaults(func=lstm)


parser.add_argument("-g","--gpu", action="store_true")
args = parser.parse_args()
gpu = args.gpu
if not gpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

args.func(args)

usage: ipykernel_launcher.py [-h] [-g] {LSTM} ...
ipykernel_launcher.py: error: invalid choice: '/root/.local/share/jupyter/runtime/kernel-edec1152-3c3f-41ec-8f9e-51c1fbff2151.json' (choose from 'LSTM')


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
run_model('LSTM')

ঢাকা আগামী ২৪ ঘণ্টায় গাইবান্ধা বগুড়া সিরাজগঞ্জ জামালপুর ও টাঙ্গাইলের বন্যা পরিস্থিতি উন্নতির সম্ভাবনা রয়েছে ব্রহ্মপুত্র ও গঙ্গার পানি কমে যাওয়া অব্যাহত থাকায় এই পূর্বাভাস দিয়েছে বাংলাদেশ পানি উন্নয়ন বোর্ডের বন্যা পূর্বাভাস ও সতর্কীকরণ কেন্দ্র পূর্বাভাসে জানানো হয় বৃহস্পতিবার ২০ সেপ্টেম্বর সকাল ৯টায় তিনটি নদীর পাঁচটি পয়েন্টে পানি বিপদসীমার ওপর দিয়ে প্রবাহিত হয় যমুনা নদীর ফুলছড়ি পয়েন্টে ৯ সেন্টিমিটার সারিয়াকান্দিতে ২১ সেন্টিমিটার ও বাহাদুরাবাদ পয়েন্টে পাঁচ সেন্টিমিটার ওপর দিয়ে পানি প্রবাহিত হতে দেখা যায় এছাড়া আত্রাই নদীর বাঘাবাড়ী পয়েন্টে ১৩ সেন্টিমিটার ও ধলেশ্বরী নদীর এলাশিন পয়েন্টে ৩৩ সেন্টিমিটার ওপর দিয়ে পানি প্রবাহিত হতে দেখা যায় এদিকে আপার মেঘনা অববাহিকার প্রধান নদীগুলোর পানি সমতল হ্রাস পাচ্ছে যা পরবর্তী ৪৮ ঘণ্টা অব্যাহত থাকবে বলে জানানো হয়েছে পদ্মা ও যমুনার পানিও স্থিতিশীল রয়েছে গত ২৪ ঘণ্টায় ভারতে বৃষ্টিপাতের কোনো রেকর্ড নেই বলেও পূর্বাভাস রিপোর্টে উল্লেখ করা হয়েছে
Epoch: 1, Idx: 100, Training Loss: 0.2625, Training Accuracy:  90.62%
Current best 0.7567567567567567 after epoch 1
Epo