<a href="https://colab.research.google.com/github/sanchit45/Practice/blob/main/Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import random
import re
import csv
from collections import Counter
from functools import partial
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF
from sklearn.metrics import classification_report
from tqdm import tqdm, tqdm_notebook

# PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
# nltk text processors
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']
plt.style.use('ggplot')
tqdm.pandas()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


device(type='cpu')

In [79]:


class FeedfowardTextClassifier(nn.Module):
    def __init__(self, device, vocab_size, hidden1, hidden2, num_labels, batch_size):
        super(FeedfowardTextClassifier, self).__init__()
        self.device = device
        self.batch_size = batch_size
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_labels)

    def forward(self, x):
        batch_size = len(x)
        if batch_size != self.batch_size:
            self.batch_size = batch_size
        x = torch.FloatTensor(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))



In [80]:
model=torch.load("/content/drive/MyDrive/bow.pth")

token_idx_mapping=model["token2idx"]

bow_model = FeedfowardTextClassifier(
    vocab_size=len(token_idx_mapping),
    hidden1=100,
    hidden2=50,
    num_labels=2,
    device=device,
    batch_size=528,
)


bow_model.load_state_dict(model["state_dict"])
token_idx_mapping=model["token2idx"]
idx_token_mapping=model["index2token"]
print(bow_model)
print(model["state_dict"])




FeedfowardTextClassifier(
  (fc1): Linear(in_features=1001, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=2, bias=True)
)
OrderedDict([('fc1.weight', tensor([[ 0.0641,  0.0777,  0.0736,  ..., -0.0120,  0.0778,  0.0525],
        [ 0.0108,  0.0332,  0.0205,  ...,  0.0196,  0.0811,  0.0330],
        [ 0.0385,  0.0363,  0.0224,  ..., -0.0252,  0.0674,  0.0468],
        ...,
        [-0.0169,  0.0522, -0.0166,  ...,  0.0073, -0.1093, -0.0380],
        [ 0.0412,  0.0564,  0.0108,  ...,  0.0010,  0.0973,  0.0652],
        [ 0.0522,  0.0629,  0.0614,  ...,  0.0130,  0.0807,  0.0153]])), ('fc1.bias', tensor([ 0.1516, -0.0235, -0.0126,  0.1621,  0.1890, -0.0100,  0.0177, -0.0038,
         0.1844,  0.0191,  0.1438, -0.0268,  0.1655,  0.1424, -0.0088,  0.1266,
        -0.0149,  0.1494, -0.0268, -0.0415,  0.1577,  0.1955, -0.0312,  0.1441,
         0.0134,  0.1635, -0.0129, -0.0006, -0.0392,  0.1374,  0.0815, -

In [81]:
tfidf_model = FeedfowardTextClassifier(
    vocab_size=len(token_idx_mapping),
    hidden1=100,
    hidden2=50,
    num_labels=2,
    device=device,
    batch_size=528,
)
tfidf_model

FeedfowardTextClassifier(
  (fc1): Linear(in_features=1001, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=2, bias=True)
)

In [82]:
tfidf=torch.load("/content/drive/MyDrive/tfidf.pth")
tfidf_model.load_state_dict(tfidf["state_dict"])
print(tfidf_model)

FeedfowardTextClassifier(
  (fc1): Linear(in_features=1001, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=2, bias=True)
)


In [83]:
def build_vocab(corpus):
    vocab = {}
    for token in corpus:
        if token not in vocab.keys():
              vocab[token] = len(vocab)
    return vocab

def build_index2token(vocab):
    index2token = {}
    for token in vocab.keys():
        index2token[vocab[token]] = token
    return index2token

def tokenize(text, stop_words, lemmatizer):
    text = re.sub(r'[^\w\s]', '', text) # remove special characters
    text = text.lower() # lowercase
    tokens = wordpunct_tokenize(text) # tokenize
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # noun lemmatizer
    tokens = [lemmatizer.lemmatize(token, "v") for token in tokens] # verb lemmatizer
    tokens = [token for token in tokens if token not in stop_words] # remove stopwords
    return tokens

def build_bow_vector(sequence, idx2token):
    vector = [0] * len(idx2token)
    for token_idx in sequence:
        if token_idx not in idx2token:
            raise ValueError('Wrong sequence index found!')
        else:
            vector[token_idx] += 1
    return vector

def preprocess(data,token_idx_mapping,idx_token_mapping,feature):

        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        tokens=tokenize(data, stop_words, lemmatizer)
        vocab=build_vocab(tokens)
        index2token=build_index2token(vocab)
        doc=list(vocab.keys())
        sequence=[token_idx_mapping[token] for token in doc if token in token_idx_mapping]
        idx2token=idx_token_mapping
        bow_vector=build_bow_vector(sequence, idx2token)
        """vectorizer = TfidfVectorizer(
            analyzer='word',
            tokenizer=lambda doc: doc,
            preprocessor=lambda doc: doc,
            token_pattern=None,
        )
        tfidf_vectors = vectorizer.fit_transform(tokens).toarray()
        tfidf_vector = [vector.tolist() for vector in tfidf_vectors]"""
        vectorizer = TfidfVectorizer()

        # Fit and transform the input text to obtain TF-IDF vectors
        tfidf_vectors = vectorizer.fit_transform(tokens).toarray()


        # Convert the TF-IDF vectors to a list of lists
        tfidf_vector = tfidf_vectors.tolist()
        if feature =="bow":
            return bow_vector
        elif feature=="tfidf":
            #return tfidf_vector
            print(tfidf_vector)

In [84]:
data="Quora is a place to gain and share knowledge. It's a platform to ask questions and connect with people who contribute unique insights and quality answers "

In [85]:
preprocess(data,token_idx_mapping,idx_token_mapping,feature="bow")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


#use BOw model(just below this text cell) ,few input issues in tfidf

In [86]:
bow_model.eval()
bow_model.to(device)
y_pred = []


with torch.no_grad():

        inputs = preprocess(data,token_idx_mapping,idx_token_mapping,feature="bow")
        probs = bow_model([inputs])


        probs = probs.detach().cpu().numpy()
        predictions = np.argmax(probs, axis=1)
        y_pred.extend(predictions)


print(y_pred)

[0]


In [None]:
# working on this one
tfidf_model.eval()
tfidf_model.to(device)
y_pred = []


with torch.no_grad():

        inputs = preprocess(data,token_idx_mapping,idx_token_mapping,feature="tfidf")
        probs = tfidf_model([inputs])


        probs = probs.detach().cpu().numpy()
        predictions = np.argmax(probs, axis=1)
        y_pred.extend(predictions)


print(y_pred)

  (0, 16)	0.39028945195301484
  (0, 8)	0.455453973865905
  (0, 15)	0.657873467373268
  (0, 11)	0.455453973865905
  (1, 7)	0.20259317165969928
  (1, 14)	0.40518634331939857
  (1, 17)	0.20259317165969928
  (1, 4)	0.20259317165969928
  (1, 0)	0.40518634331939857
  (1, 12)	0.40518634331939857
  (1, 5)	0.17104307638950533
  (1, 9)	0.17104307638950533
  (1, 13)	0.20259317165969928
  (1, 1)	0.34208615277901067
  (1, 16)	0.1465709212673861
  (1, 8)	0.34208615277901067
  (1, 11)	0.17104307638950533
  (2, 7)	0.21211528370190072
  (2, 14)	0.42423056740380144
  (2, 17)	0.21211528370190072
  (2, 4)	0.21211528370190072
  (2, 0)	0.42423056740380144
  (2, 12)	0.42423056740380144
  (2, 5)	0.17908229767263645
  (2, 9)	0.17908229767263645
  (2, 13)	0.21211528370190072
  (2, 1)	0.17908229767263645
  (2, 16)	0.15345992311775974
  (2, 8)	0.3581645953452729
  (2, 11)	0.17908229767263645
  (3, 6)	0.657873467373268
  (3, 5)	0.455453973865905
  (3, 1)	0.455453973865905
  (3, 16)	0.39028945195301484
  (4, 10)	0.

TypeError: ignored