<a href="https://colab.research.google.com/github/praj9719/ref_repo/blob/main/Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# One-time environment setup: download the NLTK resources used later for
# stop-word removal ('stopwords') and tokenization ('punkt'), then install
# num2words into the runtime via a shell escape.
import nltk
nltk.download('stopwords')  
nltk.download('punkt')
!pip install num2words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting num2words
[?25l  Downloading https://files.pythonhosted.org/packages/eb/a2/ea800689730732e27711c41beed4b2a129b34974435bdc450377ec407738/num2words-0.5.10-py3-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 4.9MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10


In [None]:
# Mount Google Drive at /content/gdrive; the dataset and vector paths used by
# the cells below all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import numpy as np
import math
import json

In [None]:
class PreProcessor:
    """Normalize a piece of raw text into a string of stemmed tokens.

    The pipeline lower-cases the text, strips punctuation and apostrophes,
    drops English stop words and single-character tokens, spells out integers
    with num2words and stems every token with the Porter stemmer.

    Usage: ``PreProcessor(text).execute()`` returns the processed text as a
    ``str`` in which every token is preceded by a single space (so a non-empty
    result starts with a space). Each step rewrites ``self.data`` in place.
    """

    # Characters that remove_punctuation() turns into spaces. The comma is not
    # listed because it is deleted rather than replaced by a space
    # (presumably so digit groups such as "1,000" stay one token).
    SYMBOLS = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"

    def __init__(self, data):
        """Store the raw text to be processed."""
        self.data = data

    def execute(self):
        """Run the full pipeline and return the processed text.

        Several steps are intentionally repeated: num2words emits hyphens,
        commas and stop words ("forty-one", "one hundred and one"), so
        punctuation removal, stemming and stop-word removal must run again
        after numbers have been spelled out.
        """
        self.convert_lower_case()
        self.remove_punctuation()  # commas are deleted, other symbols become spaces
        self.remove_apostrophe()
        self.remove_stop_words()
        self.convert_numbers()
        self.stemming()
        self.remove_punctuation()
        self.convert_numbers()
        self.stemming()  # stem the words produced by num2words
        self.remove_punctuation()  # num2words yields hyphens and commas (e.g. forty-one)
        self.remove_stop_words()  # num2words yields stop words (101 -> one hundred and one)
        return self.data

    def convert_lower_case(self):
        """Lower-case the text."""
        self.data = np.char.lower(self.data)

    def remove_stop_words(self):
        """Drop English stop words and tokens of length one."""
        # A set gives O(1) membership tests instead of scanning the list for
        # every token.
        stop_words = set(stopwords.words('english'))
        kept = [
            w for w in word_tokenize(str(self.data))
            if w not in stop_words and len(w) > 1
        ]
        # Every token is prefixed with a space, matching the output format of
        # the other token-level steps.
        self.data = "".join(" " + w for w in kept)

    def remove_punctuation(self):
        """Replace every symbol in SYMBOLS with a space and delete commas."""
        # Accumulate into `data` so each replacement builds on the previous
        # one; restarting from self.data on every iteration would keep only
        # the effect of the last symbol.
        data = self.data
        for symbol in self.SYMBOLS:
            data = np.char.replace(data, symbol, ' ')
            data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ',', '')
        self.data = data

    def remove_apostrophe(self):
        """Delete apostrophes without inserting a space ("don't" -> "dont")."""
        self.data = np.char.replace(self.data, "'", "")

    def stemming(self):
        """Replace every token with its Porter stem."""
        stemmer = PorterStemmer()
        tokens = word_tokenize(str(self.data))
        self.data = "".join(" " + stemmer.stem(w) for w in tokens)

    def convert_numbers(self):
        """Spell out integer tokens as words; other tokens pass through."""
        converted = []
        for w in word_tokenize(str(self.data)):
            try:
                w = num2words(int(w))
            except (ValueError, OverflowError):
                # Not an integer literal, or too large for num2words:
                # keep the token unchanged.
                pass
            converted.append(w)
        new_text = "".join(" " + w for w in converted)
        # num2words hyphenates compound numbers ("forty-one"); split them.
        self.data = np.char.replace(new_text, "-", " ")

In [None]:
# Smoke test: run the whole preprocessing pipeline on a sample question and
# display the resulting string.
PreProcessor("what is newtons second law of motion").execute()

' newtron second law motion'

In [None]:
class Doc2Vec:
    """TF-IDF document vectors over a directory of text files.

    Every file in ``data_path`` is one document; the file name without its
    extension is the topic name. ``execute()`` builds the document matrix
    ``D`` (documents x vocabulary), ``classify()`` ranks topics for a query by
    cosine similarity, and ``save()`` persists the metadata and the matrix.

    Args:
        data_path: Directory containing the document files.
        alpha: Factor applied to every body tf-idf weight in ``execute()``.
    """

    def __init__(self, data_path, alpha=0.3):
        self.root = data_path
        self.topics = []
        self.texts = []
        self.extract()
        self.alpha = alpha
        self.N = len(self.texts)
        print(f'[Info] alpha: {self.alpha} N: {self.N}')

    def extract(self):
        """Read every file under ``self.root`` into ``topics`` and ``texts``."""
        print(f'[Info] root: {self.root}')
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # document indices (and therefore the saved artifacts) reproducible.
        for file in sorted(os.listdir(self.root)):
            self.topics.append(os.path.splitext(file)[0])
            # os.path.join works whether or not root ends with a separator.
            with open(os.path.join(self.root, file), 'r') as f:
                self.texts.append(f.read())
        print(f'[Info] total topics: {len(self.topics)}')

    def execute(self):
        """Preprocess the corpus and build the document matrix ``self.D``."""
        self.pre_process()
        self.caculate_df()
        self.calculate_tf_idf()
        self.calculate_tf_idf_title()
        for key in self.tf_idf:
            self.tf_idf[key] *= self.alpha
        # Title weights are computed above but currently not merged into the
        # body weights:
        # for i in self.tf_idf_title:
        #     self.tf_idf[i] = self.tf_idf_title[i]
        self.D = np.zeros((self.N, self.total_vocab_size))
        for (doc, token), weight in self.tf_idf.items():
            column = self.vocab_index.get(token)
            if column is None:
                # Token outside the vocabulary: skip it.
                continue
            self.D[doc][column] = weight
        print(f'[Info] DocVector Shape: {self.D.shape}')

    def pre_process(self):
        """Tokenize the preprocessed body and title of every document."""
        self.processed_text = []
        self.processed_title = []
        for i in range(self.N):
            self.processed_text.append(word_tokenize(str(PreProcessor(self.texts[i]).execute())))
            self.processed_title.append(word_tokenize(str(PreProcessor(self.topics[i]).execute())))
        print(f'[Info] Processed Titles: {len(self.processed_title)} Processed Texts: {len(self.processed_text)}')

    def caculate_df(self):
        """Compute document frequencies and the vocabulary.

        Sets ``DF`` (token -> number of documents whose body or title contains
        it), ``total_vocab`` (tokens in first-seen order),
        ``total_vocab_size`` and ``vocab_index`` (token -> column in ``D``).
        """
        containing_docs = {}
        for i in range(self.N):
            for w in self.processed_text[i] + self.processed_title[i]:
                containing_docs.setdefault(w, set()).add(i)
        self.DF = {w: len(docs) for w, docs in containing_docs.items()}

        self.total_vocab_size = len(self.DF)
        self.total_vocab = list(self.DF)
        # Constant-time token -> column lookup; list.index() inside the vector
        # building loops would be quadratic in the vocabulary size.
        self.vocab_index = {w: idx for idx, w in enumerate(self.total_vocab)}
        print(f'[Info] Vocabulary Size: {self.total_vocab_size} Samples: {np.array(self.total_vocab[:8])}')

    # Correctly spelled alias; the original name is kept for existing callers.
    calculate_df = caculate_df

    def doc_freq(self, word):
        """Return the document frequency of ``word`` (0 if unknown)."""
        return self.DF.get(word, 0)

    def _tf_idf(self, primary, secondary):
        """Return {(doc, token): tf*idf} for every distinct token of ``primary[doc]``.

        Term frequency is counted over ``primary[doc] + secondary[doc]``; the
        inverse document frequency is log((N + 1) / (df + 1)).
        """
        weights = {}
        for doc in range(self.N):
            tokens = primary[doc]
            combined = tokens + secondary[doc]
            counter = Counter(combined)
            words_count = len(combined)
            for token in sorted(set(tokens)):
                tf = counter[token] / words_count
                idf = np.log((self.N + 1) / (self.doc_freq(token) + 1))
                weights[doc, token] = tf * idf
        return weights

    def calculate_tf_idf(self):
        """Compute ``self.tf_idf`` for the tokens of each document body."""
        self.tf_idf = self._tf_idf(self.processed_text, self.processed_title)

    def calculate_tf_idf_title(self):
        """Compute ``self.tf_idf_title`` for the tokens of each document title."""
        self.tf_idf_title = self._tf_idf(self.processed_title, self.processed_text)

    def cosine_sim(self, a, b):
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm. Dividing by zero would
        yield NaN, and NaN sorts last in argsort, which would make empty
        vectors look like the best matches in classify().
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator

    def gen_vector(self, tokens):
        """Return the tf-idf vector of a tokenized query over the vocabulary.

        Tokens that are not in the vocabulary are ignored.
        """
        Q = np.zeros((len(self.total_vocab)))
        counter = Counter(tokens)
        words_count = len(tokens)

        for token in sorted(set(tokens)):
            column = self.vocab_index.get(token)
            if column is None:
                continue
            tf = counter[token] / words_count
            idf = np.log((self.N + 1) / (self.doc_freq(token) + 1))
            Q[column] = tf * idf
        return Q

    def classify(self, k, query):
        """Return the ``k`` topics most similar to ``query``, best first.

        Each result is a set holding the topic name and the similarity score
        (offset by 0.0001 and rounded to 4 decimals). NOTE(review): a set has
        no order, so callers must tell the two members apart by type; the
        shape is kept for compatibility with existing callers.
        """
        if k <= 0:
            # [-0:] would select every document instead of none.
            return []
        preprocessed_query = PreProcessor(query).execute()
        tokens = word_tokenize(str(preprocessed_query))
        query_vector = self.gen_vector(tokens)

        d_cosines = [self.cosine_sim(query_vector, d) for d in self.D]
        out = np.array(d_cosines).argsort()[-k:][::-1]

        return [{self.topics[o], round(d_cosines[o] + 0.0001, 4)} for o in out]

    def save(self, path, name):
        """Write ``{path}{name}.json`` (metadata) and ``{path}{name}.npy`` (matrix).

        ``path`` is used as a plain string prefix, so a directory must end
        with a path separator. Requires execute() to have been run.
        """
        file_path = f'{path}{name}.json'
        print(f'[Info] saving file at {file_path}')
        data = {
            "N": self.N,
            "Name": name,
            "topics": self.topics,
            "DF": self.DF,
            "total_vocab": self.total_vocab,
        }
        with open(file_path, 'w') as f:
            json.dump(data, f)
        print('[Info] saving vector...')
        # np.save appends the .npy extension itself.
        np.save(path + name, self.D)
        print('[Info] files saved!')

# Science

In [None]:
# Base Science dataset: load the documents, build the tf-idf document matrix
# and persist the metadata and matrix under vectors/Base/ as 'Science'.
science = Doc2Vec('/content/gdrive/MyDrive/ircb/datasets/base/Science/')
science.execute()
science.save('/content/gdrive/MyDrive/ircb/vectors/Base/', 'Science')

[Info] root: /content/gdrive/MyDrive/ircb/datasets/base/Science/
[Info] total topics: 6
[Info] alpha: 0.3 N: 6
[Info] Processed Titles: 6 Processed Texts: 6
[Info] Vocabulary Size: 952 Samples: ['newton' 'law' 'motion' 'relat' 'forc' 'act' 'bodi' 'first']
[Info] DocVector Shape: (6, 952)
[Info] saving file at /content/gdrive/MyDrive/ircb/vectors/Base/Science.json
[Info] saving vector...
[Info] files saved!


In [None]:
# Show the three best-matching topics for a sample query, best match first.
science.classify(3, "what is diode")

[{0.0786, 'semiconductor'},
 {0.0001, 'human digestive system'},
 {0.0001, 'atom'}]

In [None]:
# Keep only the best match: results are sorted by descending similarity, so
# index 0 is the top-ranked topic.
science.classify(3, "newtons second law of motion")[0]

{0.499, 'laws of motion'}

# History

In [None]:
# Base History dataset: load the documents, build the tf-idf document matrix
# and persist the metadata and matrix under vectors/Base/ as 'History'.
history = Doc2Vec('/content/gdrive/MyDrive/ircb/datasets/base/History/')
history.execute()
history.save('/content/gdrive/MyDrive/ircb/vectors/Base/', 'History')

[Info] root: /content/gdrive/MyDrive/ircb/datasets/base/History/
[Info] total topics: 10
[Info] alpha: 0.3 N: 10
[Info] Processed Titles: 10 Processed Texts: 10
[Info] Vocabulary Size: 2089 Samples: ['known' 'great' 'war' '—a' 'land' 'air' 'sea' 'conflict']
[Info] DocVector Shape: (10, 2089)
[Info] saving file at /content/gdrive/MyDrive/ircb/vectors/Base/History.json
[Info] saving vector...
[Info] files saved!


In [None]:
# Best-matching history topic for a sample query (index 0 = top-ranked).
history.classify(3, "Who is shivaji maharaj")[0]

{0.1665, 'Maratha Empire'}

# SSC 

In [None]:
# SSC science dataset: build and persist its vectors as 'ssc_science'.
# NOTE(review): this rebinds `science`, which until now referred to the base
# Science model; cells below this point query the SSC model instead.
science = Doc2Vec('/content/gdrive/MyDrive/ircb/datasets/ssc/english/science/')
science.execute()
science.save('/content/gdrive/MyDrive/ircb/vectors/ssc/', 'ssc_science')

[Info] root: /content/gdrive/MyDrive/ircb/datasets/ssc/english/science/
[Info] total topics: 10
[Info] alpha: 0.3 N: 10
[Info] Processed Titles: 10 Processed Texts: 10
[Info] Vocabulary Size: 2949 Samples: ['asexu' 'reproduct' 'process' 'format' 'new' 'organ' 'speci' 'without']
[Info] DocVector Shape: (10, 2949)
[Info] saving file at /content/gdrive/MyDrive/ircb/vectors/ssc/ssc_science.json
[Info] saving vector...
[Info] files saved!


In [None]:
# Show the three best-matching topics for a sample query, best match first.
science.classify(3, "What is microbiology")

[{0.1289, 'Introduction to Microbiology'},
 {0.0001, 'Disaster Management'},
 {0.0001, 'Social Health'}]

In [None]:
# SSC history dataset: build and persist its vectors as 'ssc_history'.
# NOTE(review): this rebinds `history`, which until now referred to the base
# History model.
history = Doc2Vec('/content/gdrive/MyDrive/ircb/datasets/ssc/english/history/')
history.execute()
history.save('/content/gdrive/MyDrive/ircb/vectors/ssc/', 'ssc_history')

[Info] root: /content/gdrive/MyDrive/ircb/datasets/ssc/english/history/
[Info] total topics: 9
[Info] alpha: 0.3 N: 9
[Info] Processed Titles: 9 Processed Texts: 9
[Info] Vocabulary Size: 2764 Samples: ['histor' 'research' 'write' 'studi' 'carri' 'object' 'understand'
 'chronolog']
[Info] DocVector Shape: (9, 2764)
[Info] saving file at /content/gdrive/MyDrive/ircb/vectors/ssc/ssc_history.json
[Info] saving vector...
[Info] files saved!
