In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gensim
import pandas as pd
import json

In [None]:
# Read the scraped website text CSV into a DataFrame and display it.
WEBSITE_TEXT_CSV = '/kaggle/input/hackrx-20-bajaj-fin-serv/paras-and-lines-website-scraped.csv'
website_text_df = pd.read_csv(WEBSITE_TEXT_CSV)
website_text_df

# Training our Word2Vec Embeddings

We train domain-specific word embeddings using the Gensim library.

The concept of word embeddings is explained in this [YouTube video by Codebasics](https://www.youtube.com/watch?v=hQwFeIupNP0&t=2s).

We used this [Jupyter notebook](https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/42_word2vec_gensim/42_word2vec_gensim.ipynb) and this [YouTube video by Codebasics](https://www.youtube.com/watch?v=Q2NtCcqmIww&t=3s) to learn how to train our own word embeddings.

In [None]:
# Convert every scraped line into a list of tokens with gensim's simple_preprocess.
scraped_lines = website_text_df['lines']
to_tokens = gensim.utils.simple_preprocess
website_text = scraped_lines.apply(to_tokens)
website_text

In [None]:
# Create the Word2Vec model without a corpus; vocabulary building and
# training are done in the following cells.
word2vec_params = {
    'window': 10,
    'min_count': 2,
    'workers': 4,
}
model = gensim.models.Word2Vec(**word2vec_params)

In [None]:
# Build the model's vocabulary from the tokenised website text.
model.build_vocab(
    website_text,
    progress_per=1000,
)

In [None]:
# Train on the website corpus, using the example count and number of
# epochs already stored on the model.
model.train(
    website_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Optional: persist the trained model by calling model.save("<path>") with a real file path.

In [None]:
# Sanity check: show the words the model considers most similar to 'loan'.
model.wv.most_similar(positive=['loan'])

# Improving the word embeddings model with more data from Twitter

We also scraped all the tweets from the Bajaj Finserv Twitter handle and used them to further train, and thereby improve, our word embeddings model.

In [None]:
# Read the extracted Bajaj Finserv tweets CSV into a DataFrame and display it.
TWEETS_CSV = '/kaggle/input/hackrx-20-bajaj-fin-serv/tweets-extracted-from-bajaj-finserv-twitter.csv'
tweets_df = pd.read_csv(TWEETS_CSV)
tweets_df

In [None]:
#We used the tweet-preprocessor library to remove urls, hashtags, emojis from the extracted tweets
!pip3 install tweet-preprocessor

In [None]:
import preprocessor as p
import re

# Clean each tweet with tweet-preprocessor, then remove any remaining
# "www..." fragments.
www_fragment = re.compile(r"www\S+")
tweet_text_cleaned = (
    tweets_df.Text
    .apply(p.clean)
    .apply(lambda text: www_fragment.sub("", text))
)

# Convert the cleaned tweets into lists of tokens with gensim's simple_preprocess.
tweet_text_preprocessed = tweet_text_cleaned.apply(gensim.utils.simple_preprocess)
tweet_text_preprocessed

In [None]:
# Continue training the existing model on the tokenised tweets.

#model = gensim.models.Word2Vec.load('')
# update=True adds the tweet tokens to the existing vocabulary instead of rebuilding it.
model.build_vocab(
    tweet_text_preprocessed,
    update=True,
)
model.train(
    tweet_text_preprocessed,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
#model.save('')

In [None]:
# Re-check the neighbours of 'loan' after the additional training on tweets.
model.wv.most_similar(positive=['loan'])

In [None]:
# The model seems to be performing pretty well.

model.wv.most_similar(positive=['insurance'])

In [None]:
# Show the words the model considers most similar to 'demat'.
model.wv.most_similar(positive=['demat'])

# Using the model to recommend relevant articles

We use the heading of each article, which we extract from the article's URL, to decide which articles to suggest when a user searches for a keyword.

## Here comes the magic 

In [None]:
# Read the CSV of webpage URLs to recommend from into a DataFrame and display it.
URLS_CSV = '/kaggle/input/hackrx-20-bajaj-fin-serv/webpage-urls-to-recommend-from.csv'
urls_df = pd.read_csv(URLS_CSV)
urls_df

Using our word embedding model, we check whether any word in an article's heading is sufficiently similar to the query keyword; if so, that article is recommended.

An article is recommended only when the similarity score between one of its heading words and the query word is greater than 0.3.

This approach can easily be extended to multi-word keyword searches, as shown in [this notebook](https://www.kaggle.com/umus123/recommending-urls-based-on-keyword-search).

In [None]:
query_keyword = 'debt'

# Number of leading characters dropped from each link so that only the
# hyphen-separated heading remains.
# NOTE(review): presumably the length of a URL prefix shared by every link - confirm against the data.
URL_PREFIX_LENGTH = 37
# An article is recommended when any word of its heading scores above this
# similarity to the query word.
SIMILARITY_THRESHOLD = 0.3

# The heading words are lower-cased before the vocabulary lookup, so the
# query is normalised the same way.
query_word = query_keyword.lower()
vocabulary = model.wv.key_to_index

if query_word not in vocabulary:
    # model.wv.similarity raises KeyError for a word the model has never seen;
    # report that explicitly instead of failing part-way through the loop.
    print(f"'{query_keyword}' is not in the model vocabulary, so no articles can be recommended.")
else:
    for link in urls_df['links']:
        heading_in_url = link[URL_PREFIX_LENGTH:]
        heading_words = (word.lower() for word in heading_in_url.split('-'))

        # any() stops at the first sufficiently similar word, so each
        # heading is printed at most once.
        is_relevant = any(
            word in vocabulary
            and model.wv.similarity(word, query_word) > SIMILARITY_THRESHOLD
            for word in heading_words
        )
        if is_relevant:
            print(heading_in_url.replace('-', ' '))
            print()

# The model is able to recognize headings with words cibil score, loan,
# emi, credit score as relevant search results, which is pretty cool.