In [11]:
import nltk
import string
import json
import os
import pprint
import pysolr
import requests
import pandas as pd
import pickle
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing text

In [12]:
def pre_processing(paragraph):
    text = paragraph.replace("'", "")
    text = text.replace('“',"")
    text = text.replace('”',"")
    text = text.replace('"',"")
    text = text.replace('_',"")
    text = text.replace('—',"")
    text = text.replace('-'," ")
    text = text.replace('\n',"")
    stop = set(nltk.corpus.stopwords.words('english')+ list(string.punctuation))
    filtered_words = [i.lower().strip() for i in nltk.word_tokenize(text) if i not in stop]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]   
    para = ' '.join(lemmatized_words)
    return para

## Getting the list of books in the directory

In [16]:
book_files = os.listdir("Books")
book_files.remove('.DS_Store')

## Creating a dictionary with keys as title and author and values as paragraphs in the novels

In [17]:
book_para_dict = {}
content_text = ['\nContents','Contents','CONTENTS']
for book in book_files:
    with open(f'Books/{book}', 'r') as file:
        book_split = file.read().split('\n\n')
        title_line = next((line for line in book_split if line.startswith("Title:")), None)
        author_line = next((line for line in book_split if line.startswith("\nAuthor:")), None)
        name = title_line.split("Title:")[1].strip()
        author = author_line.split("\nAuthor:")[1].strip()
        book_split_clean = [pre_processing(ele) for ele in book_split] 
        start_line = next((line for line in book_split_clean if line.startswith("start of the project gutenberg ebook")), None)
        start_index = book_split_clean.index(start_line)
        end_line = next((line for line in book_split_clean if line.startswith("end of the project gutenberg ebook")), None)
        end_index = book_split_clean.index(end_line)
        paragraphs = [ele for ele in book_split_clean[start_index+1:end_index] if ele != ""]
    book_para_dict[(name,author)] = paragraphs


## Converting the dictionary to a dataframe

In [35]:
data = []
for (name, author), paragraphs in book_para_dict.items():
    combined_para = []
    for i in range(0, len(paragraphs),10):
        combined_text = ' '.join(paragraphs[i:i+10])
        combined_para.append(combined_text)
    for paragraph in combined_para:
        data.append({'title': name, 'author': author, 'paragraph': paragraph})

books_df = pd.DataFrame(data)
books_df['para_id'] = books_df.index
books_df.head()

Unnamed: 0,title,author,paragraph,para_id
0,The Adventures of Sherlock Holmes,Arthur Conan Doyle,the adventure sherlock holmes arthur conan doy...,0
1,The Adventures of Sherlock Holmes,Arthur Conan Doyle,wedlock suit remarked i think watson puton sev...,1
2,The Adventures of Sherlock Holmes,Arthur Conan Doyle,frequently how often well hundred time then ma...,2
3,The Adventures of Sherlock Holmes,Arthur Conan Doyle,i carefully examined writing paper upon waswri...,3
4,The Adventures of Sherlock Holmes,Arthur Conan Doyle,a pair sound said yes continued glancing ofthe...,4


In [36]:
print(books_df['title'].unique())

['The Adventures of Sherlock Holmes' 'Romeo and Juliet' 'The Iliad'
 "Gulliver's Travels into Several Remote Nations of the World"
 'Moby Dick; Or, The Whale' 'Against the Grain' 'Babbitt' 'Dracula'
 'The Pilgrim Fathers of New England: a history' 'The Alchemist'
 'Adventures of Huckleberry Finn' 'Hervey Willetts' 'The dark night'
 "Janet's boys"]


In [38]:
books_df.to_csv('novels_data.csv', index = False)

## SOLR indexing

In [48]:
CORE_NAME = "IRF23P3"
VM_IP = "34.125.172.59"

In [49]:
def delete_core(core=CORE_NAME):
    print('sudo su - solr -c "/opt/solr/bin/solr delete -c {core}"'.format(core=core))


def create_core(core=CORE_NAME):
    print('sudo su - solr -c "/opt/solr/bin/solr create -c {core} -n data_driven_schema_configs"'.format(
            core=core))

In [50]:
class Indexer:
    def __init__(self):
        self.solr_url = f'http://{VM_IP}:8983/solr/'
        self.connection = pysolr.Solr(self.solr_url + CORE_NAME, always_commit=True, timeout=5000000)

    def do_initial_setup(self):
        delete_core()
        create_core()

    def create_documents(self, docs):
        print(self.connection.add(docs))

    def add_fields(self):
        data = {
            "add-field": [
                {
                    "name": "title",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                },
                {
                    "name": "author",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                },
                {
                    "name": "paragraph",
                    "type": "text_en",
                    "indexed": True,
                    "multiValued": False
                },
                 {
                    "name": "para_id",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                }
            ]
        }

        print(requests.post(self.solr_url + CORE_NAME + "/schema", json=data).json())



In [51]:
i = Indexer()
i.do_initial_setup()
# i.add_fields()

sudo su - solr -c "/opt/solr/bin/solr delete -c IRF23P3"
sudo su - solr -c "/opt/solr/bin/solr create -c IRF23P3 -n data_driven_schema_configs"


In [52]:
i.add_fields()

{'responseHeader': {'status': 0, 'QTime': 754}}


In [53]:
books_df = pd.read_csv('novels_data.csv')
collection = books_df.to_dict('records')
i.create_documents(collection)

{
  "responseHeader":{
    "status":0,
    "QTime":10676}}

