In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK packages used below: the 'punkt' tokenizer package and the
# 'stopwords' corpus.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)
def textrank(document):
    """Return the sentence of ``document`` that is most similar to the rest.

    The document is split into sentences with ``sent_tokenize``. Each
    sentence is turned into a TF-IDF vector (default ``TfidfVectorizer``
    settings, fitted on the sentences themselves) and scored by the sum of
    its cosine similarities to every *other* sentence. The sentence with the
    highest score is returned; on a tie the earliest sentence wins.

    Args:
        document: The text to rank, as a single string.

    Returns:
        The highest-scoring sentence. If the document yields no sentences an
        empty string is returned, and if it yields exactly one sentence that
        sentence is returned as is.

    Errors raised by the tokenizer or the vectorizer are not caught here.
    """
    sentences = sent_tokenize(document)

    # With fewer than two sentences there is nothing to compare against, so
    # answer directly instead of scoring (the scoring below would otherwise
    # have to take the maximum of an empty collection).
    if not sentences:
        return ""
    if len(sentences) == 1:
        return sentences[0]

    # Pairwise sentence similarity on TF-IDF vectors.
    tfidf = TfidfVectorizer().fit_transform(sentences)
    cosine_similarities = cosine_similarity(tfidf)

    # Score = sum of similarities to all other sentences; the diagonal
    # (similarity of a sentence with itself) is skipped.
    sentence_count = len(sentences)
    scores = [
        sum(cosine_similarities[i][j] for j in range(sentence_count) if j != i)
        for i in range(sentence_count)
    ]

    # max() returns the first index holding the highest score, so ties are
    # resolved in favour of the earlier sentence.
    best_index = max(range(sentence_count), key=scores.__getitem__)
    return sentences[best_index]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

# Demo: rank the sentences of a small sample text and print the top one.
document = """
She ran to her desk and pulled out a journal, jotting down her latest symptom.
If she didn't have a place she felt was safe enough, maybe she wrote her journal in code.
Franke in two articles in the Journal of the Pali Text Society for 1903, and in his Geschichte and Kritik der einheimischen Pali Grammatik.
Miss Annie's pretty much spelled it out in black and white in this here journal, even if it was in code.
Every day, I write in a journal.
Between Cynthia's and Donnie's efforts, only a few pages of the journal remained undeciphered.
He glanced up at her, closing the journal.
She closed the journal and began her preparation.
His valuable notes on Indian dialects are in The Transactions of the American Philosophical Society (1862), in The American Journal of Science (1862) and in The Proceedings of the American Philosophical Society (1869).
The Annie of Dean's dreams had long blonde hair but kept her head turned from him as she wrote in her journal.
"""
top_sentence = textrank(document)
print(top_sentence)

She closed the journal and began her preparation.


In [None]:
import nltk 
# Download the 'gutenberg' corpus and the 'punkt' tokenizer package.
for resource_name in ('gutenberg', 'punkt'):
    nltk.download(resource_name)
gb = nltk.corpus.gutenberg

# Show every file id the corpus offers, one per line.
for fileid in gb.fileids():
    print(fileid)

# Sentences of 'austen-emma.txt' as returned by the corpus reader, and how
# many there are.
sentences = gb.sents('austen-emma.txt')
print(len(sentences))



[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


austen-emma.txt
austen-persuasion.txt
austen-sense.txt
bible-kjv.txt
blake-poems.txt
bryant-stories.txt
burgess-busterbrown.txt
carroll-alice.txt
chesterton-ball.txt
chesterton-brown.txt
chesterton-thursday.txt
edgeworth-parents.txt
melville-moby_dick.txt
milton-paradise.txt
shakespeare-caesar.txt
shakespeare-hamlet.txt
shakespeare-macbeth.txt
whitman-leaves.txt
7752


In [None]:
# Load the raw text of Emma and flatten it into a list of sentences using a
# plain split on '.'.
rawtext = gb.raw('austen-emma.txt')

# Normalise line breaks and quotes, and drop the period from abbreviations so
# the split on '.' below does not cut a sentence after them.
processedtext = (
    rawtext
    .replace('\r', '')
    .replace('\n', ' ')
    .replace('."', '.')
    .replace('Mr.', 'Mr')
    .replace('Mrs.', 'Mrs')  # was 'Mr', which turned every "Mrs." into "Mr"
    .replace('"  ', '.')
    .replace('"', '')
    .replace('W.', 'W')
    .replace('_', '')
)

# One entry per '.'-delimited chunk, with surrounding whitespace removed.
sentencesarray = [s.strip() for s in processedtext.split('.')]
import re
# sentencesarray = re.split(r'\.|\s{2}', sentences)
# sentencesarray = sentences.split('   ')

In [None]:
# Print items 300 to 599 of the split sentences, one per line.
for sentence in sentencesarray[300:600]:
    print(sentence)

In [None]:
!pip install pysolr
import pysolr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pysolr
  Downloading pysolr-3.9.0.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pysolr
  Building wheel for pysolr (setup.py) ... [?25l[?25hdone
  Created wheel for pysolr: filename=pysolr-3.9.0-py2.py3-none-any.whl size=19691 sha256=1e05899457494e2bc31926f6e84fc9a900611e3f39baa658f6b1889c0552ab69
  Stored in directory: /root/.cache/pip/wheels/8f/e3/bb/f4c2a751045824a821ab25757e9762a66da88028d8f5f136ce
Successfully built pysolr
Installing collected packages: pysolr
Successfully installed pysolr-3.9.0


In [None]:
import os

# Solr core that receives the sentences. The endpoint can be overridden with
# the SOLR_URL environment variable; the default is the address this notebook
# was originally written against.
SOLR_URL = os.environ.get('SOLR_URL', 'http://35.223.110.79:8983/solr/mycol1/')

# always_commit=True asks the client to commit on every update request.
solr = pysolr.Solr(SOLR_URL, always_commit=True)

# Health check; the raw response is shown as the cell output.
solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":5,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all",\n      "rid":"-60"}},\n  "status":"OK"}\n'

In [None]:
import time
# Index every Emma sentence that is at least 10 characters long.
#
# Documents are sent in batches instead of one request per sentence: with a
# client that commits on every request, per-sentence adds are very slow and
# previously had to be throttled with sleeps. Batching makes the throttle
# unnecessary and matches how the news ingestion cell uploads its documents.
EMMA_BATCH_SIZE = 1000

emma_docs = [
    {
        "text": value,
        "source": "Emma by Jane Austen"
    }
    for value in sentencesarray
    if len(value) >= 10
]

for start in range(0, len(emma_docs), EMMA_BATCH_SIZE):
    solr.add(emma_docs[start:start + EMMA_BATCH_SIZE])
    

In [None]:
# Write `string` wrapped in [ ] to burgess-final2.json.
# NOTE(review): `string` is not assigned in this cell; confirm which earlier
# cell builds it before running on a fresh kernel.
# The context manager closes the file even if the write fails.
with open("burgess-final2.json", "w") as text_file:
    n = text_file.write('['+string+']')

In [None]:
'-------------------------------------------------------------------------------'

'-------------------------------------------------------------------------------'

In [None]:
import zipfile

# Unpack the news archive into the 'innerzip' folder. The context manager
# closes the archive even if extraction fails part-way.
with zipfile.ZipFile('/content/660_webhose-2015-10-new_20170904095249.zip', 'r') as zip_file:
    zip_file.extractall('innerzip')


In [None]:
import os
import json
import ast
# os.listdir returns entries in arbitrary order; sort them so this preview is
# reproducible and lines up with the sorted order the ingestion cell slices.
files = sorted(os.listdir('innerzip'))
print(files[:100])




['news_0010192.json', 'news_0084483.json', 'news_0081358.json', 'news_0076663.json', 'news_0012820.json', 'news_0005291.json', 'news_0055289.json', 'news_0080127.json', 'news_0030281.json', 'news_0031555.json', 'news_0042797.json', 'news_0030051.json', 'news_0023055.json', 'news_0063291.json', 'news_0077453.json', 'news_0057331.json', 'news_0077690.json', 'news_0027988.json', 'news_0027610.json', 'news_0084802.json', 'news_0024973.json', 'news_0001222.json', 'news_0011470.json', 'news_0067856.json', 'news_0047708.json', 'news_0018662.json', 'news_0059279.json', 'news_0002492.json', 'news_0016072.json', 'news_0012520.json', 'news_0079669.json', 'news_0074103.json', 'news_0081674.json', 'news_0078577.json', 'news_0072855.json', 'news_0051449.json', 'news_0004950.json', 'news_0074029.json', 'news_0084144.json', 'news_0053258.json', 'news_0081998.json', 'news_0031588.json', 'news_0020871.json', 'news_0069898.json', 'news_0006208.json', 'news_0051042.json', 'news_0060520.json', 'news_005712

In [None]:
# DESTRUCTIVE: deletes every document matching the query '*:*' from the core
# that `solr` is connected to. Run only when the index should be rebuilt.
solr.delete(q='*:*')

'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n\n<lst name="responseHeader">\n  <int name="status">0</int>\n  <int name="QTime">11</int>\n</lst>\n</response>\n'

In [None]:
!pip install sentsplit
from sentsplit.segment import SentSplit
sent_splitter = SentSplit('en')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


2023-03-27 17:27:49.956 | INFO     | sentsplit.segment:__init__:47 - SentSplit for EN loaded:
{ 'handle_multiple_spaces': True,
  'maxcut': 500,
  'mincut': 7,
  'model': 'crf_models/en-default-25032021.model',
  'ngram': 5,
  'prevent_regexes': [ { 'name': 'liberal_url',
                         'regex': '\\b((?:[a-z][\\w\\-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]|\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\))+(?:\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\\\'".,<>?«»“”‘’]))'},
                       { 'name': 'period_followed_by_lowercase',
                         'regex': '\\.(?= *[a-z])'}],
  'prevent_word_split': True,
  'segment_regexes': [ {'at': 'end', 'name': 'after_semicolon', 'regex': ' *;'},
                       { 'at': 'end',
                         'name': 'ellipsis',
                         'regex': '…(?![\\!\\?\\.．？！])'},
                       {'at': 'end', 'name': 'newline', 'regex': '\\n'}],
  'strip_

In [None]:
folder_path = 'innerzip'
import os
import json

# Position in the sorted file list at which to start; everything before it is
# skipped. NOTE(review): presumably the earlier files were ingested by a
# previous run - confirm before changing.
START_INDEX = 60001
# Upload once more than this many documents are pending.
SOLR_BATCH_SIZE = 1000
# Tag stored on every document in its 'processed' field.
PROCESSED_TAG = '27Mar'

# Pending documents. Initialised here so the cell runs on a fresh kernel
# instead of relying on a list left over from another cell.
tosolr = []

# Walk the extracted files in sorted order, split each article into sentences
# and upload them to Solr in batches.
for index, filename in enumerate(sorted(os.listdir(folder_path))[START_INDEX:]):

    # Only JSON files are articles.
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(folder_path, filename)
    # Read with an explicit encoding so the result does not depend on the
    # platform default, and release the file before the slower processing.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    source = data['thread']['site']
    rawtext = data['text']

    # Remove backslashes and turn each newline into three spaces before
    # handing the text to the sentence splitter.
    proc_sentences = sent_splitter.segment(rawtext.replace("\\", "").replace('\n', '   '))

    for sent in proc_sentences:
        text = sent.strip()
        # Strip first, then test: whitespace-only segments used to pass the
        # length check and were indexed as empty documents.
        if not text:
            continue
        tosolr.append({'text': text, 'source': source, 'processed': PROCESSED_TAG})

    # Flush once enough documents have accumulated.
    if len(tosolr) > SOLR_BATCH_SIZE:
        solr.add(tosolr)
        tosolr = []

    if index % 500 == 0:
        print('Processed '+str(index)+' files')

# Upload whatever is left after the last file; the response is the cell output.
solr.add(tosolr)

Processed 0 files
Processed 500 files
Processed 1000 files
Processed 1500 files
Processed 2000 files
Processed 2500 files
Processed 3000 files
Processed 3500 files
Processed 4000 files
Processed 4500 files
Processed 5000 files
Processed 5500 files
Processed 6000 files
Processed 6500 files
Processed 7000 files
Processed 7500 files
Processed 8000 files
Processed 8500 files
Processed 9000 files
Processed 9500 files
Processed 10000 files
Processed 10500 files
Processed 11000 files
Processed 11500 files
Processed 12000 files
Processed 12500 files
Processed 13000 files
Processed 13500 files
Processed 14000 files
Processed 14500 files
Processed 15000 files
Processed 15500 files
Processed 16000 files
Processed 16500 files
Processed 17000 files
Processed 17500 files
Processed 18000 files
Processed 18500 files
Processed 19000 files
Processed 19500 files
Processed 20000 files
Processed 20500 files
Processed 21000 files
Processed 21500 files
Processed 22000 files
Processed 22500 files
Processed 23

'{\n  "responseHeader":{\n    "status":0,\n    "QTime":169}}\n'

In [None]:
# NOTE(review): `data` is not assigned in this cell; the only visible
# assignment is inside the folder ingestion loop, so this operates on whichever
# record that loop loaded last.
source = data['thread']['site']
fulltext = data['text']
import re

# Drop the periods of a few abbreviations, then split on newlines and periods.
# The pattern is a raw string: in a normal string literal '\.' is an invalid
# escape sequence, which newer Python versions warn about.
texts = re.split(
    r'\n|\.',
    fulltext.replace('Mr.', 'Mr').replace('U.S.', 'US').replace('Sen.', 'Sen').replace('i.e.', 'ie'),
)
``

 

['Watching the conventional media’s coverage of the 2016 presidential race is disheartening, to put it lightly', ' The personality politics, appealing to the emotions of voters rather than their reason and intellect, tell a story all too familiar for the United States', ' ', 'Take for example the Republican Party’s frontrunners Donald Trump and Jeb Bush', ' ', 'One could think of a plethora of reasons to criticize the former governor of Florida: his undying support for big banks that precipitated the collapse of the economy; his avowed commitment to the construction of more prisons and harsher sentences to non-violent offenders that cost taxpayers more money and perpetuate recidivism; his plans to defund programs and organizations that are often the only places women can turn for healthcare and reproductive services', ' ', 'Yet, rather than offering any criticism of substance, Mr Trump invoked Mr Bush’s "low-energy" in order to get ahead in the polls, a telling signal that many people 

In [None]:
# NOTE(review): `sentences` is not assigned in this cell - confirm which cell
# produces the value being counted here before running on a fresh kernel.
print(len(sentences))

15


In [None]:
# Print each sentence on its own line with surrounding whitespace removed.
for sentence in sentences:
    print(sentence.strip())

Treasury Secretary Jack Lew warned Congress on Thursday that it must raise the debt ceiling by Nov 3 in order to guarantee that the government will be able to avoid a default on the debt.
Lew's letter makes the debt ceiling a more urgent matter than it already was for Congress.
Previously, Lew had warned that he would exhaust the 'extraordinary measures' he's using to make payments under the $18.1 trillion debt limit on Nov 5.
Now, he warned, he expects to be left with no resources except for $30 billion in cash after Nov 3, less than three weeks away.
Related Story: http://www.washingtonexaminercom/article/2574118/.
Lew's message will increase the pressure on outgoing House Speaker John Boehner, R-Ohio, to raise the debt ceiling before his planned retirement at the end of October.
Republicans have yet to select a new speaker after Majority Leader Kevin McCarthy, R-Calif., bowed out of the race, which means Republicans may be hamstrung in dealing with the debt ceiling until they find a