<a href="https://colab.research.google.com/github/sreedeepack/Q-Exchange/blob/dev/IR%20workbench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

In [8]:
import pandas as pd
import nltk
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from math import log10

In [23]:
class Tokenizer(object):
    '''
    Cleans, tokenizes and stems text.

    Stop words are NOT removed: they stay in the output and are only
    exempted from stemming (see ``stem``). Tokens that are not purely
    alphanumeric (punctuation, contractions, ...) are dropped by
    ``__call__``.
    '''
    def __init__(self):
        # Fetch the NLTK resources used below; raises if a download fails.
        nltk.download('stopwords', quiet=True, raise_on_error=True)
        nltk.download('punkt', quiet=True, raise_on_error=True)

        # Load the stop-word list once and derive both attributes from it.
        stop_words = nltk.corpus.stopwords.words('english')

        # Not read anywhere in this class; kept for backward compatibility.
        self.tokenized_stop_words = nltk.word_tokenize(' '.join(stop_words))
        # Set used for the membership test in stem().
        self._stop_words = set(stop_words)

        self.stemmer = nltk.stem.PorterStemmer()

    @staticmethod
    def clean_str(text) :
        '''
        Normalize raw text before tokenization.

        Steps, in order: drop non-ASCII characters, remove HTML entities
        (``&amp;``, ``&#39;``, ``&#x27;``), remove ``>``, remove the
        characters ``] | [ @ , $ % * & \\ ( ) " :``, turn ``-`` into a
        space, remove all dots, strip leading whitespace, lowercase.

        Parameters:
            text: String to clean

        Returns:
            Cleaned, lower-cased string
        '''
        # Characters that cannot be encoded as ASCII are silently dropped.
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Match only well-formed entities. A lazy "&.*?;" would also delete
        # ordinary text sitting between any '&' and the next ';'.
        text = re.sub(r"&#?\w+;", "", text)
        text = text.replace(">", "")
        text = re.sub(r'[\]|\[@,$%*&\\()":]', "", text)
        text = text.replace("-", " ")
        text = re.sub(r"\.+", "", text)
        # Only leading whitespace is stripped (trailing is left as is).
        text = re.sub(r"^\s+", "", text)
        return text.lower()

    def stem(self, token):
        '''
        Apply stemming to tokens. Uses nltk PorterStemmer

        Parameters:
            token: String to stem
        
        Returns:
            Stemmed token; stop words are returned unchanged
        '''
        if token in self._stop_words:
            return token
        return self.stemmer.stem(token)

    def tokenize(self, line):
        '''
        Call this to tokenize a string. Also does stemming.

        Parameters:
            line (string)
        
        Returns:
            list of stemmed tokens
        '''
        return self.__call__(line)

    def __call__(self, line):
        '''
        Tokenize ``line`` with nltk.word_tokenize, stem every token and
        keep only the stemmed tokens that are purely alphanumeric.

        Parameters:
            line (string)

        Returns:
            list of stemmed tokens
        '''
        stemmed = (self.stem(token) for token in nltk.word_tokenize(line))
        return [token for token in stemmed if token.isalnum()]


In [13]:
%pip install jsonlines

Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


In [33]:
import jsonlines

def foo(file='stack.jl'):
    '''
    Lazily preprocess the records of a JSON Lines file.

    Each record's 'title' and 'desc' are cleaned with
    Tokenizer.clean_str and then tokenized/stemmed; 'url' is copied
    through unchanged.

    Parameters:
        file: Path of the JSON Lines file to read

    Yields:
        dict with keys 'url' (as read), 'title' and 'desc' (token lists)
    '''
    tokenizer = Tokenizer()

    def _to_tokens(raw_text):
        # Clean first, then tokenize + stem.
        return tokenizer.tokenize(Tokenizer.clean_str(raw_text))

    with jsonlines.open(file) as reader:
        for record in reader:
            yield {
                'url': record['url'],  # use as id?
                'title': _to_tokens(record['title']),
                'desc': _to_tokens(record['desc']),
            }
                

In [34]:
# Smoke test: stream the default dump through the preprocessing pipeline
# and print the token list produced for every title.
for title_tokens in (each['title'] for each in foo()):
    print(title_tokens)

['what', 'is', 'the', 'rational', 'for', 'all', 'comparison', 'return', 'fals', 'for', 'ieee754', 'nan', 'valu']
['how', 'can', 'i', 'deseri', 'json', 'to', 'a', 'simpl', 'dictionari', 'stringstr', 'in', 'aspnet']
['how', 'do', 'i', 'read', 'a', 'larg', 'csv', 'file', 'with', 'panda']
['all', 'falsey', 'valu', 'in', 'javascript']
['can', 'i', 'have', 'an', 'onclick', 'effect', 'in', 'css']
['get', 'record', 'with', 'max', 'valu', 'for', 'each', 'group', 'of', 'group', 'sql', 'result']
['what', 'are', 'the', 'use', 'of', 'use', 'in', 'c']
['how', 'to', 'find', 'the', 'statist', 'mode']
['why', 'does', 'python', 'use', 'after', 'for', 'and', 'while', 'loop']
['regular', 'express', 'to', 'match', 'dn', 'hostnam', 'or', 'ip', 'address']
['convers', 'to', 'dalvik', 'format', 'fail', 'with', 'error', '1', 'on', 'extern', 'jar']
['use', 'to', 'gener', 'and', 'save', 'a', 'file']
['why', 'would', 'a', 'javascript', 'variabl', 'start', 'with', 'a', 'dollar', 'sign', 'duplic']
['what', 'the', 'd

In [None]:
# TODO: index construction (store term positions?)

# **Extracting data**

In [3]:
%shell git clone https://github.com/sreedeepack/Q-Exchange.git

Cloning into 'Q-Exchange'...
remote: Enumerating objects: 126, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 126 (delta 64), reused 100 (delta 43), pack-reused 0[K
Receiving objects: 100% (126/126), 19.81 KiB | 3.96 MiB/s, done.
Resolving deltas: 100% (64/64), done.




In [1]:
%pip install scrapy

Collecting scrapy
[?25l  Downloading https://files.pythonhosted.org/packages/fd/49/425c39549e277510d5cbc9f8f6ae36e1d4faf9fd768b613af1dd6b786c60/Scrapy-2.4.0-py2.py3-none-any.whl (239kB)
[K     |█▍                              | 10kB 15.5MB/s eta 0:00:01[K     |██▊                             | 20kB 1.5MB/s eta 0:00:01[K     |████                            | 30kB 2.0MB/s eta 0:00:01[K     |█████▌                          | 40kB 1.6MB/s eta 0:00:01[K     |██████▉                         | 51kB 1.9MB/s eta 0:00:01[K     |████████▏                       | 61kB 2.1MB/s eta 0:00:01[K     |█████████▋                      | 71kB 2.4MB/s eta 0:00:01[K     |███████████                     | 81kB 2.6MB/s eta 0:00:01[K     |████████████▎                   | 92kB 2.8MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 2.7MB/s eta 0:00:01[K     |███████████████                 | 112kB 2.7MB/s eta 0:00:01[K     |████████████████▍               | 122kB 2.7MB/s eta 

In [4]:
cd Q-Exchange/crawlers

/content/Q-Exchange/crawlers


In [7]:
%shell scrapy crawl stack

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'src': 'https://chemistry.stackexchange.com/',
 'tags': ['electronic-configuration', 'elements', 'protons'],
 'title': 'Will adding up protons and electrons (without neutrons) create a '
          'new element?',
 'url': 'https://chemistry.stackexchange.com/questions/37486/will-adding-up-protons-and-electrons-without-neutrons-create-a-new-element',
 'votes': '17'}
2020-10-16 18:19:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://chemistry.stackexchange.com/questions/297/what-dictates-the-lifetime-of-a-solvated-electron-in-a-given-solvent> (referer: https://chemistry.stackexchange.com/questions?pagesize=50&sort=frequent%20+%20&page=42)
2020-10-16 18:19:18 [scrapy.core.scraper] DEBUG: Scraped from <200 https://chemistry.stackexchange.com/questions/297/what-dictates-the-lifetime-of-a-solvated-electron-in-a-given-solvent>
{'answers': '1',
 'date': '2012-05-08 09:15:16Z',
 'desc': ' have a long lifetime in ammonia so

