## Preprocessing Model

In [8]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import RegexpTokenizer
import re
from nltk.stem import PorterStemmer

# Stopword sets already loaded in this session, keyed by workbook path, so the
# spreadsheet is parsed once instead of once per preprocessed text.
# NOTE: edits to the workbook are not picked up until this cell is re-run.
_STOPWORD_CACHE = {}


def _load_stopwords(path="stopword.xlsx"):
    """Return the stopwords stored in the Excel workbook at *path* as a frozenset.

    Every string cell is taken as one stopword (stripped and lowercased).
    String column labels are included as well: if the sheet has no header row,
    ``pd.read_excel`` promotes its first word to a column label, and that word
    must not be lost.  Non-string cells (numbers, NaN) are ignored.
    """
    cached = _STOPWORD_CACHE.get(path)
    if cached is None:
        df_stopword = pd.read_excel(path)
        candidates = list(df_stopword.columns)
        candidates.extend(df_stopword.to_numpy().ravel().tolist())
        cached = frozenset(
            value.strip().lower()
            for value in candidates
            if isinstance(value, str)
        )
        _STOPWORD_CACHE[path] = cached
    return cached


def preprocess_alg(text):
    """Normalise one text: lowercase, tokenise, drop stopwords, stem.

    Parameters
    ----------
    text : str
        Raw text.  A missing value (None / NaN, e.g. a blank Excel cell) is
        treated as an empty text; any other non-string value is converted
        with ``str()``.

    Returns
    -------
    str
        The Porter-stemmed, stopword-free tokens joined by single spaces.
    """
    # Blank spreadsheet cells arrive as NaN floats, which have no .lower().
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # lowercase
    text = text.lower()

    # tokenization & remove punctuation: keep only runs of word characters
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # remove stopwords -- exact token match against a set.  Testing against
    # DataFrame.to_string() would be a substring test on the rendered table
    # and would also drop any token that merely occurs inside a stopword,
    # the header, or the printed row numbers.
    stopwords = _load_stopwords()
    filtered_text = [w for w in tokens if w not in stopwords]

    # stemming
    ps = PorterStemmer()
    stem_text = [ps.stem(token) for token in filtered_text]

    return " ".join(stem_text)

def preprocess_call(data):
    """Apply ``preprocess_alg`` to the 'Text' column of *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Text' column holding the raw texts.  Any index is
        accepted; rows are matched by their own index label, so the frame
        does not need a default 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and the remaining columns of *data*,
        where 'Text' is replaced by a trailing 'Text_tok' column containing
        the preprocessed text.  *data* itself is not modified.
    """
    # Series.map keeps the original index, so no positional lookup or
    # index-based merge is needed to line the results up with their rows.
    text_tok = data['Text'].map(preprocess_alg)

    df_result_pre = data.drop(columns=['Text']).assign(Text_tok=text_tok)
    return df_result_pre


## Implementation

In [5]:
# query
# Load the query collection from nfdump.xlsx (path is relative to the
# notebook's working directory).  The frame is later passed to
# preprocess_call, which reads its 'Text' column.
df_query = pd.read_excel("nfdump.xlsx")

Unnamed: 0,Query id,Text
0,PLAIN-1,Why Deep Fried Foods May Cause Cancer In the l...
1,PLAIN-2,Do Cholesterol Statin Drugs Cause Breast Cance...
2,PLAIN-3,Breast Cancer Cells Feed on Cholesterol One in...
3,PLAIN-4,Using Diet to Treat Asthma and Eczema I previo...
4,PLAIN-5,Treating Asthma With Plants vs. Pills In my vi...


In [9]:
# Preprocess every query text; the result has a 'Text_tok' column in place
# of the raw 'Text' column.
df_query_pre=preprocess_call(df_query)
# Bare last expression of the cell: the notebook displays the first five rows.
df_query_pre.head()

Unnamed: 0,Query id,Text_tok
0,PLAIN-1,deep fri food cancer latest studi dietari patt...
1,PLAIN-2,cholesterol statin drug breast cancer breast c...
2,PLAIN-3,breast cancer cell feed cholesterol american w...
3,PLAIN-4,diet treat asthma eczema previous discuss powe...
4,PLAIN-5,treat asthma plant pill video treat asthma fru...


In [10]:
# document
# Load the document collection from doc_dump.xlsx and run it through the same
# preprocessing as the queries, so both sides share one token vocabulary.
df_doc = pd.read_excel("doc_dump.xlsx")
df_doc_pre=preprocess_call(df_doc)
# Bare last expression of the cell: the notebook displays the resulting frame.
df_doc_pre

Unnamed: 0,Doc id,Text_tok
0,MED-1,birth weight head circumfer prenat exposur acr...
1,MED-2,statist regress model estim acrylamid concentr...
2,MED-3,chronic intak potato chip human increas produc...
3,MED-4,dietari pattern breast cancer risk women pubm ...
4,MED-5,empir deriv dietari pattern risk postmenopaus ...
5,MED-6,consumpt deep fri food risk prostat cancera ab...
6,MED-7,heterocycl amin mutagen carcinogen produc cook...
7,MED-8,acrylamid food review scienc futur consider pu...
8,MED-9,elev level cholesterol rich lipid raft cancer ...
9,MED-10,statin breast cancer surviv nationwid cohort s...


## Export preprocessed results

In [11]:
# Persist the preprocessed queries and documents as Excel workbooks in the
# working directory.
# NOTE(review): to_excel is called with its defaults, which presumably also
# writes the DataFrame index as a leading column -- confirm that downstream
# readers expect it.
df_query_pre.to_excel('df_query_pre.xlsx')
df_doc_pre.to_excel('df_doc_pre.xlsx')