In [147]:
import nltk
from nltk.corpus import stopwords 
import pandas as pd
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP

## Loading Dataset

In [148]:
# Read the dataset from a CSV file located relative to the working directory.
dataset = pd.read_csv("biology.csv")
# Display the type of the loaded object.
type(dataset)

pandas.core.frame.DataFrame

In [149]:
# Display the type of the 'content' column.
type(dataset['content'])

pandas.core.series.Series

## Extracting data from paragraph tag

In [150]:
# Parse each row of the 'content' column as HTML and collect the text of every
# <p> element into one flat list, in row order.
content = dataset['content'].values.tolist()
paragraph_text = [
    tag.getText()
    for html in content
    for tag in BeautifulSoup(html, 'html.parser').find_all('p')
]

In [152]:
# Preview five of the extracted paragraphs (positions 5-9), each followed by a blank line.
for p in paragraph_text[5:10]:
    print(p, "\n")

Lymphocytes may be as small as 6–9 μm in diameter or as large as 10–14 μm in diameter. 

Those ranges are quite close to each others. Should the above be taken to mean that lymphocytes sizes are clustered in two groups, or is it just a way of saying that lymphocytes are 6-14 μm? 

Various people in our lab will prepare a liter or so of LB, add kanamycin to 25-37 mg/L for selection, and store it at 4 °C for minipreps or other small cultures (where dosing straight LB with a 1000X stock is troublesome).  Some think using it after more than a week is dubious, but we routinely use kan plates that are 1-2 months old with no ill effect. 

How long can LB with antibiotic such as kanamycin, chloramphenicol, or ampicillin be stored at 4 °C and maintain selection? 

Are there any cases in which the splicing machinery constructs an mRNA in which the exons are not in the 5' -> 3' genomic order? I'm interested any such cases, whether they involve constitutive or alternative splicing. 



## Removing unnecessary tags

In [153]:
# Strip tag-like substrings (e.g. "<sub>", "</sub>", "<br/>") from each paragraph,
# updating the list in place.
# Each tag is matched on its own: "<", an optional "/", a letter, then everything
# up to the nearest ">". A greedy pattern such as "<.*>" would instead run from the
# first "<" to the last ">" on a line and delete the text between two tags, or
# between unrelated "<" and ">" comparison signs.
tag_pattern = re.compile(r'</?[A-Za-z][^<>]*>')
for idx, paragraph in enumerate(paragraph_text):
    paragraph_text[idx] = tag_pattern.sub('', paragraph)

## Expanding Contractions

In [154]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand the contractions found in ``text`` and drop leftover apostrophes.

    Matching is case-insensitive, and the first character of each matched
    contraction is kept, so the replacement preserves the capitalisation used
    in the text (e.g. "Don't" -> "Do not" when the mapping contains
    "don't": "do not").

    Args:
        text: String to process.
        contraction_mapping: Dict mapping each contraction to its expanded
            form. Defaults to ``CONTRACTION_MAP``.

    Returns:
        ``text`` with every mapped contraction expanded and every remaining
        apostrophe character removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key such as "can't" must not pre-empt a longer one
    # such as "can't've". Empty keys are skipped because they would match at
    # every position.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    if not ordered_keys:
        # Nothing to expand; only the apostrophe removal applies.
        return text.replace("'", "")

    # Keys are escaped so they are always matched literally.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    # Lower-cased view of the mapping, so that keys containing capitals
    # (such as "I'm") are still found when the text spells them "i'm".
    lowercase_lookup = {}
    for key in ordered_keys:
        lowercase_lookup.setdefault(key.lower(), contraction_mapping[key])

    def expand_match(contraction):
        match = contraction.group(0)
        # Prefer the exact spelling, then the lower-cased key, then the
        # case-insensitive table.
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_lookup.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched.
            return match
        # Keep the first character as written in the text to preserve case.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text.replace("'", "")

# Expand contractions in every extracted paragraph, updating the list in place.
for idx, paragraph in enumerate(paragraph_text):
    paragraph_text[idx] = expand_contractions(paragraph)

## Tokenization and punctuation removal

In [155]:
# Tokenize on runs of word characters; punctuation and whitespace fall between
# the matches and are therefore dropped.
tokenizer = nltk.RegexpTokenizer(r"\w+")

# Replace each paragraph string with its list of tokens (same list object).
paragraph_text[:] = [tokenizer.tokenize(paragraph) for paragraph in paragraph_text]

print(paragraph_text[:5])

[['In', 'prokaryotic', 'translation', 'how', 'critical', 'for', 'efficient', 'translation', 'is', 'the', 'location', 'of', 'the', 'ribosome', 'binding', 'site', 'relative', 'to', 'the', 'start', 'codon'], ['Ideally', 'it', 'is', 'supposed', 'to', 'be', '7b', 'away', 'from', 'the', 'start', 'How', 'about', 'if', 'it', 'is', '9', 'bases', 'away', 'or', 'even', 'more', 'Will', 'this', 'have', 'an', 'observable', 'effect', 'on', 'translation'], ['Does', 'anyone', 'have', 'any', 'suggestions', 'to', 'prevent', 'RNAse', 'contamination', 'when', 'working', 'with', 'RNA'], ['I', 'tend', 'to', 'have', 'issues', 'with', 'degradation', 'regardless', 'of', 'whether', 'I', 'use', 'DEPC', 'treated', 'RNAse', 'free', 'water', 'and', 'filtered', 'pipette', 'tips'], ['Tortora', 'writes', 'in', 'Principles', 'of', 'Anatomy', 'and', 'Physiology']]


## Convert text to lowercase

In [156]:
# Flatten the per-paragraph token lists into a single list of lowercase tokens.
paragraph_text = [
    token
    for sentence in paragraph_text
    for token in map(str.lower, sentence)
]
print(paragraph_text[:45])

['in', 'prokaryotic', 'translation', 'how', 'critical', 'for', 'efficient', 'translation', 'is', 'the', 'location', 'of', 'the', 'ribosome', 'binding', 'site', 'relative', 'to', 'the', 'start', 'codon', 'ideally', 'it', 'is', 'supposed', 'to', 'be', '7b', 'away', 'from', 'the', 'start', 'how', 'about', 'if', 'it', 'is', '9', 'bases', 'away', 'or', 'even', 'more', 'will', 'this']


In [157]:
# Number of tokens in the flattened, lowercased list.
len(paragraph_text)

1202364

## Removing Stop words

In [158]:
# Needs the NLTK stop-word corpus; run nltk.download('stopwords') once if it is missing.
# A set is used so each membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))
paragraph_text = list(filter(lambda token: token not in stop_words, paragraph_text))

In [161]:
# Show the first 45 tokens that remain after stop-word removal, separated by spaces.
for word in paragraph_text[:45]:
    print(word, end=" ")

prokaryotic translation critical efficient translation location ribosome binding site relative start codon ideally supposed 7b away start 9 bases away even observable effect translation anyone suggestions prevent rnase contamination working rna tend issues degradation regardless whether use depc treated rnase free water filtered pipette tips 

In [162]:
# Number of tokens left after stop-word removal.
len(paragraph_text)

628485

### The number of tokens almost halved (from 1,202,364 to 628,485) after removing stop words.

## Stemming

In [163]:
from nltk.stem import PorterStemmer

# Reduce every filtered token to its Porter stem; the result is index-aligned
# with paragraph_text.
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(token) for token in paragraph_text]

## Lemmatization

In [164]:
from nltk.stem import WordNetLemmatizer

# Needs the WordNet corpus; run nltk.download('wordnet') once if it is missing.
# lemmatize() is called without a part-of-speech argument; the result is
# index-aligned with paragraph_text.
lemmatizer = WordNetLemmatizer()
lemmetized_words = [lemmatizer.lemmatize(token) for token in paragraph_text]

## Stemming vs Lemmatization

In [144]:
# Print a small window of tokens next to their stemmed and lemmatized forms.
# The bounds are positions in the stop-word-filtered token list.
sample_window = range(628095, 628100)
for idx in sample_window:
    print(paragraph_text[idx], " : ", stemmed_words[idx], " (Stemmed) : ", lemmetized_words[idx], " (Lemmatized)")

plot  :  plot  (Stemmed) :  plot  (Lemmatized)
power  :  power  (Stemmed) :  power  (Lemmatized)
frequencies  :  frequenc  (Stemmed) :  frequency  (Lemmatized)
get  :  get  (Stemmed) :  get  (Lemmatized)
following  :  follow  (Stemmed) :  following  (Lemmatized)


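### It is observed that the stemmer sometimes returns a root form that is not a valid English word, while the lemmatizer returns dictionary words. Note that the lemmatizer is called here without a part-of-speech tag, so it treats every token as a noun and leaves words such as 'following' unchanged.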

### Example: for the word 'frequencies', Stemmer returns 'frequenc' while Lemmatizer returns 'frequency'.

### Stemmer is faster than Lemmatizer but less accurate. Choose which one to use according to need.