<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/NLP_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
text = "She looked at   her father's arm-chair."
text_fr = "Qu'est-ce que c'est?"

In [3]:
CountVectorizer().build_tokenizer()(text)

['She', 'looked', 'at', 'her', 'father', 'arm', 'chair']

In [4]:
CountVectorizer().build_tokenizer()(text_fr)

['Qu', 'est', 'ce', 'que', 'est']

In [10]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
word_tokenize(text)

['She', 'looked', 'at', 'her', 'father', "'s", 'arm-chair', '.']

In [12]:
word_tokenize(text_fr)

["Qu'est-ce", 'que', "c'est", '?']

In [0]:
from nltk.tokenize import WordPunctTokenizer

In [17]:
WordPunctTokenizer().tokenize(text)

['She', 'looked', 'at', 'her', 'father', "'", 's', 'arm', '-', 'chair', '.']

In [0]:
import string

In [20]:
[s for s in text.split(' ') if s not in string.punctuation]

['She', 'looked', 'at', 'her', "father's", 'arm-chair.']

In [0]:
Stemming

In [0]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

In [23]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [25]:
[wnl.lemmatize(t) for t in text.split(' ')]

['She', 'looked', 'at', '', '', 'her', "father's", 'arm-chair.']

In [26]:
!wget https://liferay.de.dariah.eu/tatom/_downloads/datasets.zip

--2019-06-23 11:28:10--  https://liferay.de.dariah.eu/tatom/_downloads/datasets.zip
Resolving liferay.de.dariah.eu (liferay.de.dariah.eu)... 134.76.30.131
Connecting to liferay.de.dariah.eu (liferay.de.dariah.eu)|134.76.30.131|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57659024 (55M) [application/zip]
Saving to: ‘datasets.zip’


2019-06-23 11:28:13 (20.5 MB/s) - ‘datasets.zip’ saved [57659024/57659024]



In [0]:
!unzip /content/datasets.zip

Chunk

Observing two words together in a paragraph-sized chunk of text tells us much more about the relationship between those two words than observing two words occurring together in an 100,000 word book. 

In [0]:
corpus_path = "/content/data/french-tragedy"

In [31]:
os.listdir(path=corpus_path)[0:5]

['Voltaire_TR-V-1732-Zaire.txt',
 'Crebillon_TR-V-1749-Catilina.txt',
 'Voltaire_TR-V-1774-Sophonisbee.txt',
 'Crebillon_TR-V-1726-Pyrrhus.txt',
 'Racine_TR-V-1674-Iphigenie.txt']

In [0]:
tragedy_filenames = [os.path.join(corpus_path, fn) for fn in sorted(os.listdir(corpus_path))]

In [0]:
import glob
tragedy_filenames = glob.glob(corpus_path + os.sep + '*.txt')

In [36]:
tragedy_filenames[:5]

['/content/data/french-tragedy/Voltaire_TR-V-1732-Zaire.txt',
 '/content/data/french-tragedy/Crebillon_TR-V-1749-Catilina.txt',
 '/content/data/french-tragedy/Voltaire_TR-V-1774-Sophonisbee.txt',
 '/content/data/french-tragedy/Crebillon_TR-V-1726-Pyrrhus.txt',
 '/content/data/french-tragedy/Racine_TR-V-1674-Iphigenie.txt']

In [0]:
def split_text(filename, n_words):
  
  fp = open(filename, 'r')
  words = fp.read().split(' ')
  fp.close()
  
  chunks = []
  current_chunks = []
  
  for w in words:
    current_chunks.append(w)
    if len(current_chunks) == n_words:
      chunks.append(" ".join(current_chunks))
      current_chunks = []
      
  if len(current_chunks) != 0:
    chunks.append(" ".join(current_chunks))
    
  return chunks

In [0]:
chunk_length = 1000
chunks = []
for filename in tragedy_filenames:
  chunk_counter = 0
  texts = split_text(filename, chunk_length)
  for text in texts:
    chunk = {'text': text, 'number': chunk_counter, 'filename': filename}
    chunks.append(chunk)
    chunk_counter +=1 

In [45]:
len(chunks)

2740

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(min_df=5, max_df=0.95)

In [0]:
dtm = vectorizer.fit_transform([c['text'] for c in chunks])

In [0]:
vocab = np.array(vectorizer.get_feature_names())

In [51]:
vocab[10:20]

array(['abandonner', 'abandonnerez', 'abandonnez', 'abandonné',
       'abandonnée', 'abat', 'abattre', 'abattu', 'abattue', 'abattus'],
      dtype='<U15')

In [53]:
pd.DataFrame(dtm).head()

Unnamed: 0,0
0,"(0, 6619)\t2\n (0, 6129)\t1\n (0, 6552)\t1..."
1,"(0, 6619)\t1\n (0, 3850)\t2\n (0, 4244)\t2..."
2,"(0, 3850)\t3\n (0, 4244)\t1\n (0, 677)\t1\..."
3,"(0, 3850)\t1\n (0, 5584)\t1\n (0, 956)\t1\..."
4,"(0, 6619)\t2\n (0, 3850)\t2\n (0, 677)\t2\..."
