# Common NLP Preprocessing Techniques

### Import Libraries

In [5]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

### Download NLTK data,  if not already available

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fahan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fahan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fahan\AppData\Roaming\nltk_data...


True

### Load the dataset

In [11]:
df = pd.read_csv('Constitution.csv')

In [12]:
df

Unnamed: 0,Country,Year Enacted,Scope,Length (in Words),Executive Power,Legislative Power,Judicial Independence,Number of Rights,Preamble
0,Afghanistan,2004,0.67,10227,6,0.38,2,37,"In the name of Allah, the Most Beneficent, the..."
1,Albania,1998,0.61,13826,5,0.43,5,77,"We, the people of Albania, proud and aware of ..."
2,Algeria,1996,0.61,10038,7,0.29,1,36,The Algerian people are a free people; and the...
3,Andorra,1993,0.51,8740,6,0.19,3,51,"The Andorran People, with full liberty and ind..."
4,Angola,2010,0.80,27181,7,0.19,2,80,"We, the people of Angola, through its lawful r..."
...,...,...,...,...,...,...,...,...,...
185,Vanuatu,1980,0.51,8425,6,0.29,4,28,"WE the people of Vanuatu, PROUD of our struggl..."
186,Venezuela,1999,0.66,37344,6,0.38,1,82,"The people of Venezuela, exercising their powe..."
187,Vietnam,1992,0.51,11344,2,0.43,0,45,"In the course of their millennia-old history, ..."
188,Zambia,1991,0.63,30696,4,0.10,4,50,"WE, THE PEOPLE OF ZAMBIA: ACKNOWLEDGE the supr..."


In [13]:
df.head()

Unnamed: 0,Country,Year Enacted,Scope,Length (in Words),Executive Power,Legislative Power,Judicial Independence,Number of Rights,Preamble
0,Afghanistan,2004,0.67,10227,6,0.38,2,37,"In the name of Allah, the Most Beneficent, the..."
1,Albania,1998,0.61,13826,5,0.43,5,77,"We, the people of Albania, proud and aware of ..."
2,Algeria,1996,0.61,10038,7,0.29,1,36,The Algerian people are a free people; and the...
3,Andorra,1993,0.51,8740,6,0.19,3,51,"The Andorran People, with full liberty and ind..."
4,Angola,2010,0.8,27181,7,0.19,2,80,"We, the people of Angola, through its lawful r..."


In [14]:
df['Country'] = df['Country'].astype(str)

### Lowercasing

In [16]:
df['Country'] = df['Country'].str.lower()

In [17]:
df

Unnamed: 0,Country,Year Enacted,Scope,Length (in Words),Executive Power,Legislative Power,Judicial Independence,Number of Rights,Preamble
0,afghanistan,2004,0.67,10227,6,0.38,2,37,"In the name of Allah, the Most Beneficent, the..."
1,albania,1998,0.61,13826,5,0.43,5,77,"We, the people of Albania, proud and aware of ..."
2,algeria,1996,0.61,10038,7,0.29,1,36,The Algerian people are a free people; and the...
3,andorra,1993,0.51,8740,6,0.19,3,51,"The Andorran People, with full liberty and ind..."
4,angola,2010,0.80,27181,7,0.19,2,80,"We, the people of Angola, through its lawful r..."
...,...,...,...,...,...,...,...,...,...
185,vanuatu,1980,0.51,8425,6,0.29,4,28,"WE the people of Vanuatu, PROUD of our struggl..."
186,venezuela,1999,0.66,37344,6,0.38,1,82,"The people of Venezuela, exercising their powe..."
187,vietnam,1992,0.51,11344,2,0.43,0,45,"In the course of their millennia-old history, ..."
188,zambia,1991,0.63,30696,4,0.10,4,50,"WE, THE PEOPLE OF ZAMBIA: ACKNOWLEDGE the supr..."


### Removing Punctuation

In [20]:
df['Preamble'] = df['Preamble'].str.replace(f"[{string.punctuation}]", "", regex=True)

In [21]:
df

Unnamed: 0,Country,Year Enacted,Scope,Length (in Words),Executive Power,Legislative Power,Judicial Independence,Number of Rights,Preamble
0,afghanistan,2004,0.67,10227,6,0.38,2,37,In the name of Allah the Most Beneficent the M...
1,albania,1998,0.61,13826,5,0.43,5,77,We the people of Albania proud and aware of ou...
2,algeria,1996,0.61,10038,7,0.29,1,36,The Algerian people are a free people and they...
3,andorra,1993,0.51,8740,6,0.19,3,51,The Andorran People with full liberty and inde...
4,angola,2010,0.80,27181,7,0.19,2,80,We the people of Angola through its lawful rep...
...,...,...,...,...,...,...,...,...,...
185,vanuatu,1980,0.51,8425,6,0.29,4,28,WE the people of Vanuatu PROUD of our struggle...
186,venezuela,1999,0.66,37344,6,0.38,1,82,The people of Venezuela exercising their power...
187,vietnam,1992,0.51,11344,2,0.43,0,45,In the course of their millenniaold history th...
188,zambia,1991,0.63,30696,4,0.10,4,50,WE THE PEOPLE OF ZAMBIA ACKNOWLEDGE the suprem...


### Tokenization

##### Convert the 'Preamble' column to strings

In [25]:
df['Preamble'] = df['Preamble'].astype(str)

##### Handle missing values if they exist

In [26]:
df['Preamble'] = df['Preamble'].fillna('')

In [27]:
df['Tokenized_Preamble'] = df['Preamble'].apply(word_tokenize)

In [28]:
df

Unnamed: 0,Country,Year Enacted,Scope,Length (in Words),Executive Power,Legislative Power,Judicial Independence,Number of Rights,Preamble,Tokenized_Preamble
0,afghanistan,2004,0.67,10227,6,0.38,2,37,In the name of Allah the Most Beneficent the M...,"[In, the, name, of, Allah, the, Most, Benefice..."
1,albania,1998,0.61,13826,5,0.43,5,77,We the people of Albania proud and aware of ou...,"[We, the, people, of, Albania, proud, and, awa..."
2,algeria,1996,0.61,10038,7,0.29,1,36,The Algerian people are a free people and they...,"[The, Algerian, people, are, a, free, people, ..."
3,andorra,1993,0.51,8740,6,0.19,3,51,The Andorran People with full liberty and inde...,"[The, Andorran, People, with, full, liberty, a..."
4,angola,2010,0.80,27181,7,0.19,2,80,We the people of Angola through its lawful rep...,"[We, the, people, of, Angola, through, its, la..."
...,...,...,...,...,...,...,...,...,...,...
185,vanuatu,1980,0.51,8425,6,0.29,4,28,WE the people of Vanuatu PROUD of our struggle...,"[WE, the, people, of, Vanuatu, PROUD, of, our,..."
186,venezuela,1999,0.66,37344,6,0.38,1,82,The people of Venezuela exercising their power...,"[The, people, of, Venezuela, exercising, their..."
187,vietnam,1992,0.51,11344,2,0.43,0,45,In the course of their millenniaold history th...,"[In, the, course, of, their, millenniaold, his..."
188,zambia,1991,0.63,30696,4,0.10,4,50,WE THE PEOPLE OF ZAMBIA ACKNOWLEDGE the suprem...,"[WE, THE, PEOPLE, OF, ZAMBIA, ACKNOWLEDGE, the..."


##### The length of each tokenized list

In [29]:
df['Length (in Words)'] = df['Tokenized_Preamble'].apply(len)

In [32]:
df[['Preamble', 'Tokenized_Preamble', 'Length (in Words)']].head()

Unnamed: 0,Preamble,Tokenized_Preamble,Length (in Words)
0,In the name of Allah the Most Beneficent the M...,"[In, the, name, of, Allah, the, Most, Benefice...",255
1,We the people of Albania proud and aware of ou...,"[We, the, people, of, Albania, proud, and, awa...",114
2,The Algerian people are a free people and they...,"[The, Algerian, people, are, a, free, people, ...",1338
3,The Andorran People with full liberty and inde...,"[The, Andorran, People, with, full, liberty, a...",255
4,We the people of Angola through its lawful rep...,"[We, the, people, of, Angola, through, its, la...",582


### Removing Stop Words

In [33]:
stop_words = set(stopwords.words('english'))

In [37]:
df['text_no_stopwords'] = df['Tokenized_Preamble'].apply(lambda x: [word for word in x if word not in stop_words])

In [38]:
df['text_no_stopwords'].head()

0    [In, name, Allah, Most, Beneficent, Most, Merc...
1    [We, people, Albania, proud, aware, history, r...
2    [The, Algerian, people, free, people, resolved...
3    [The, Andorran, People, full, liberty, indepen...
4    [We, people, Angola, lawful, representatives, ...
Name: text_no_stopwords, dtype: object

### Stemming

In [39]:
stemmer = PorterStemmer()

In [40]:
df['text_stemmed'] = df['text_no_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

In [41]:
df['text_stemmed'].head()

0    [in, name, allah, most, benefic, most, merci, ...
1    [we, peopl, albania, proud, awar, histori, res...
2    [the, algerian, peopl, free, peopl, resolv, re...
3    [the, andorran, peopl, full, liberti, independ...
4    [we, peopl, angola, law, repres, legisl, natio...
Name: text_stemmed, dtype: object

### Lemmatization

In [42]:
lemmatizer = WordNetLemmatizer()

In [43]:
df['text_lemmatized'] = df['text_no_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [44]:
df['text_lemmatized'].head()

0    [In, name, Allah, Most, Beneficent, Most, Merc...
1    [We, people, Albania, proud, aware, history, r...
2    [The, Algerian, people, free, people, resolved...
3    [The, Andorran, People, full, liberty, indepen...
4    [We, people, Angola, lawful, representative, l...
Name: text_lemmatized, dtype: object

### Removing Numbers

In [47]:
df['text_no_numbers'] = df['text_lemmatized'].apply(lambda x: [re.sub(r'\d+', '', word) for word in x])

In [48]:
df['text_no_numbers'].head()

0    [In, name, Allah, Most, Beneficent, Most, Merc...
1    [We, people, Albania, proud, aware, history, r...
2    [The, Algerian, people, free, people, resolved...
3    [The, Andorran, People, full, liberty, indepen...
4    [We, people, Angola, lawful, representative, l...
Name: text_no_numbers, dtype: object

### Removing URLs and Special Characters

In [58]:
df['text_clean'] = df['text_no_numbers'].apply(lambda x: [re.sub(r'http\S+|www\S+|https\S+', '', word, flags=re.MULTILINE) for word in x])
df['text_clean'] = df['Preamble'].apply(lambda x: [re.sub(r'\@\w+|\#', '', word) for word in x])

In [59]:
df['text_clean'].head()

0    [I, n,  , t, h, e,  , n, a, m, e,  , o, f,  , ...
1    [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
2    [T, h, e,  , A, l, g, e, r, i, a, n,  , p, e, ...
3    [T, h, e,  , A, n, d, o, r, r, a, n,  , P, e, ...
4    [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
Name: text_clean, dtype: object

### Handling Emojis

In [68]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF"  
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [69]:
df['text_clean'] = df['Preamble'].apply(lambda x: [remove_emojis(word) for word in x])

In [71]:
df['text_clean'].head()

0    [I, n,  , t, h, e,  , n, a, m, e,  , o, f,  , ...
1    [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
2    [T, h, e,  , A, l, g, e, r, i, a, n,  , p, e, ...
3    [T, h, e,  , A, n, d, o, r, r, a, n,  , P, e, ...
4    [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
Name: text_clean, dtype: object

### Handling Rare Words

In [72]:
word_freq = pd.Series(np.concatenate(df['text_clean'])).value_counts()

In [73]:
rare_words = word_freq[word_freq < 5].index.tolist()

In [74]:
rare_words

['í', 'ü', '«', '»', 'ó', 'à', 'ç', 'ä', 'ô', 'ë', 'á', 'ã', '…']

In [75]:
df['text_final'] = df['text_clean'].apply(lambda x: [word for word in x if word not in rare_words])

In [76]:
df['text_final']

0      [I, n,  , t, h, e,  , n, a, m, e,  , o, f,  , ...
1      [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
2      [T, h, e,  , A, l, g, e, r, i, a, n,  , p, e, ...
3      [T, h, e,  , A, n, d, o, r, r, a, n,  , P, e, ...
4      [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
                             ...                        
185    [W, E,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
186    [T, h, e,  , p, e, o, p, l, e,  , o, f,  , V, ...
187    [I, n,  , t, h, e,  , c, o, u, r, s, e,  , o, ...
188    [W, E,  , T, H, E,  , P, E, O, P, L, E,  , O, ...
189    [W, e,  , t, h, e,  , p, e, o, p, l, e,  , o, ...
Name: text_final, Length: 190, dtype: object

### Combine tokens back to text

In [79]:
df['text_final'] = df['text_final'].apply(lambda x: ' '.join(x))

In [80]:
df['text_final']

0      I       n               t       h       e     ...
1      W       e               t       h       e     ...
2      T       h       e               A       l     ...
3      T       h       e               A       n     ...
4      W       e               t       h       e     ...
                             ...                        
185    W       E               t       h       e     ...
186    T       h       e               p       e     ...
187    I       n               t       h       e     ...
188    W       E               T       H       E     ...
189    W       e               t       h       e     ...
Name: text_final, Length: 190, dtype: object

### Display the original and preprocessed text

In [82]:
df[['Preamble', 'text_final']].head()

Unnamed: 0,Preamble,text_final
0,In the name of Allah the Most Beneficent the M...,I n t h e ...
1,We the people of Albania proud and aware of ou...,W e t h e ...
2,The Algerian people are a free people and they...,T h e A l ...
3,The Andorran People with full liberty and inde...,T h e A n ...
4,We the people of Angola through its lawful rep...,W e t h e ...
