## Text Preprocessing in NLP with Python

In [6]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions
import string
string.punctuation
import re

In [11]:
# Read the whole test corpus into memory as a single string.
# The `with` block closes the file handle as soon as the read finishes, and the
# handle gets its own name so the built-in input() is no longer shadowed.
# Note: despite its name, `df` is a plain str (the result of read()).
with open("NLP for Uzbek/test_text.txt", 'r', encoding="utf8") as text_file:
    df = text_file.read()

In [12]:
# Echo the raw text read from the file as the cell output.
df

"'«NBU-Osiyo» — «Metallurg» 0:3 (0:2). Gollar: Mirzakamol Kamolov (14), Abdulloh Olimov (51, 88), Humoyun Murtazoyev (52), Shahzod Shaymanov (83). «Andijon» — «Neftchi» 1:1 (0:0). Chetlatishlar: Ibrohim Otaxonov (79) — Muzaffar Muzaffarov (85). «Navbahor» — «Xorazm» 5:0 (1:0). «Surxon» — «Qo\\\\\\'qon-1912» 0:2 (0:1). Yevropada saralash o\\\\\\'yinlari davom ettirildi . Yevropa mintaqasida JCh-2018 yilda Rossiyada bo\\\\\\'lib o\\\\\\'tadigan jahon chempionatiga saralash bosqichi beshinchi tur uchrashuvlari davom etmoqda. Kunning birinchi qismidan o\\\\\\'rin olgan beshta uchrashuv nihoyasiga yetdi. Bosniya terma jamoasi UYeFAning so\\\\\\'nggi a\\\\\\'zolaridan biri Gibraltar terma jamoasini qabul qilib, javobsiz 5ta to\\\\\\'p kiritgan bo\\\\\\'lsa, Shvesiya Belarus darvozasini 4 bor aniq nishonga oldi. Shuningdek, Shveysariya og\\\\\\'ir kechgan uchrashuvda Latviyadan ustun keldi. Yana ikki uchrashuvda durang natijasi qayd etildi. JCh-2018, saralash bosqichi, Yevropa mintaqasi. Goll

## 1. Remove HTML tags

In [13]:
def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    extracted text pieces are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [14]:
# Step 1: strip HTML markup from the raw text; the result gets a new name, `df` is not rebound.
df1 = strip_html_tags(df)

In [15]:
# Echo the text returned by strip_html_tags.
df1

"'«NBU-Osiyo» — «Metallurg» 0:3 (0:2). Gollar: Mirzakamol Kamolov (14), Abdulloh Olimov (51, 88), Humoyun Murtazoyev (52), Shahzod Shaymanov (83). «Andijon» — «Neftchi» 1:1 (0:0). Chetlatishlar: Ibrohim Otaxonov (79) — Muzaffar Muzaffarov (85). «Navbahor» — «Xorazm» 5:0 (1:0). «Surxon» — «Qo\\\\\\'qon-1912» 0:2 (0:1). Yevropada saralash o\\\\\\'yinlari davom ettirildi . Yevropa mintaqasida JCh-2018 yilda Rossiyada bo\\\\\\'lib o\\\\\\'tadigan jahon chempionatiga saralash bosqichi beshinchi tur uchrashuvlari davom etmoqda. Kunning birinchi qismidan o\\\\\\'rin olgan beshta uchrashuv nihoyasiga yetdi. Bosniya terma jamoasi UYeFAning so\\\\\\'nggi a\\\\\\'zolaridan biri Gibraltar terma jamoasini qabul qilib, javobsiz 5ta to\\\\\\'p kiritgan bo\\\\\\'lsa, Shvesiya Belarus darvozasini 4 bor aniq nishonga oldi. Shuningdek, Shveysariya og\\\\\\'ir kechgan uchrashuvda Latviyadan ustun keldi. Yana ikki uchrashuvda durang natijasi qayd etildi. JCh-2018, saralash bosqichi, Yevropa mintaqasi. Goll

## 2. Remove extra whitespace

In [16]:
def remove_whitespace(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    # split() with no separator already discards leading and trailing
    # whitespace, so the pieces can be re-joined directly.
    pieces = text.split()
    return " ".join(pieces)

In [17]:
# Step 2: collapse whitespace runs in the HTML-stripped text.
df2 = remove_whitespace(df1)

In [18]:
# Echo the whitespace-normalized text.
df2

"'«NBU-Osiyo» — «Metallurg» 0:3 (0:2). Gollar: Mirzakamol Kamolov (14), Abdulloh Olimov (51, 88), Humoyun Murtazoyev (52), Shahzod Shaymanov (83). «Andijon» — «Neftchi» 1:1 (0:0). Chetlatishlar: Ibrohim Otaxonov (79) — Muzaffar Muzaffarov (85). «Navbahor» — «Xorazm» 5:0 (1:0). «Surxon» — «Qo\\\\\\'qon-1912» 0:2 (0:1). Yevropada saralash o\\\\\\'yinlari davom ettirildi . Yevropa mintaqasida JCh-2018 yilda Rossiyada bo\\\\\\'lib o\\\\\\'tadigan jahon chempionatiga saralash bosqichi beshinchi tur uchrashuvlari davom etmoqda. Kunning birinchi qismidan o\\\\\\'rin olgan beshta uchrashuv nihoyasiga yetdi. Bosniya terma jamoasi UYeFAning so\\\\\\'nggi a\\\\\\'zolaridan biri Gibraltar terma jamoasini qabul qilib, javobsiz 5ta to\\\\\\'p kiritgan bo\\\\\\'lsa, Shvesiya Belarus darvozasini 4 bor aniq nishonga oldi. Shuningdek, Shveysariya og\\\\\\'ir kechgan uchrashuvda Latviyadan ustun keldi. Yana ikki uchrashuvda durang natijasi qayd etildi. JCh-2018, saralash bosqichi, Yevropa mintaqasi. Goll

## 3. Convert accented characters to ASCII characters

In [19]:
def remove_accented_chars(text):
    """Transliterate ``text`` to ASCII with unidecode, e.g. "café" -> "cafe".

    Non-letter symbols are transliterated as well, e.g. « becomes << and
    — becomes --.
    """
    return unidecode.unidecode(text)

In [20]:
# Step 3: transliterate the whitespace-normalized text to ASCII.
df3 = remove_accented_chars(df2)

In [21]:
# Echo the ASCII-transliterated text.
df3

"'<<NBU-Osiyo>> -- <<Metallurg>> 0:3 (0:2). Gollar: Mirzakamol Kamolov (14), Abdulloh Olimov (51, 88), Humoyun Murtazoyev (52), Shahzod Shaymanov (83). <<Andijon>> -- <<Neftchi>> 1:1 (0:0). Chetlatishlar: Ibrohim Otaxonov (79) -- Muzaffar Muzaffarov (85). <<Navbahor>> -- <<Xorazm>> 5:0 (1:0). <<Surxon>> -- <<Qo\\\\\\'qon-1912>> 0:2 (0:1). Yevropada saralash o\\\\\\'yinlari davom ettirildi . Yevropa mintaqasida JCh-2018 yilda Rossiyada bo\\\\\\'lib o\\\\\\'tadigan jahon chempionatiga saralash bosqichi beshinchi tur uchrashuvlari davom etmoqda. Kunning birinchi qismidan o\\\\\\'rin olgan beshta uchrashuv nihoyasiga yetdi. Bosniya terma jamoasi UYeFAning so\\\\\\'nggi a\\\\\\'zolaridan biri Gibraltar terma jamoasini qabul qilib, javobsiz 5ta to\\\\\\'p kiritgan bo\\\\\\'lsa, Shvesiya Belarus darvozasini 4 bor aniq nishonga oldi. Shuningdek, Shveysariya og\\\\\\'ir kechgan uchrashuvda Latviyadan ustun keldi. Yana ikki uchrashuvda durang natijasi qayd etildi. JCh-2018, saralash bosqichi, Ye

## 4. Remove special characters

In [22]:
def remove_punctuation(text):
    """Return ``text`` without the characters listed in ``string.punctuation``.

    Only the ASCII punctuation in ``string.punctuation`` is dropped; everything
    else, including spaces and non-ASCII symbols, is kept as is. Nothing is
    inserted in place of a removed character, so "0:3" becomes "03".
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

In [23]:
# Step 4: drop ASCII punctuation from the transliterated text.
df4 = remove_punctuation(df3)

In [24]:
# Echo the punctuation-free text.
df4

'NBUOsiyo  Metallurg 03 02 Gollar Mirzakamol Kamolov 14 Abdulloh Olimov 51 88 Humoyun Murtazoyev 52 Shahzod Shaymanov 83 Andijon  Neftchi 11 00 Chetlatishlar Ibrohim Otaxonov 79  Muzaffar Muzaffarov 85 Navbahor  Xorazm 50 10 Surxon  Qoqon1912 02 01 Yevropada saralash oyinlari davom ettirildi  Yevropa mintaqasida JCh2018 yilda Rossiyada bolib otadigan jahon chempionatiga saralash bosqichi beshinchi tur uchrashuvlari davom etmoqda Kunning birinchi qismidan orin olgan beshta uchrashuv nihoyasiga yetdi Bosniya terma jamoasi UYeFAning songgi azolaridan biri Gibraltar terma jamoasini qabul qilib javobsiz 5ta top kiritgan bolsa Shvesiya Belarus darvozasini 4 bor aniq nishonga oldi Shuningdek Shveysariya ogir kechgan uchrashuvda Latviyadan ustun keldi Yana ikki uchrashuvda durang natijasi qayd etildi JCh2018 saralash bosqichi Yevropa mintaqasi Gollar Ibishevich 4 43 Vrshayevich 52 Vishcha 56 Bichakchich 90 Ogohlantirish Kolashinas 18 Ogohlantirishlar Vasilyev 68 Laban 78 Shveysariya  Latviya 1

## 5. Lowercase all text and remove numbers

In [25]:
def remove_numbers_lowercase(text):
    """Lowercase ``text`` and delete every run of the ASCII digits 0-9.

    Digits are removed without a replacement, so surrounding spaces stay and
    tokens such as "5ta" shrink to "ta".
    """
    return re.sub(r'[0-9]+', '', text.lower())

In [26]:
# Step 5: lowercase the punctuation-free text and remove its digits.
df5 = remove_numbers_lowercase(df4)

In [27]:
# Echo the lowercased, digit-free text.
df5

'nbuosiyo  metallurg   gollar mirzakamol kamolov  abdulloh olimov   humoyun murtazoyev  shahzod shaymanov  andijon  neftchi   chetlatishlar ibrohim otaxonov   muzaffar muzaffarov  navbahor  xorazm   surxon  qoqon   yevropada saralash oyinlari davom ettirildi  yevropa mintaqasida jch yilda rossiyada bolib otadigan jahon chempionatiga saralash bosqichi beshinchi tur uchrashuvlari davom etmoqda kunning birinchi qismidan orin olgan beshta uchrashuv nihoyasiga yetdi bosniya terma jamoasi uyefaning songgi azolaridan biri gibraltar terma jamoasini qabul qilib javobsiz ta top kiritgan bolsa shvesiya belarus darvozasini  bor aniq nishonga oldi shuningdek shveysariya ogir kechgan uchrashuvda latviyadan ustun keldi yana ikki uchrashuvda durang natijasi qayd etildi jch saralash bosqichi yevropa mintaqasi gollar ibishevich   vrshayevich  vishcha  bichakchich  ogohlantirish kolashinas  ogohlantirishlar vasilyev  laban  shveysariya  latviya   ogohlantirishlar lazdinsh  fernandes  sher  freymanis  ogo