<a href="https://colab.research.google.com/github/strangledzelda/thesis/blob/main/text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# Download the dataset CSV from the thesis repository into a DataFrame.
url = 'https://raw.githubusercontent.com/strangledzelda/thesis/main/cyberbulling.csv'
data = pd.read_csv(filepath_or_buffer=url, encoding='utf-8')

In [None]:
# Load the stop-word table and flatten it into a plain list holding the
# first column of every row.
# NOTE(review): read_csv treats the first line of the file as a header by
# default - confirm the CSV really has a header row, otherwise the first
# stop word is silently lost.
sw = pd.read_csv(
    'https://raw.githubusercontent.com/strangledzelda/thesis/main/all_stopwords.csv',
    encoding="utf-8",
)
stopwords = sw.values.tolist()
all_stopwords = [row[0] for row in stopwords]

In [None]:
!pip install pymorphy2
!pip install pyaspeller

In [None]:
import re

import pymorphy2
from pyaspeller import YandexSpeller

# Module-level helpers shared by the preprocessing code:
#   morph   - pymorphy2 analyzer, used to obtain the normal form of a word
#   speller - YandexSpeller client, used to correct typos
morph = pymorphy2.MorphAnalyzer()
speller = YandexSpeller()

In [None]:
def text_preprocessing(text, to_list=False):
    """Normalize a Russian comment into a sequence of lemmas.

    Pipeline:
      1. hyphens become spaces, every other non-Cyrillic symbol is dropped;
      2. the text is lower-cased;
      3. runs of 3+ identical characters are collapsed to one;
      4. words shorter than 3 characters are removed;
      5. the text is split on whitespace and repeated words are dropped
         (first occurrence wins, order is preserved);
      6. every word is spell-corrected with ``speller``, lemmatized with
         ``morph`` and discarded if its lemma is in ``all_stopwords``.

    Args:
        text: Raw comment. Non-string values (e.g. ``NaN`` or ``None`` from a
            pandas column) are treated as an empty comment.
        to_list: If true, return the lemmas as a list; otherwise return them
            joined by single spaces.

    Returns:
        ``list[str]`` when ``to_list`` is true, else ``str``.
    """
    # Missing values would otherwise crash inside re.sub.
    if not isinstance(text, str):
        return [] if to_list else ''

    # Hyphens must become separators *before* foreign symbols are stripped;
    # otherwise "красно-белый" is glued into "краснобелый".
    text = text.replace('-', ' ')

    # Keep Cyrillic letters, whitespace and colons. "ё"/"Ё" are listed
    # explicitly because they lie outside the А-я code point range.
    # Newlines are kept here (they match \s) and act as word separators below.
    # NOTE(review): colons survive this step and may stay attached to words
    # (e.g. "привет:") - confirm whether that is intended.
    text = re.sub(r'[^А-Яа-яЁё\s:]', '', text)

    text = text.lower()

    # Collapse runs of 3+ identical characters first, so that e.g. "даааа"
    # becomes "да" and is then removed by the short-word filter below.
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Drop words of 1-2 characters together with the separators before them.
    text = re.sub(r'\W*\b\w{1,2}\b', '', text)

    # Split on any whitespace (no empty tokens) and remove repeated words
    # while keeping the order of first occurrence.
    words = list(dict.fromkeys(text.split()))

    # Snapshot the stop words into a set for O(1) membership checks.
    stop_set = set(all_stopwords)

    lemmas = []
    for word in words:
        corrected = speller.spelled(word)
        lemma = morph.parse(corrected)[0].normal_form
        if lemma and lemma not in stop_set:
            lemmas.append(lemma)

    return lemmas if to_list else ' '.join(lemmas)

In [None]:
# Positional boundaries of the 14 consecutive chunks of the `comment` column:
# the first chunk holds rows 0..1000, the next twelve hold 1000 rows each,
# and the last one runs from 13001 up to (not including) 14411.
# NOTE(review): the final edge is hard-coded, so rows at positions >= 14411
# (if the dataset has any) do not end up in any chunk.
_chunk_edges = [0, *range(1001, 13002, 1000), 14411]

(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14) = (
    data.comment.iloc[start:stop]
    for start, stop in zip(_chunk_edges, _chunk_edges[1:])
)

In [None]:
# Add one extra stop word on top of the loaded list. The membership guard
# keeps the cell idempotent: re-running it does not append duplicates.
if 'априори' not in all_stopwords:
    all_stopwords.append('априори')

In [None]:
%%time
# Run text_preprocessing on every element of df8 (the cell is timed by %%time).
# NOTE(review): the result overwrites df8 itself, so running this cell a second
# time feeds already-cleaned text back into the preprocessing.
df8 = df8.apply(text_preprocessing)

In [None]:
from google.colab import files

# Write df8 to a local CSV (no index column) and hand the file to the
# browser through the Colab download helper.
df8.to_csv(path_or_buf=r'df8.csv', encoding='utf-8', index=False)
files.download(r'df8.csv')

In [None]:
%%time
# Run text_preprocessing on every element of df9 (the cell is timed by %%time).
# NOTE(review): the result overwrites df9 itself, so running this cell a second
# time feeds already-cleaned text back into the preprocessing.
df9 = df9.apply(text_preprocessing)

In [None]:
from google.colab import files

# Write df9 to a local CSV (no index column) and hand the file to the
# browser through the Colab download helper.
df9.to_csv(path_or_buf=r'df9.csv', encoding='utf-8', index=False)
files.download(r'df9.csv')