## Importy

Pripojenie na Google Colab

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import itertools
import unicodedata, re, string
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Načítanie datasetu

In [3]:
# Location of the raw corpus, relative to the notebook's working directory.
path = '../Data/dataset_5_big.csv'
# Read the whole CSV into a DataFrame; later cells use its `author_id` and `text` columns.
df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Number of rows (texts) per author, most frequent author first.
author_counts = df['author_id'].value_counts()
print(author_counts)
# Distinct author ids present in the raw frame.
print('Celkový počet autorov: ', df['author_id'].unique().size)

author_id
1800    160
520     102
1583    101
251      93
1181     86
       ... 
812       1
1753      1
1301      1
1627      1
1285      1
Name: count, Length: 167, dtype: int64
Celkový počet autorov:  167


## Úprava dát


In [5]:
print(df.isnull().sum())

author_id      0
text         131
dtype: int64


In [6]:
# Per-author groups; not needed for the count below, but the name is kept in
# case another cell relies on it.
grouped = df.groupby('author_id')

# Count missing values per author and per column.
# The missing-value mask is built once for the whole frame and then summed per
# author. This yields the same table as `grouped.apply(lambda x: x.isnull().sum())`
# without running a Python lambda per group and without applying a function to
# the grouping column, which newer pandas releases warn about.
null_counts_by_category = df.isnull().groupby(df['author_id']).sum()

print(null_counts_by_category)

           author_id  text
author_id                 
3                  0     2
8                  0     0
10                 0     0
18                 0     0
28                 0     0
...              ...   ...
2561               0     0
2569               0     0
3141               0     0
4383               0     0
38191              0     0

[167 rows x 2 columns]


In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

### Odstránenie konkrétnych autorov
Napríklad keď majú veľa chýbajúcich (nulových) textov. Zoznam `IDs` závisí od konkrétneho datasetu; nie je potrebné ho využívať pri všetkých datasetoch a môže ostať prázdny.

In [8]:
# Author ids to exclude from the dataset; leave the list empty to keep everyone.
IDs = [251, 53]

# Keep only rows whose author is not listed; with nothing listed, use df as it is.
dataset = df[~df['author_id'].isin(IDs)] if IDs else df

## Odstránenie autorov, ktorí majú najviac 5 diel
Dáta týchto autorov by sa nedali rozdeliť na train/test/valid sady.

In [9]:
# Number of texts per remaining author.
author_counts = dataset['author_id'].value_counts()
# Strict comparison: an author needs at least 6 texts to be kept.
authors_with_multiple_works = author_counts[author_counts > 5]

# .copy() makes the filtered frame independent of its parent, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
dataset = dataset[dataset['author_id'].isin(authors_with_multiple_works.index)].copy()

In [10]:
print(dataset.isnull().sum())

author_id    0
text         0
dtype: int64


In [11]:
dataset.head()

Unnamed: 0,author_id,text
1,132,"""It was a good kowl,"" said the leader. ""First ..."
2,132,till the Hall rang and the hounds bayed. De A...
4,132,"'Blast and me moped a good deal, and happen we..."
5,132,"Phil Garron, leaning over the side of the stea..."
6,132,"You talk o' better food for us, an' schools, a..."


In [12]:
print(dataset.shape)

(1784, 2)


In [13]:
# Texts per author after the filtering steps, most frequent author first.
author_counts = dataset['author_id'].value_counts()
print(author_counts)
# Distinct author ids left in the filtered frame.
print('Celkový počet autorov: ', dataset['author_id'].unique().size)

author_id
1800    160
1583    100
1181     86
520      70
213      62
       ... 
10        7
206       6
783       6
755       6
344       6
Name: count, Length: 82, dtype: int64
Celkový počet autorov:  82


## Preprocessing

In [14]:
def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalised first, so an accented letter is decomposed
    into a base letter plus combining marks. Encoding to ASCII with
    errors ignored then drops the marks (and any other non-ASCII character)
    while the base letter survives.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with one ASCII-only string per input token, in input order.
        Tokens that contain no ASCII characters become empty strings.
    """
    ascii_tokens = []
    for token in words:
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        ascii_tokens.append(ascii_bytes.decode('utf-8', 'ignore'))
    return ascii_tokens

def to_lowercase(words):
    """Lower-case every token.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the lower-cased tokens, in input order.
    """
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Delete punctuation from every token and drop tokens left empty.

    A character is removed when it is neither a word character (letter, digit
    or underscore) nor whitespace.

    Args:
        words: Iterable of string tokens. Empty tokens are skipped.

    Returns:
        A new list of the cleaned, non-empty tokens, in input order.
    """
    cleaned = (re.sub(r'[^\w\s]', '', word) for word in words if word)
    # Filter on the result, not only on the input: a token made purely of
    # punctuation (e.g. "--") becomes '' and would otherwise be kept as a
    # blank entry.
    return [word for word in cleaned if word]

def remove_numbers(words):
    """Delete digit runs from every token and drop tokens left empty.

    Args:
        words: Iterable of string tokens. Empty tokens are skipped.

    Returns:
        A new list of the digit-free, non-empty tokens, in input order.
    """
    # Raw string literal: "\d" inside a normal string is an invalid escape
    # sequence that Python reports with a warning.
    cleaned = (re.sub(r"\d+", "", word) for word in words if word)
    # A token consisting only of digits (e.g. "1887") becomes ''; discard it so
    # it does not turn into a double space once the tokens are joined.
    return [word for word in cleaned if word]

def remove_stopwords(words):
    """Filter out tokens that appear in NLTK's English stop-word list.

    The check is an exact string match against the entries returned by
    ``stopwords.words('english')``; the list is loaded on every call.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the tokens that are not stop words, in input order.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in words:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

def normalize(words):
    """Run the token-cleaning pipeline over a sequence of tokens.

    Steps, innermost call first: non-ASCII removal, lower-casing, digit
    removal, stop-word removal. Punctuation stripping is not part of this
    pipeline.

    Args:
        words: Iterable of string tokens.

    Returns:
        The list of cleaned tokens produced by the last step.
    """
    return remove_stopwords(
        remove_numbers(
            to_lowercase(
                remove_non_ascii(words)
            )
        )
    )

def form_sentence(tweet):
    """Tokenise a text into words with TextBlob.

    Args:
        tweet: The raw text to tokenise.

    Returns:
        The ``words`` attribute of ``TextBlob(tweet)``.
    """
    return TextBlob(tweet).words

def preprocess_text(text):
    """Clean a raw string and return it as one space-separated string.

    The text is split on whitespace, then the tokens pass through non-ASCII
    removal, lower-casing, digit removal and stop-word removal, in that order.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned tokens joined with single spaces.
    """
    tokens = text.split()
    for step in (remove_non_ascii, to_lowercase, remove_numbers, remove_stopwords):
        tokens = step(tokens)
    return ' '.join(tokens)

In [15]:
# Tokenise every raw text with form_sentence and store the result in a new
# `text_words` column.
dataset['text_words'] = dataset['text'].apply(form_sentence)

# Preview the first rows to check the new column.
dataset.head()

Unnamed: 0,author_id,text,text_words
1,132,"""It was a good kowl,"" said the leader. ""First ...","[It, was, a, good, kowl, said, the, leader, Fi..."
2,132,till the Hall rang and the hounds bayed. De A...,"[till, the, Hall, rang, and, the, hounds, baye..."
4,132,"'Blast and me moped a good deal, and happen we...","['Blast, and, me, moped, a, good, deal, and, h..."
5,132,"Phil Garron, leaning over the side of the stea...","[Phil, Garron, leaning, over, the, side, of, t..."
6,132,"You talk o' better food for us, an' schools, a...","[You, talk, o, better, food, for, us, an, scho..."


In [16]:
print('\n\nTakto vyzerajú aktuálne slová: ')
# head(10) yields at most ten entries, so a frame with fewer than ten rows is
# printed in full instead of raising IndexError on a positional lookup.
for words in dataset['text_words'].head(10):
    print(words)

Output hidden; open in https://colab.research.google.com to view.

In [17]:
# Run the normalize cleaning pipeline over each row's token list and keep the
# result in a new `processed_text` column.
dataset['processed_text']  = dataset['text_words'].apply(normalize)

In [18]:
print('\n\nTakto vyzerajú aktuálne slová: ')
# head(10) yields at most ten entries, so a frame with fewer than ten rows is
# printed in full instead of raising IndexError on a positional lookup.
for words in dataset['processed_text'].head(10):
    print(words)



Takto vyzerajú aktuálne slová: 
['talk', 'better', 'food', 'us', 'schools', 'fires', "'ll", 'wait', 'extry', 'rations', 'treat', 'us', 'rational', "n't", 'mess', 'cook-room', 'slops', 'prove', 'face', 'widow', "'s", 'uniform', 'soldier-man', "'s", 'disgrace', "'s", 'tommy', 'tommy', 'chuck', 'brute', "'s", 'saviour', "'is", 'country', 'guns', 'begin', 'shoot', "'s", 'tommy', 'tommy', 'anything', 'please', 'tommy', 'ai', "n't", 'bloomin', 'fool', 'bet', 'tommy', 'sees', 'remember', "'re", "'acking", 'round', 'gilded', 'burma', 'god', "'is", 'eyes', 'often', 'precious', 'stones', 'treat', 'nigger', 'dose', "cleanin'-rod", 'e', "'s", 'like', 'show', 'everything', 'e', 'owns', 'e', 'wo', "n't", 'prodooce', 'pour', 'water', 'floor', "'ear", 'answer', "'ollow", 'boot', 'cornet', 'toot', 'toot', 'ground', 'begins', 'sink', 'shove', 'baynick', 'chink', 'must', 'leave', "'im", 'careful', 'e', 'fell', 'may', 'thank', 'stars', 'gaiters', "n't", 'feel', "'is", 'knife', 'ai', "n't", 'told', 'bury

In [19]:
dataset.head()

Unnamed: 0,author_id,text,text_words,processed_text
1,132,"""It was a good kowl,"" said the leader. ""First ...","[It, was, a, good, kowl, said, the, leader, Fi...","[good, kowl, said, leader, first, chaplain, la..."
2,132,till the Hall rang and the hounds bayed. De A...,"[till, the, Hall, rang, and, the, hounds, baye...","[till, hall, rang, hounds, bayed, de, aquila, ..."
4,132,"'Blast and me moped a good deal, and happen we...","['Blast, and, me, moped, a, good, deal, and, h...","['blast, moped, good, deal, happen, n't, behav..."
5,132,"Phil Garron, leaning over the side of the stea...","[Phil, Garron, leaning, over, the, side, of, t...","[phil, garron, leaning, side, steamer, rain, f..."
6,132,"You talk o' better food for us, an' schools, a...","[You, talk, o, better, food, for, us, an, scho...","[talk, better, food, us, schools, fires, 'll, ..."


In [20]:
# Collapse each token list into one space-separated string.
# The isinstance guard makes the cell safe to re-run: a value that is already
# a string is left alone, whereas joining it again would put a space between
# every character.
dataset['processed_text'] = dataset['processed_text'].apply(
    lambda x: x if isinstance(x, str) else " ".join(x)
)

In [21]:
dataset.head()

Unnamed: 0,author_id,text,text_words,processed_text
1,132,"""It was a good kowl,"" said the leader. ""First ...","[It, was, a, good, kowl, said, the, leader, Fi...",good kowl said leader first chaplain laughed t...
2,132,till the Hall rang and the hounds bayed. De A...,"[till, the, Hall, rang, and, the, hounds, baye...",till hall rang hounds bayed de aquila held han...
4,132,"'Blast and me moped a good deal, and happen we...","['Blast, and, me, moped, a, good, deal, and, h...",'blast moped good deal happen n't behave well ...
5,132,"Phil Garron, leaning over the side of the stea...","[Phil, Garron, leaning, over, the, side, of, t...",phil garron leaning side steamer rain felt unh...
6,132,"You talk o' better food for us, an' schools, a...","[You, talk, o, better, food, for, us, an, scho...",talk better food us schools fires 'll wait ext...


In [22]:
# Per-column count of missing values after preprocessing.
nan_values = dataset.isna().sum()
nan_values

author_id         0
text              0
text_words        0
processed_text    0
dtype: int64

In [23]:
# Keep only the label column and the cleaned text for export.
new_dataset = dataset.loc[:, ['author_id', 'processed_text']]

In [24]:
new_dataset.head()

Unnamed: 0,author_id,processed_text
1,132,good kowl said leader first chaplain laughed t...
2,132,till hall rang hounds bayed de aquila held han...
4,132,'blast moped good deal happen n't behave well ...
5,132,phil garron leaning side steamer rain felt unh...
6,132,talk better food us schools fires 'll wait ext...


## Vytvorenie nového csv súboru

In [25]:
# Output file name; the CSV is written to the current working directory.
new_name = 'final_dataset_5.csv'
# index=False leaves the DataFrame index out of the file.
new_dataset.to_csv(path_or_buf=new_name, index=False)

## Stiahnutie súboru


In [26]:
# The google.colab package only exists inside a Colab runtime. When the
# notebook runs elsewhere the CSV has already been written to disk, so the
# browser download is skipped and the cell does not fail.
try:
    from google.colab import files
except ImportError:
    print(f'Google Colab nie je dostupný; súbor ostáva uložený lokálne: {new_name}')
else:
    files.download(new_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>