In [2]:
import pandas as pd
from IPython.display import clear_output
from TextAnalysis.comments_extractor import comments_extractor
from TextAnalysis.preprocessor import TextPreprocessor
from TextAnalysis.descriptive_statistics import word_length_distribution, calculate_word_statistics, plot_most_frequent_words
from TextAnalysis.words_replacing import WordReplacer
from TextAnalysis.stem_and_lem  import TextStemLem
from TextAnalysis.pre_prepocessing import hash_text, change_time_format
from TextAnalysis.word_mapping import word_mapping

# Clear this cell's output area (presumably to hide messages emitted by the imports above).
clear_output()

---

# Extraction of comments using the Reddit API

In [None]:
# Two distinct DataFrames are created, one per comments_extractor call
# (the second argument is presumably the ID of the Reddit thread -- confirm in comments_extractor).
# It takes a few minutes to extract all comments, so consider making tea in the meantime.

comments_after_win = comments_extractor('ACMilan', '17pzwvv') 
comments_after_lose = comments_extractor('ACMilan', '17gb1xz')

In [6]:
# Load the two comment dumps from their CSV files (win frame first, then lose frame).
comments_after_win, comments_after_lose = (
    pd.read_csv(csv_path) for csv_path in ('milan_win.csv', 'milan_lost.csv')
)


You can check that everything is OK simply by writing the name of a DataFrame and running the cell. Before that, however, I want to apply two functions that give a better presentation of the data:
- hash the names of the people leaving comments;
- change the date-time representation so that it is more user-friendly.

To do so, I use functions from the `pre_prepocessing` module in the TextAnalysis folder.

In [7]:
# For each frame: hash the 'author' column first, then change the format in
# which the 'created_utc' date and time are represented.
comments_after_win = change_time_format(
    hash_text(comments_after_win, column_name='author'),
    column_name='created_utc',
)
comments_after_lose = change_time_format(
    hash_text(comments_after_lose, column_name='author'),
    column_name='created_utc',
)

---

### The DataFrames after pre-preprocessing:

In [None]:
# Preview the first five rows (head() defaults to n=5).
comments_after_lose.head()

In [None]:
# Preview the first five rows (head() defaults to n=5).
comments_after_win.head()

If you do not want to run the same command over and over again, consider saving the data
to a CSV file and naming it the way you like. After that, just assign it to a variable. For example:

```py
comments_after_lose.to_csv('milan_lost.csv', index=False)
sad_comments = pd.read_csv('milan_lost.csv')
```

I will continue with the previous name.

---

# Preprocessing Stage

#### Now we use quite a few functions to preprocess the data
See the "preprocessor", "words_replacing" and "stem_and_lem" modules in the TextAnalysis folder; all the documentation is written there.

After preprocessing, we expect the text of the comments to be:
- In lower case;
- Without any links;
- Without punctuation;
- Without stopwords (words that don't have much meaning);
- Without any special characters;
- Containing only words with more than 2 letters (to avoid nonsense words that might appear during previous stages);
- With curse words masked;
  
Additionally, I manually replace some words that refer to the same thing (or person) but would otherwise be treated as different words when analyzing the data. This mainly concerns the surnames of players. For example, Milan fans may call Christian Pulisic "Puli" or "Pulisich", and "RLC" stands for "Ruben Loftus-Cheek". No Python library can trace this, so I have to change it manually whenever I notice something. This can surely affect the analysis of the comments, but there is no research without limitations. For that, I use the "words_replacing" module in the TextAnalysis folder.

Lastly, I want to add two columns to the dataframe: one with stemmed words and another with lemmatized words. Those are needed because I want to compare the text I receive after preprocessing with even "purer" data, to get a better view of the situation. For that, I use the "stem_and_lem" module in the TextAnalysis folder.

We have to make instances of the class TextPreprocessor to apply all the functions:

In [8]:
# Wrap each comment frame in its own TextPreprocessor (lose frame first, then win frame).
preprocessor_lose, preprocessor_win = map(
    TextPreprocessor, (comments_after_lose, comments_after_win)
)

After that we create dataframes out of the instances:

In [9]:
# 'body' is the name of the column that holds the text of the comments,
# so it is the column passed to apply_all for both frames.
preprocessed_lose_df, preprocessed_win_df = (
    preprocessor_lose.apply_all('body'),
    preprocessor_win.apply_all('body'),
)

Now we can look at what we get:

In [None]:
# Preview the first five preprocessed rows (head() defaults to n=5).
preprocessed_lose_df.head()

In [None]:
# Preview the first five preprocessed rows (head() defaults to n=5).
preprocessed_win_df.head()

Now let's turn to the replacement of words that refer to the same person:

In [11]:
# First, create one WordReplacer instance per preprocessed frame.
word_replacer_win = WordReplacer(preprocessed_win_df)
word_replacer_lose = WordReplacer(preprocessed_lose_df)

# Then replace words in the 'body' column using the `word_mapping` dictionary
# imported from the word_mapping module. The results are stored under the same
# DataFrame names to avoid confusion.
preprocessed_win_df = word_replacer_win.replace_words('body', word_mapping)
# Fix: this result used to be assigned to `word_replacer_lose`, which overwrote the
# replacer object and never stored the returned frame in `preprocessed_lose_df`.
preprocessed_lose_df = word_replacer_lose.replace_words('body', word_mapping)

In [None]:
# It works, as we have "leao" instead of "rafa".
preprocessed_win_df.head()

##### Now we can proceed with adding columns with stemmed and lemmatized words:

In [12]:
%%capture

# %%capture (which must stay on the first line of the cell) captures everything
# this cell would otherwise print or display.

# First, wrap each preprocessed frame in its own TextStemLem instance:
tsl_w = TextStemLem(preprocessed_win_df)
tsl_l = TextStemLem(preprocessed_lose_df)

# Then stem the words of the 'body' column
# (presumably stored as a new column on the wrapped frame -- confirm in stem_and_lem)
tsl_w.stem_words('body')
tsl_l.stem_words('body')

# Finally, lemmatize the words of the 'body' column
# (presumably stored as a new column on the wrapped frame -- confirm in stem_and_lem)
tsl_w.lemmatize_words('body')
tsl_l.lemmatize_words('body')

In [13]:
# Read the frames back from the `dataframe` attribute of each TextStemLem instance.
preprocessed_win_df, preprocessed_lose_df = tsl_w.dataframe, tsl_l.dataframe

In [15]:
# As you can see, everything works and we are done with the preprocessing part.
preprocessed_win_df.head()

Unnamed: 0,author,body,created_datetime,stemmed_words,lemmatized_words
0,e747f9104ee94c81cea8882e72569ff2ffaf7d7d0bf760...,match thread todays match testing new feature ...,2023-11-07 17:47:32,match thread today match test new featur reddi...,match thread today match testing new feature r...
1,c781e576544f18d1a772440f1e38b705e047cb0360936a...,rlc different type animal today holy sh****,2023-11-07 22:02:38,rlc differ type anim today holi sh****,rlc different type animal today holy sh****
2,b7aa5347ca21d3a470856780720af01c6f2c9581185542...,leao world class tonight running like crazy,2023-11-07 21:42:52,leao world class tonight run like crazi,leao world class tonight running like crazy
3,e52216b2a493bd566000fde19217047c63ff6e1f21c1c1...,rlc signing summer confirmed,2023-11-07 20:51:49,rlc sign summer confirm,rlc signing summer confirmed
4,439667f389bbc1317e68675bfd5f5080d800f2fc7df6b9...,calabria pocketing superstar,2023-11-07 21:59:58,calabria pocket superstar,calabria pocketing superstar


---