# Data gathering and cleaning

## Before running the LDA model, we'll need to get the data into a clean, usable format:

- Get dataframe of texts from Academy urls 
- Create corpus (text must be combined per url, for now): df(url: combined_content)
- Perform first round of data cleaning
    - unwanted symbols
    - make lowercase
    - remove numbers
- Second round of cleaning
    - lemmatisation
    - stemming
    - remove stop words

## 1. Import Academy texts dataframe 

In [None]:
import pandas as pd
import pickle

In [None]:
df = pd.read_csv('../04_Data/processed_posts.csv', index_col=0)

In [None]:
df.head(30)

In [None]:
df.info()

## 2. Create corpus

We will use the initial setup (looking at specific HTML tags) as well as the per-article analysis (which is more useful for EDA). For that we need to drop the 'tag', 'title' and 'published' columns and combine rows with the same url.

### 2.1 Corpus with articles separated by tags

In [None]:
# Keep only the columns needed for the corpus. Take an explicit copy so that
# df_separated is an independent DataFrame rather than a selection of df;
# any later in-place column assignment on it then cannot trigger pandas'
# SettingWithCopyWarning.

df_separated = df[['url', 'content']].copy()

In [None]:
df_separated

In [None]:
# Everything looks good so far, let's just pick a random article
# and check that everything is okay

df_separated.loc[45].content

In [None]:
# Serialise the per-tag corpus DataFrame to disk using the highest pickle protocol.
# NOTE: open() will not create the 'pkl_cellar' directory; it has to exist already.
with open('pkl_cellar/corpus_separated.pkl', 'wb') as ds:
    pickle.dump(df_separated, ds, protocol=pickle.HIGHEST_PROTOCOL)

### 2.2 Corpus with articles as a whole
This is mostly for EDA, to make sure the cleaned data offers valid results

In [None]:
# Merge all rows that share a url into a single row whose content is the
# space-joined text of that url's rows; reset_index() turns 'url' back into
# a regular column.
# Should only have 159 rows
# NOTE(review): ' '.join raises TypeError on non-string values (e.g. NaN) —
# confirm 'content' has no missing entries.

df_combined = df_separated.groupby(['url'])['content'].apply(' '.join).reset_index()

In [None]:
df_combined

In [None]:
df_combined.loc[45].content

In [None]:
# Serialise the per-article corpus DataFrame to disk before moving on to cleaning.
# NOTE: open() will not create the 'pkl_cellar' directory; it has to exist already.

with open('pkl_cellar/corpus_combined.pkl', 'wb') as dc:
    pickle.dump(df_combined, dc, protocol=pickle.HIGHEST_PROTOCOL)

## 3. Data cleaning round 1!

Convert to lowercase and remove punctuation and numbers.

In [None]:
test = df_separated.loc[45].content

In [None]:
import re
import string

In [None]:
def cleaning_round1(text):
    """Apply the first cleaning pass to a single text.

    Steps, in order: lowercase; replace hyphens with spaces; strip ASCII
    punctuation; replace non-breaking spaces (U+00A0) with regular spaces;
    drop numbers and any word containing a digit; strip curly quotes and
    en dashes.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens
        can leave consecutive spaces behind.
    """
    text = text.lower()
    # Hyphens become spaces (before punctuation is stripped) so hyphenated
    # compounds split into separate words instead of being fused together.
    text = text.replace('-', ' ')

    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\xa0', ' ')
    # Raw string: '\w' and '\d' in a plain literal are invalid escape
    # sequences and raise a SyntaxWarning on recent Python versions.
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[”“–‘’]', '', text)

    return text

In [None]:
test_round1 = cleaning_round1(test)
test_round1

## 4. Data cleaning round 2!

The big guns are coming out: lemmatisation, tokenization, stopword removal. (Perhaps bigrams and trigrams at a later stage.)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy

### 4.1 Using lemmatization to reduce words to their root words

In [None]:
# lemmatisation

_SPACY_MODEL = None


def _get_spacy_model():
    """Return the shared 'en_core_web_sm' pipeline, loading it on first use."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load('en_core_web_sm')
    return _SPACY_MODEL


def lemmatizer(text):
    """Replace every token of *text* with its spaCy lemma.

    Args:
        text: The text to lemmatise.

    Returns:
        The lemmas joined by single spaces, with every '-PRON-' occurrence
        (presumably spaCy's pronoun lemma placeholder) replaced by 'i'.
    """
    # Loading the model is expensive and this function is applied row by row,
    # so the pipeline is loaded once and reused instead of on every call.
    nlp = _get_spacy_model()
    lemmas = ' '.join(token.lemma_ for token in nlp(text))
    return lemmas.replace('-PRON-', 'i')

In [None]:
text_lem = lemmatizer(test_round1)
text_lem

### 4.2 Tokenizing (separating words)

In [None]:
# tokenize texts

def tokenize(text):
    """Split *text* into a list of word tokens using NLTK's word_tokenize."""
    return word_tokenize(text)

In [None]:
text_tkn = tokenize(text_lem)

In [None]:
text_tkn

### 4.3 Removing stop words from tokenized list

In [None]:
# remove stopwords

_ENGLISH_STOPWORDS = None


def remove_stopwords(text):
    """Drop English stop words from a tokenized text.

    Args:
        text: An iterable of word tokens (despite the name, not a raw string).

    Returns:
        The remaining tokens joined into one space-separated string.
    """
    # Build the stop-word set once; this function is applied row by row and
    # re-reading the NLTK corpus on every call is wasted work.
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    return ' '.join(word for word in text if word not in _ENGLISH_STOPWORDS)

In [None]:
text_no_sw = remove_stopwords(text_tkn)

In [None]:
text_no_sw

## 5. Running all texts through cleaner

- Step 1: Create one function to clean a test list
- Step 2: Test it!
- Step 3: Run entire set through cleaner
- Step 4: Pickle it!

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [None]:
# Three hand-picked rows (index labels 6, 37 and 28) used to try out the
# combined cleaning function before running it on the whole corpus.
test_df = df_separated.loc[[6, 37, 28]]

In [None]:
test_df

In [None]:
def nlp_cleaner(df):
    """Run both cleaning rounds over the 'content' column of a DataFrame.

    Round 1 is cleaning_round1; round 2 is lemmatizer, tokenize and
    remove_stopwords, applied in that order to every row.

    Args:
        df: DataFrame with a 'content' column of raw text.

    Returns:
        A new DataFrame with the same columns whose 'content' holds the
        cleaned text. The input frame is left untouched.
    """
    # Work on a copy: assigning into the caller's frame would overwrite the
    # raw text, trigger SettingWithCopyWarning when given a slice, and make
    # re-running a notebook cell clean already-cleaned text.
    cleaned = df.copy()

    cleaned['content'] = (
        cleaned['content']
        .apply(cleaning_round1)
        # cleaning round 2
        .apply(lemmatizer)
        .apply(tokenize)
        .apply(remove_stopwords)
    )

    return cleaned

In [None]:
nlp_cleaner(test_df)

### 5.1 Everything works, time to take the entire df_separated to the cleaners
... then it's pickle time

In [None]:
# needs about an hour to run

#lda_df = nlp_cleaner(df_separated)

In [None]:
# Load the cleaned per-tag corpus from disk rather than re-running
# nlp_cleaner(df_separated).
# NOTE(review): nothing visible in this notebook writes 'corpus_processed.pkl' —
# presumably it was saved during an earlier run; confirm.
# pickle.load can execute arbitrary code from the file, so only load pickles
# that were created locally.
with open('pkl_cellar/corpus_processed.pkl', 'rb') as file:
    lda_df = pickle.load(file)

lda_df

### 5.2 Now using nlp_cleaner( ) on df_combined

In [None]:
# Clean the per-article corpus (one row per url) with the full pipeline.
# takes about 5 minutes to run

lda_combined_articles = nlp_cleaner(df_combined)

In [None]:
lda_combined_articles

In [None]:
# with open('pkl_cellar/corpus_processed_full_articles.pkl', 'wb') as fa:
#    pickle.dump(lda_combined_articles, fa, protocol=pickle.HIGHEST_PROTOCOL)