# Cleaning and Basic Text Analysis

Methods for cleaning the segmented and disaggregated text files and performing word counts, chapter counts, stopword removal, and other basic frequency calculations and enrichment processes. 

## Install Packages and Upload Files

In [None]:
import os
import pandas as pd

import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

import re

#Get dictionary of English words to keep 
from nltk.corpus import words
#nltk.download('words')
#nltk.download('wordnet')
from nltk import WordNetLemmatizer

from collections import Counter

In [None]:
#Get current working directory 
path = os.getcwd()
print(path)

#Change working directory to location of segmented texts
#os.chdir() returns None, so read the new location back instead of assigning its result
os.chdir("/PATHNAME")
path = os.getcwd()

#Upload dataframe with segmented texts
clean_df = pd.read_csv('FILENAME.csv')

#Drop first column (unnamed)
#NOTE(review): presumably a row index written out by an earlier to_csv call - confirm
clean_df = clean_df.iloc[: , 1:]

#Make text column string values so the .str methods used below work on every row
clean_df['Text'] = clean_df['Text'].astype(str)

clean_df

## Basic Cleaning

In [None]:
#Add a lowercased version of the raw text in 'Clean_Text'; the 'Text' column is left as it is
lowercased = clean_df['Text'].str.lower()
clean_df['Clean_Text'] = lowercased
clean_df

In [None]:
#Replace every run of characters that are neither word characters nor whitespace with one space
punct_pattern = re.compile(r'[^\w\s]+')
clean_df['Clean_Text'] = [punct_pattern.sub(' ', text) for text in clean_df['Clean_Text']]
clean_df

In [None]:
#Collapse every run of two or more spaces into a single space
spaced_text = clean_df['Clean_Text']
clean_df['Clean_Text'] = spaced_text.str.replace('  +', ' ', regex=True)
clean_df

In [None]:
#Remove numbers and extraneous characters
#Raw string keeps "\d" from being read as an (invalid) string escape
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace(r'\d+', '', regex=True)
#Underscores match \w, so a [^\w\s] punctuation pattern does not catch them; remove them literally
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace('_', '', regex=False)
#Deleting digits and underscores can leave doubled spaces behind, so collapse them again
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace('  +', ' ', regex=True)
clean_df

In [None]:
#Change path to where you want to save the files
#os.chdir() returns None, so read the new location back instead of assigning its result
os.chdir("/PATHNAME")
path = os.getcwd()

#Save cleaned dataframe to working directory
#index=False keeps the row index out of the saved file
clean_df.to_csv('FILENAME.csv', index=False)

## Advanced Cleaning: Stopword Removal, Lemmatization, and English-Only Filtering

In [None]:
#Work on a separate copy so the columns added during advanced cleaning do not appear in clean_df
adv_clean_df = clean_df.copy(deep=True)

In [None]:
#Build the English stopword lookup once as a set
stop_words = set(stopwords.words("english"))

#Rebuild each cleaned text from only the tokens that are not stopwords
adv_clean_df['Text_NoStops'] = [
    ' '.join(token for token in text.split() if token not in stop_words)
    for text in adv_clean_df['Clean_Text']
]
adv_clean_df

In [None]:
#Define list of words to keep from nltk words
#Read the corpus once and reuse the result instead of loading it a second time
words_list = words.words()
#A set makes the membership checks used when filtering faster than a list would
my_words = set(words_list)

In [None]:
#The words corpus has no plural forms, so reduce each token to its lemma before filtering
#nltk.download('omw-1.4')
wnl = WordNetLemmatizer()

#Lemmatizes 'Clean_Text' (stopwords included); swap the column to lemmatize text without stopwords
adv_clean_df['Text_Lemmas'] = [
    ' '.join(wnl.lemmatize(token) for token in text.split())
    for text in adv_clean_df['Clean_Text']
]
adv_clean_df

In [None]:
#Collect the lemmatized texts as a plain list of strings, one entry per row
adv_clean_df['Text_Lemmas'] = adv_clean_df['Text_Lemmas'].astype(str)
data = adv_clean_df['Text_Lemmas'].tolist()
data

In [None]:
#Split every text into its words, then flatten the per-text lists into one list of words
import itertools

all_words = [text.split() for text in data]
all_words_list = list(itertools.chain.from_iterable(all_words))
len(all_words_list)

In [None]:
#Rebuild each lemmatized text from only the lemmas that also appear in the words corpus
adv_clean_df['English_Text'] = [
    ' '.join(token for token in text.split() if token in my_words)
    for text in adv_clean_df['Text_Lemmas']
]
adv_clean_df

In [None]:
#Collect the English-only texts as a plain list of strings, one entry per row
adv_clean_df['English_Text'] = adv_clean_df['English_Text'].astype(str)
kept_data = adv_clean_df['English_Text'].tolist()
kept_data

In [None]:
#Split every English-only text into its words, then flatten into one list of kept words
import itertools

kept_words = [text.split() for text in kept_data]
kept_words_list = list(itertools.chain.from_iterable(kept_words))
len(kept_words_list)

In [None]:
#Count the distinct words that were dropped: every word seen before filtering minus every word kept
removed_list = set(all_words_list).difference(kept_words_list)
len(removed_list)

In [None]:
#Turn the removed words into a sorted list so they can be inspected
removed_list = sorted(removed_list)
removed_list

In [None]:
#Store the removed words in a single-column dataframe
col_name = ['Removed Words']
removed_words_df = pd.DataFrame(data=removed_list, columns=col_name)
removed_words_df

In [None]:
#Drop stopwords from the English-only text and keep the result in a new column
adv_clean_df['English_Text_NoStops'] = [
    ' '.join(token for token in text.split() if token not in stop_words)
    for text in adv_clean_df['English_Text']
]
adv_clean_df

In [None]:
#Change path to where you want to save the files
#os.chdir() returns None, so read the new location back instead of assigning its result
os.chdir("/PATHNAME")
path = os.getcwd()

#Save dataframe with kept words and titles
#index=False keeps the row index out of the saved file
adv_clean_df.to_csv('FILENAME.csv', index=False)

#Save removed words dataframe to working directory
removed_words_df.to_csv('Removed_Words.csv', index=False)

## Basic Text Analysis

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.text import Text

#Download the tokenizer data that word_tokenize relies on
#Newer NLTK releases look for 'punkt_tab'; older releases use 'punkt', so fetch both
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
#Work on a separate copy so the count columns added below do not appear in adv_clean_df
df_counts = adv_clean_df.copy(deep=True)

In [None]:
#Get number of words in each chunk
#Make sure to use original texts (not cleaned)
#split() with no argument splits on any run of whitespace, so repeated spaces do not
#add empty "words" and words separated by tabs or line breaks are counted separately
ch_words = df_counts["Text"].apply(lambda x: len(str(x).split()))

#Append word counts of each chapter chunk to dataframe
df_counts["Word Count"] = ch_words
df_counts

In [None]:
#Tally every word in the 'Text_NoStops' column across all rows and show the 20 most frequent
token_counts = Counter(
    token for text in df_counts["Text_NoStops"] for token in text.split()
)
token_counts.most_common(20)

In [None]:
## Concordancing

# Tokenize the original text of each row
# (the column is named 'Text'; there is no lowercase 'text' column)
df_counts['tokens'] = df_counts['Text'].apply(word_tokenize)

# Create an NLTK Text object from every token in df_counts, in row order
text_object = Text(word for tokens in df_counts['tokens'] for word in tokens)

# Define the target word
target_word = "sentence"

# Rough amount of context wanted on each side of the target word
context_size = 5

# Perform concordancing
# NOTE: NLTK measures `width` in characters, not words, so this value does not
# translate directly into `context_size` words - raise it if the lines look too short
concordance_list = text_object.concordance_list(target_word, width=context_size * 2)

# Display the concordance lines
for line in concordance_list:
    print(line.line)

## Text Analysis with SpaCy

In [None]:
# Install spaCy
!pip install spaCy

# Download the English language model (en_core_web_sm)
!spacy download en_core_web_sm

In [None]:
# Import spacy
import spacy

# Load spaCy visualizer
from spacy import displacy

# Import os to upload documents and metadata
import os

# Import pandas DataFrame packages
import pandas as pd

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load the English nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Print the names of the components the pipeline runs
pipeline_components = nlp.pipe_names
print(pipeline_components)

In [None]:
# Work on a separate copy of clean_df so the spaCy columns added below do not appear in it
df_spaCy = clean_df.copy(deep=True)

In [None]:
# Wrap the nlp pipeline call in a function so it can be applied to a dataframe column
def process_text(text):
    """Run the loaded nlp pipeline on ``text`` and return its result."""
    doc = nlp(text)
    return doc

In [None]:
# Run the nlp pipeline on every entry of the "Text" column and keep the results in a "Doc" column
docs = df_spaCy['Text'].apply(process_text)
df_spaCy['Doc'] = docs

In [None]:
# Define a function that collects the text of every token in a doc object
def get_token(doc):
    """Return a list with the text of each token in ``doc``, in order."""
    token_texts = []
    for token in doc:
        token_texts.append(token.text)
    return token_texts

# Store each row's token list in a new "Tokens" column and preview the dataframe
tokens_by_row = df_spaCy['Doc'].apply(get_token)
df_spaCy['Tokens'] = tokens_by_row
df_spaCy.head()

In [None]:
# Define a function that collects the lemma of every token in a doc object
def get_lemma(doc):
    """Return a list with the lemma of each token in ``doc``, in order."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

# Store each row's lemma list in a new "Lemmas" column
lemmas_by_row = df_spaCy['Doc'].apply(get_lemma)
df_spaCy['Lemmas'] = lemmas_by_row

In [None]:
# Define a function that collects the part-of-speech tags of every token in a doc object
def get_pos(doc):
    """Return one ``(pos_, tag_)`` pair per token in ``doc``, in order.

    ``pos_`` is the coarse-grained and ``tag_`` the fine-grained part of speech.
    """
    pos_pairs = []
    for token in doc:
        pos_pairs.append((token.pos_, token.tag_))
    return pos_pairs

# Store each row's list of part-of-speech pairs in a new "POS" column
pos_by_row = df_spaCy['Doc'].apply(get_pos)
df_spaCy['POS'] = pos_by_row

In [None]:
# Define a function that keeps only the tokens tagged as proper nouns in a Doc object
def extract_proper_nouns(doc):
    """Return the text of every token in ``doc`` whose ``pos_`` is ``'PROPN'``."""
    proper_nouns = []
    for token in doc:
        if token.pos_ != 'PROPN':
            continue
        proper_nouns.append(token.text)
    return proper_nouns

# Store each row's list of proper nouns in a new "Proper_Nouns" column
proper_nouns_by_row = df_spaCy['Doc'].apply(extract_proper_nouns)
df_spaCy['Proper_Nouns'] = proper_nouns_by_row

In [None]:
# Define a function that collects the label of every named entity in a doc object
def extract_named_entities(doc):
    """Return the ``label_`` of each entity in ``doc.ents``, in order."""
    labels = []
    for ent in doc.ents:
        labels.append(ent.label_)
    return labels

# Store each row's list of entity labels in a new "Named_Entities" column and display it
labels_by_row = df_spaCy['Doc'].apply(extract_named_entities)
df_spaCy['Named_Entities'] = labels_by_row
df_spaCy['Named_Entities']

In [None]:
# Define function to extract text tagged with named entities from doc objects
# NOTE: this reuses (and therefore replaces) the name of the label-extracting function
# defined in an earlier cell
def extract_named_entities(doc):
    """Return the text of each entity in ``doc.ents``, in order.

    Returns plain strings (``ent.text``) rather than the entity objects themselves,
    so the column built from this function holds text.
    """
    return [ent.text for ent in doc.ents]

# Store what extract_named_entities returns for each row in a new "NE_Words" column and display it
ne_words_by_row = df_spaCy['Doc'].apply(extract_named_entities)
df_spaCy['NE_Words'] = ne_words_by_row
df_spaCy['NE_Words']

In [None]:
# Extract the first Doc object
# .iloc[0] selects by position, so it returns the first row whatever the index labels are
doc = df_spaCy['Doc'].iloc[0]

# Visualize named entity tagging in a single text
displacy.render(doc, style='ent', jupyter=True)