# Text Mining: Processing text data
* Python String and text basics
* Regular Expression
* Using Spacy for text normalization & parsing

## Python String and Text Basics

In [100]:
#define a variable

text = 'Welcome to LIS 501!'
text

'Welcome to LIS 501!'

In [101]:
#a text is just a sequence of characters (including whitespace and other punctuations)

characters = list(text)

#characters = [char for char in text]
characters

['W',
 'e',
 'l',
 'c',
 'o',
 'm',
 'e',
 ' ',
 't',
 'o',
 ' ',
 'L',
 'I',
 'S',
 ' ',
 '5',
 '0',
 '1',
 '!']

In [102]:
text.lower()

'welcome to lis 501!'

In [103]:
text.upper()

'WELCOME TO LIS 501!'

In [104]:
text.title()

'Welcome To Lis 501!'

In [105]:
text.capitalize()

'Welcome to lis 501!'

In [106]:
#concatenate strings
'Welcome' + ' ' + 'to' + ' ' + 'LIS' + ' ' + '501' + ' ' + '!'

'Welcome to LIS 501 !'

In [107]:
#concatenate a list of string with a separator
' '.join(['Welcome','to','LIS','501','!'])

'Welcome to LIS 501 !'

In [108]:
# prompt: how does .join() work?

# .join() takes an iterable (like a list or tuple) of strings as input.
# It concatenates the strings in the iterable, using the string that .join() is called on as a separator between each string.

# Example:
my_list = ["Hello", "world", "!"]
separator = " "
joined_string = separator.join(my_list)
print(joined_string)  # Output: Hello world !

# In this example, the " " (space) is used as a separator to join the strings in my_list.


Hello world !


In [109]:
#' '.join(['Welcome','to','LIS','501','!'])

my_list2 = ['Welcome','to','LIS','501','!']
separator2 = ' '
joined_string2 = separator.join(my_list)

joined = joined_string2

In [110]:
print(my_list, separator, joined_string)

['Hello', 'world', '!']   Hello world !


In [111]:
joined*3

'Hello world !Hello world !Hello world !'

In [112]:
len(joined)

13

In [113]:
joined[0:3]

'Hel'

In [114]:
joined[0:-3]

'Hello worl'

In [115]:
joined.split()

['Hello', 'world', '!']

In [116]:
text = "Welcome to LIS 501, Mike!"

In [117]:
text.split(',')

['Welcome to LIS 501', ' Mike!']

## In-class Exercise 1

In [118]:
name = 'First Last'
name

'First Last'

In [119]:
# Class Exercise: get first & last name initials
# For example, if name = 'Peter Pan', you should get 'PP'

#step 1: split the name
#aim to get ['First', 'Last']

name.split()

['First', 'Last']

In [120]:
#step 2: get the first letter of each part
#aim to get ['F', 'L']

[b[0] for b in name.split()]

['F', 'L']

In [121]:
fullname = name.split()

In [122]:
[word[0] for word in fullname]

['F', 'L']

In [123]:
fullname = "First Last"

In [124]:
name = fullname.split()
name


['First', 'Last']

In [125]:
print(name[0][0], name[1][0])

F L


## Regular Expression

Just some basic examples; more tutorial can be found at: https://www.w3schools.com/python/python_regex.asp

In [126]:
import re

text = 'Welcome to LIS 501!'

# [A-Z] matches any uppercase character
# [A-Z]+ matches any uppercase character appearing one too many times
# found all substrings that are all upercase

In [127]:
re.findall("[A-Z]", text)

['W', 'L', 'I', 'S']

In [128]:
re.findall("[A-Z]+", text)

['W', 'LIS']

In [129]:
# prompt: Give me a regex for find every sentence with the word "data"

import re

text = "This is some data. More data here. No sight."
pattern = "[^.?!]*(?<=[.?\s!])data(?=[\s.?!])[^.?!]*[.?!]"

matches = re.findall(pattern, text)
print(matches)


['This is some data.', ' More data here.']


  pattern = "[^.?!]*(?<=[.?\s!])data(?=[\s.?!])[^.?!]*[.?!]"


In [130]:
# [0-9] matches any digit
# [0-9]+ matches any digit appearing one to many times
# found all substrings that are all digits

In [131]:
# . matches any character
# found all substrings that starts with a 'W' and ends with an 'o'; "greedy" match, match as much text as possible

In [132]:
# found all substrings that starts with a 'W' and ends with an 'o'; "reluctant" match, match as few text as possible

In [133]:
# \s matches any whitespace
# split by 1 or more whitespace

In [134]:
# replace 1 or more whitespace by [WHITESPACE]

## Using Spacy for Text Normalization and Parsing
- Sentence segmentation, tokenization, stop words removal, lemmatization (stemming)
- POS tagging, NP chunking, Named Entity Recognition
- https://spacy.io/

In [135]:
# make sure the required python packages are installed

# install nltk (we'll use 3.6.7 in Fall 2022)
!pip install nltk==3.6.7 --upgrade

# install spacy (we'll use 3.2.1 in Fall 2022)
!pip install spacy==3.2.1 --upgrade

# download the spacy en_core_web_sm model (3.2.0 version)
!python -m spacy download en_core_web_sm-3.2.0 --direct

Collecting spacy==3.2.1
  Using cached spacy-3.2.1.tar.gz (1.1 MB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Installing build dependencies ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [136]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

In [137]:
raw = "Charlie Brown is a beloved character from the comic strip Peanuts, created by Charles Schulz. Known for his kind heart and persistent optimism, Charlie often finds himself facing challenges, whether it's losing at baseball or missing his chance to kick a football. Despite his frequent failures, he continues to try again, embodying a sense of resilience and hope. His loyal friends, like Linus and Snoopy, support him, but he also deals with teasing from characters like Lucy. Charlie Brown’s relatable struggles and enduring spirit make him a timeless symbol of perseverance."
text = nlp(raw)

In [138]:
sentences = list(text.sents)

In [139]:
sentences[0]

Charlie Brown is a beloved character from the comic strip Peanuts, created by Charles Schulz.

In [140]:
# tokenization

# tokens in the first sentence
[a for a in sentences[0]]


[Charlie,
 Brown,
 is,
 a,
 beloved,
 character,
 from,
 the,
 comic,
 strip,
 Peanuts,
 ,,
 created,
 by,
 Charles,
 Schulz,
 .]

In [141]:
sentences[0][10].lemma_

'Peanuts'

In [142]:
x = 'Mr. Peanuts'
x = nlp(x)

x[1].lemma_


'Peanuts'

In [143]:
x

Mr. Peanuts

In [144]:
# just in case you are not familiar with list comprehension

# equivalent for loop

# tokens in all the sentences; list comprehension

In [145]:
# all tokens in the text


In [146]:
# a list of tokens in the first sentence and if they are stop words

In [147]:
# only listing the tokens that are not stop words in sentences[0]

In [148]:
# only listing the tokens that are not stop words or punctuation


In [149]:
# list the lowercased tokens and lemma

In [150]:
# just in case if you are interested in using Porter Stemming

In [151]:
# print each token and its part-of-speech (POS) tags in sentences[0]

In [152]:
# print each noun phrase (NP) and its starting & ending token in sentences[0]

In [153]:
# print each entity, its entity type, and its starting & ending token in sentences[0]
# GPE stands for Geo-Political Entity
# CARDINAL stands for Cardinal Entity

In [154]:
# you can also get the IOB-style entity tags for each token

## In-class Exercise 2
* Counting the most frequent nouns (NN, NNP, NNS, or NNPS) in the provided text.

In [155]:
from collections import Counter

# your solution

# write a list comprehension [] for all tokens' lemma
# use Counter([ your list comprehension ]).most_common()