In [1]:
# f-string formatting examples.
# Sample table: one header row followed by (author, topic, pages) records.
books = [
    ('Author', 'Topic', 'Pages'),
    ('Twain', 'Rafting in water alone', 601),
    ('Feynman', 'Physics', 95),
    ('Hamilton', 'Mythology', 144),
]

In [2]:
# Position 0 of every tuple holds the author; the header row is printed too.
for record in books:
    print(f"Author is {record[0]}")

Author is Author
Author is Twain
Author is Feynman
Author is Hamilton


In [3]:
# Unpack each 3-tuple into named fields; only the author is printed.
for author, topic, pages in books:
    print(f"Author is {author}")

Author is Author
Author is Twain
Author is Feynman
Author is Hamilton


In [4]:
# Plain interpolation: fields are separated by single spaces only, so the
# columns do not line up. Padding is needed to fix this.
for author, topic, pages in books:
    print(f"{author} {topic} {pages}")

Author Topic Pages
Twain Rafting in water alone 601
Feynman Physics 95
Hamilton Mythology 144


In [5]:
# Give every column a minimum width. This is still not quite right: strings are
# left-aligned by default while integers are right-aligned, so the 'Pages'
# header (a string) sits on the opposite side of its column from the page counts.
for author, topic, pages in books:
    print(f"{author:10} {topic:40} {pages:10}")

Author     Topic                                    Pages     
Twain      Rafting in water alone                          601
Feynman    Physics                                          95
Hamilton   Mythology                                       144


In [6]:
# '>' forces right alignment, so the 'Pages' header now lines up with the
# (already right-aligned) integer page counts.
for author, topic, pages in books:
    print(f"{author:10} {topic:40} {pages:>10}")

Author     Topic                                         Pages
Twain      Rafting in water alone                          601
Feynman    Physics                                          95
Hamilton   Mythology                                       144


In [7]:
from datetime import datetime

# Fixed example date (5 July 2022) used by the date-formatting demo.
# NOTE(review): the name `time` would shadow the stdlib `time` module if that
# module is ever imported in this notebook.
time = datetime(2022, 7, 5)

In [8]:
# Format-code cheat sheet: https://strftime.org/
# The date is spelled out as %m/%d/%y instead of the shorthand %D: %D is passed
# straight to the platform's C library and is not one of the format codes the
# Python documentation guarantees to be portable. The output is identical.
print(f"Today : {time:%m/%d/%y}")

Today : 07/05/22


In [9]:
# Create a text file with the %%writefile cell magic
%%writefile test.txt
Hello, this is the quick test file.
This is the second line of my test file.

Writing test.txt


In [10]:
# No mode argument is given, so the file is opened in the default mode:
# 'r' (reading) combined with 't' (text).
myfile = open('test.txt')

In [11]:
# read() returns the entire file content as a single string.
myfile.read()

'Hello, this is the quick test file.\nThis is the second line of my test file.'

In [12]:
# Calling .read() a second time returns an empty string: the first read left
# the file cursor at the end of the file, so there is nothing left to read.
myfile.read()

''

In [13]:
# Move the cursor back to the start of the file (seek() returns the new position).
myfile.seek(0)

0

In [14]:
myfile.read()

'Hello, this is the quick test file.\nThis is the second line of my test file.'

In [15]:
# Rewind, then keep the whole file in a variable so it can be reused
# without reading the file again.
myfile.seek(0)
content = myfile.read()

In [16]:
content

'Hello, this is the quick test file.\nThis is the second line of my test file.'

In [17]:
print(content)

Hello, this is the quick test file.
This is the second line of my test file.


In [18]:
# We should ALWAYS close the file once we're done working with it.
myfile.close()

In [19]:
myfile = open('test.txt')

In [20]:
# readlines() returns the file as a list with one string per line.
my_lines = myfile.readlines()

In [21]:
# Print the first whitespace-separated word of every line.
for line in my_lines:
    words = line.split()
    # A blank (or whitespace-only) line splits into an empty list; skip it
    # instead of failing with IndexError on words[0].
    if words:
        print(words[0])

Hello,
This


In [22]:
myfile.close()

In [23]:
# How to write to a file: the mode string passed to open() selects the behaviour.
"""
'r'	open for reading (default)
'w'	open for writing, truncating the file first
'x'	create a new file and open it for writing
'a'	open for writing, appending to the end of the file if it exists
'b'	binary mode
't'	text mode (default)
'+'	open a disk file for updating (reading and writing)
'U'	universal newline mode (deprecated)
"""
# NOTE: 'U' is listed above as deprecated; it was removed entirely in Python 3.11.
# 'w+' opens the file for both reading and writing, and truncates it first.
myfile = open('test.txt', 'w+')

In [24]:
# The file is empty now: opening it with 'w+' (read AND write) truncated the
# existing content.
myfile.read()

''

In [25]:
# write() returns the number of characters written (18 here).
myfile.write('My brand new text.')

18

In [26]:
myfile.seek(0)

0

In [27]:
myfile.read()

'My brand new text.'

In [28]:
myfile.close()

In [29]:
# How to append to a file: 'a+' opens it for appending and reading. The file
# does not have to exist beforehand; it is created if missing (the default
# mode 'r', in contrast, cannot create a file).
myfile = open('test.txt', 'a+')

In [30]:
# With 'a+' we don't have to worry about overwriting: the text is added at the
# end of the existing content. Note that write() does not add a newline by
# itself, so the new text continues on the same line.
myfile.write('Another new text to the file')

28

In [31]:
# Rewind and read back everything, including the appended text.
myfile.seek(0)
appended_content = myfile.read()
# Close the 'a+' handle: it is not used again, and `myfile` is later rebound to
# a different file, which would otherwise leave this handle open.
myfile.close()
# Last expression of the cell, so the content is still displayed as the output.
appended_content

'My brand new text.Another new text to the file'

In [32]:
# Closing files by hand with .close() is easy to forget. A context manager
# (the `with` statement) closes the file automatically when the block ends.

with open('test.txt', 'r') as newfile:
  context = newfile.readlines()

In [33]:
context

['My brand new text.Another new text to the file']

-> Working with PDF files.

Note: not all PDFs contain text that can be extracted.

In [34]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading PyPDF2-2.4.2-py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 5.0 MB/s 
Installing collected packages: PyPDF2
Successfully installed PyPDF2-2.4.2


In [35]:
import PyPDF2 as pdf

In [36]:
# 'rb' is short for "read binary": 'r' opens for reading, 'b' selects binary mode.
myfile = open('US_Declaration.pdf', 'rb')

In [37]:
# PdfReader is the current name of the reader class; the PdfFileReader alias is
# deprecated in PyPDF2 2.x and removed in 3.0.
pdf_reader = pdf.PdfReader(myfile)

In [38]:
# Number of pages in the document. `len(reader.pages)` replaces the deprecated
# `numPages` attribute (removed in PyPDF2 3.0).
len(pdf_reader.pages)

5

In [39]:
# Pages are zero-indexed, so index 0 is the first page. `reader.pages[i]`
# replaces the deprecated `getPage(i)` (removed in PyPDF2 3.0).
page1 = pdf_reader.pages[0]
page1.extract_text()

" Declaration of Independence\nIN CONGRESS, July 4, 1776.  \nThe unanimous Declaration of the thirteen united States of America, \nWhen in the Course of human events, it becomes necessary for one people to dissolve the\npolitical bands which have connected them with another, and to assume among the powers of the\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\nwhich impel them to the separation. \nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\nof Happiness.— \x14That to secure these rights, Governments are instituted among Men, deriving\ntheir just powers from the consent of the governed,—  \x14That whenever any Form of Government\nbecomes destructive of these ends, it is the Right of the People t

In [40]:
print(page1.extract_text())

 Declaration of Independence
IN CONGRESS, July 4, 1776.  
The unanimous Declaration of the thirteen united States of America, 
When in the Course of human events, it becomes necessary for one people to dissolve the
political bands which have connected them with another, and to assume among the powers of the
earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle
them, a decent respect to the opinions of mankind requires that they should declare the causes
which impel them to the separation. 
We hold these truths to be self-evident, that all men are created equal, that they are endowed by
their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit
of Happiness.— That to secure these rights, Governments are instituted among Men, deriving
their just powers from the consent of the governed,—  That whenever any Form of Government
becomes destructive of these ends, it is the Right of the People to alter or to aboli

In [41]:
mytext = page1.extract_text()

In [42]:
myfile.close()

In [43]:
# Copy the first page of the original PDF into a new PDF file.
# Start by re-opening the source document in binary read mode.
f = open('US_Declaration.pdf', 'rb')

In [44]:
# PdfReader is the current name of the reader class; the PdfFileReader alias is
# deprecated in PyPDF2 2.x and removed in 3.0.
pdf_reader = pdf.PdfReader(f)

In [45]:
# First page (index 0). `reader.pages[i]` replaces the deprecated `getPage(i)`.
page1 = pdf_reader.pages[0]

In [46]:
# PdfWriter is the current name of the writer class; the PdfFileWriter alias is
# deprecated in PyPDF2 2.x and removed in 3.0.
pdf_writer = pdf.PdfWriter()

In [47]:
# `add_page` replaces the deprecated `addPage` (removed in PyPDF2 3.0).
# The trailing ';' keeps the cell silent even if the call returns a value.
pdf_writer.add_page(page1);

In [48]:
# 'wb' = write in binary mode; this is the destination file for the copied page.
pdf_output = open('new_file.pdf', 'wb')

In [49]:
# Write the pages collected in the writer to the open output file.
pdf_writer.write(pdf_output)

In [50]:
pdf_output.close()

In [51]:
f.close()

In [52]:
# Re-open the newly written file to check its contents.
copied_page = open('new_file.pdf', 'rb')
# PdfReader is the current name of the reader class; the PdfFileReader alias is
# deprecated in PyPDF2 2.x and removed in 3.0.
pdf_reader = pdf.PdfReader(copied_page)

In [53]:
# The new file should contain exactly the one copied page.
# `len(reader.pages)` replaces the deprecated `getNumPages()`.
len(pdf_reader.pages)

1

In [54]:
# Print the text of the copied page to confirm it matches the original.
# `reader.pages[i]` replaces the deprecated `getPage(i)`.
page1 = pdf_reader.pages[0]
print(page1.extract_text())
# The file handle is not needed after the text has been extracted; close it so
# it does not stay open for the rest of the session.
copied_page.close()

 Declaration of Independence
IN CONGRESS, July 4, 1776.  
The unanimous Declaration of the thirteen united States of America, 
When in the Course of human events, it becomes necessary for one people to dissolve the
political bands which have connected them with another, and to assume among the powers of the
earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle
them, a decent respect to the opinions of mankind requires that they should declare the causes
which impel them to the separation. 
We hold these truths to be self-evident, that all men are created equal, that they are endowed by
their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit
of Happiness.— That to secure these rights, Governments are instituted among Men, deriving
their just powers from the consent of the governed,—  That whenever any Form of Government
becomes destructive of these ends, it is the Right of the People to alter or to aboli

Regular Expressions allow for pattern searching in a text document.

`\d` matches a single digit and `{n}` repeats the preceding item exactly n times; for example, r'\d{3}-\d{2}' matches 789-52.

In [55]:
text = "The phone number of the agent is 507-23-4566. Call soon!"

In [56]:
# Here we already know the exact phone number (507-23-4566), so a plain
# substring test with `in` is enough to find it.
"507-23-4566" in text

True

In [57]:
# But what if we don't know the exact number? Then we search for its pattern
# with the regular-expression module.
import re

In [60]:
# Three digits, a hyphen, two digits, a hyphen, four digits.
pattern = r'\d{3}-\d{2}-\d{4}'
# search() returns a Match object for the first occurrence of the pattern.
re.search(pattern, text)

<re.Match object; span=(33, 44), match='507-23-4566'>

In [61]:
# Keep the Match object; span() gives the (start, end) indices of the match in `text`.
match = re.search(pattern, text)
match.span()

(33, 44)

In [62]:
# group() with no argument returns the whole matched text.
match.group()

'507-23-4566'

In [63]:
# Same pattern, but each part is wrapped in parentheses to form a capture
# group, so the parts can be retrieved individually from the match.
pattern = r'(\d{3})-(\d{2})-(\d{4})'
match = re.search(pattern, text)

In [65]:
# With the parentheses in place, each group can be fetched by its number;
# group(1) is the first parenthesised part of the pattern.
match.group(1)

'507'

In [67]:
# '.' matches exactly ONE character (any character except a newline), so '.at'
# finds three-character strings ending in "at". That is why "splat" yields only
# 'lat': a single dot cannot cover the leading "sp".
re.findall(r'.at', "The cat in the hat sat splat.")

['cat', 'hat', 'sat', 'lat']