# BMIS-2542: Data Programming with Python 
##### Katz Graduate School of Business, Fall 2019


## Session 3: Working with Files
***

### Handling File Paths

In [3]:
import os # import os module that makes the rest of the commands platform (i.e., OS) independent

In [4]:
# get current working directory
workingDir = os.getcwd()
workingDir

'D:\\py_jupyter'

In [3]:
# making new directories
os.makedirs('Data')

In [4]:
# joining file paths to construct a new file path
dataPath = os.path.join(workingDir, 'Data')
dataPath

'D:\\py_jupyter\\Data'

In [5]:
# let's move the daffodils.txt inside the Data directory
filePath = os.path.join(workingDir, 'Data', 'daffodils.txt')
filePath

'D:\\py_jupyter\\Data\\daffodils.txt'

In [6]:
print('Directory Name:',os.path.dirname(filePath)) # directory name
print('File Name:', os.path.basename(filePath)) # base name

Directory Name: D:\py_jupyter\Data
File Name: daffodils.txt


In [7]:
# move some files inside your Data Folder
# get the list of files in the Data folder
os.listdir(dataPath)

[]

In [8]:
# check whether the file path is valid
os.path.exists(filePath)

False

In [9]:
fakePath = os.path.join(workingDir, 'Data2') # can use the relative path as well: fakePath = 'Data2'
print(fakePath)

# check whether the file path is valid
os.path.exists(fakePath)

D:\py_jupyter\Data2


False

In [10]:
# check whether the given path represents a directory
os.path.isdir(dataPath)

True

In [11]:
# check whether the given path represents a file
os.path.isfile(filePath)

False

### Reading and Writing Text Files
1. Call the `open()` function to return a File object.
2. Call the `read()` or `write()` method on the File object.
3. Close the file by calling the `close()` method on the File object.

#### Reading Files

In [5]:
# open a text file
path = os.path.join('Data', 'daffodils.txt')
print(path)

Data\daffodils.txt


In [6]:
daffodilsFile = open(path) # default mode is 'r' for reading

In [7]:
# use read() to read the entire contents of a file as a string 
daffodillsContent = daffodilsFile.read()
daffodilsFile.close() # close the file

In [8]:
print(daffodillsContent)




In [18]:
daffodilsFile = open(path)

# use the readline() method to read one line at a time
line1 = daffodilsFile.readline()

In [19]:
line1

''

In [20]:
line2 = daffodilsFile.readline()
line2

''

In [21]:
line3 = daffodilsFile.readline()
line3

''

In [22]:
line4 = daffodilsFile.readline()
line4

''

In [23]:
daffodilsFile.close()

In [1]:
daffodilsFile = open(path)

# use the readlines() method to get a list of string values from the file
lines = daffodilsFile.readlines()
daffodilsFile.close()

NameError: name 'path' is not defined

In [25]:
lines

[]

#### Writing to Files

In [26]:
# Start writing the poem "If" by RUDYARD KIPLING to if.txt
poem = open('if.txt', 'w') # opening in mode 'w' for writing - file will be overwritten if already exists
poem.write('If - By RUDYARD KIPLING\n') 
poem.write('*************************************************') 
poem.close()

In [27]:
poem = open('if.txt', 'a') # opening in the mode 'a' for appending
poem.write('\n') 
poem.write('If you can talk with crowds and keep your virtue,\n') 
poem.write('Or walk with Kings—nor lose the common touch,\n')
poem.write('If neither foes nor loving friends can hurt you,\n')
poem.write('If all men count with you, but none too much;\n')
poem.write('If you can fill the unforgiving minute\n')
poem.write('With sixty seconds’ worth of distance run,\n')
poem.write('Yours is the Earth and everything that’s in it,  \n')
poem.write('And—which is more—you’ll be a Man, my son!\n')
poem.close()

In [28]:
poem = open('if.txt')
content = poem.read()
poem.close()
print(content)

If - By RUDYARD KIPLING
*************************************************
If you can talk with crowds and keep your virtue,
Or walk with Kings—nor lose the common touch,
If neither foes nor loving friends can hurt you,
If all men count with you, but none too much;
If you can fill the unforgiving minute
With sixty seconds’ worth of distance run,
Yours is the Earth and everything that’s in it,  
And—which is more—you’ll be a Man, my son!



### Example
We need to write code that performs the following analysis for the file novel.txt ([source](https://www.gutenberg.org/ebooks/19337)) <br>**Make sure that `novel.txt` is in the same directory as this Jupyter Notebook.**
1. Print the first row 
2. Print the first 10 rows
3. What is the total number of rows?
4. How many rows are not empty?
5. How many rows containing the word **Christmas**?
6. Create a list of the words that appear in the file. What is the total number of words? print the first 20 words. 
7. Obtain the frequency of each word
8. Print the 10 most frequently used words
9. Export the frequencies to a file. Each row should contain a word followed by a comma, and the frequency of the word. Words should appear from the most to least frequent.

In [29]:
# (1) Print the First Row
path = 'novel.txt'
novel = open(path, 'r', encoding='utf8') 

line = novel.readline()
print(line)
novel.close()

﻿The Project Gutenberg EBook of A Christmas Carol, by Charles Dickens



In [4]:
# (2) Print the first 10 rows
novel = open('novel.txt', 'r', encoding='utf8')

for i in range(3):
    line = novel.readline()
    print(line)

novel.close()

﻿The Project Gutenberg EBook of A Christmas Carol, by Charles Dickens



This eBook is for the use of anyone anywhere at no cost and with



In [31]:
# (3) Get total number of rows reading one line at a time
novel = open('novel.txt', 'r', encoding='utf8')

line = novel.readline()
i = 0
while line:
    i = i+1
    line = novel.readline()
print(i)
novel.close()

3853


In [32]:
# (3) Get total number of rows by reading all lines at once
f = open('novel.txt', 'r', encoding='utf8')
x = f.readlines() 
print(len(x))
f.close()

print(type(x))

print(x[0])
print(x[1])
print(x[2])

3853
<class 'list'>
﻿The Project Gutenberg EBook of A Christmas Carol, by Charles Dickens



This eBook is for the use of anyone anywhere at no cost and with



In [33]:
# (4) Get the non-empty rows
novel = open('novel.txt', 'r', encoding='utf8')

i = 0
for line in novel:     
    if len(line.strip())>0:        
        i = i+1
print(i)
novel.close()

2979


In [34]:
# (5) How many rows contain the word "Christmas"
novel = open('novel.txt', 'r', encoding='utf8')

i = 0
for line in novel:
    if "Christmas" in line:
        i = i+1
print(i)
novel.close()

90


In [35]:
# (6) Obtain the list of words in the file and print the first twenty
novel = open('novel.txt', 'r', encoding='utf8')

words = [] #list of words
for line in novel:
    words = words + line.split() 
print(len(words))
print(words[:20])
novel.close()

32326
['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'A', 'Christmas', 'Carol,', 'by', 'Charles', 'Dickens', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere']


In [None]:
# (7) Get frequency of each word
# map() function returns a list of the results after applying the given function to each item of a given iterable (list, tuple etc.)
# lambda notation is for an anonymous function
words_lower=list(map(lambda x:x.lower(),words)) 
words_lower[:20]

freq = dict()
for w in words_lower:
    if w in freq:
        freq[w]+=1
    else:
        freq[w]=1
print(freq)

In [None]:
# (8) Get the 10 most frequenty used words
freqList = [(freq[d],d) for d in freq.keys()]
freqList[:10]

freqList.sort()
freqList[:10]

freqList.sort(reverse = True)
freqList[:10]

In [None]:
#(9) export frequencies to file
output=open('word_freq.csv','w', encoding='utf8')
for freq,word in freqList:
    output.write(word + ',' + str(freq) + '\n')     
output.close()