# Reading Large files and searching for strings within the files

> While reading large files, an efficient way is to read line by line instead of fetching the data all at once.

> Use the `readline()` method of the file handler to fetch one line at a time

In [11]:
import os,sys
import pprint as pp

# Work from the project root so the report destination is resolved relative to it
os.chdir("/Users/Riley/Documents/Research/User_Input_Module/")
cwd_path = os.getcwd()
# os.path.join supplies the path separator instead of a hard-coded "/"
dest_path = os.path.join(cwd_path, "Data")

# Create the Data folder in the repo if it is missing; exist_ok=True makes this
# one call that also succeeds when the folder is already there
os.makedirs(dest_path, exist_ok=True)

# Target directory with all of the log files in it
target_dir = "/Users/Riley/Documents/Research/User_Input_Module/Gaussian_Outputs_Keywords"

# Stop early with a clear message when the target directory cannot be reached
if os.path.isdir(target_dir):
    print("Connection established")
else:
    sys.exit("Failed to connect to target directory: %s" % (target_dir))

Connection established


In [12]:
# Gather the full paths of every ".log" file below the target directory.
# os.walk also descends into sub-directories, so nested log files are included.
listoffiles = list()
for (dirpath, dir_names, filenames) in os.walk(target_dir):
    # os.path.splitext isolates the real extension, so a file that is merely
    # named "log" (no extension at all) is no longer picked up by mistake
    listoffiles += [
        os.path.join(dirpath, file)
        for file in filenames
        if os.path.splitext(file)[1] == ".log"
    ]

pp.pprint(listoffiles)

['/Users/Riley/Documents/Research/User_Input_Module/Gaussian_Outputs_Keywords/AllylChlorideClTS.log',
 '/Users/Riley/Documents/Research/User_Input_Module/Gaussian_Outputs_Keywords/Pd2p-ene-CO-transcis-P-fromciscis.log',
 '/Users/Riley/Documents/Research/User_Input_Module/Gaussian_Outputs_Keywords/Cl.log',
 '/Users/Riley/Documents/Research/User_Input_Module/Gaussian_Outputs_Keywords/AllylChloride.log']


In [13]:
# Path of the log file to read, built from the target directory defined earlier
file_path = os.path.join(target_dir, "AllylChloride.log")

# Open the file manually; we are now responsible for closing it ourselves
fileHandler = open(file_path, "r")

try:
    while True:
        # Get the next line from the file
        line = fileHandler.readline()
        # readline() returns an empty string only at end of file
        # (a blank line in the file still carries its newline character)
        if not line:
            break
        # strip() drops the trailing newline and any surrounding whitespace
        print(line.strip())
finally:
    # Close the file even when reading or printing raised an exception
    fileHandler.close()

Entering Gaussian System, Link 0=g16
Input=AllylChloride.gjf
Output=AllylChloride.log
Initial command:
/home1/apps/gaussian/16rA.03/g16/l1.exe "/scratch/08052/tg873513/Gau-53601.inp" -scrdir="/scratch/08052/tg873513/"
Entering Link 1 = /home1/apps/gaussian/16rA.03/g16/l1.exe PID=     53602.

Copyright (c) 1988,1990,1992,1993,1995,1998,2003,2009,2016,
Gaussian, Inc.  All Rights Reserved.

This is part of the Gaussian(R) 16 program.  It is based on
the Gaussian(R) 09 system (copyright 2009, Gaussian, Inc.),
the Gaussian(R) 03 system (copyright 2003, Gaussian, Inc.),
the Gaussian(R) 98 system (copyright 1998, Gaussian, Inc.),
the Gaussian(R) 94 system (copyright 1995, Gaussian, Inc.),
the Gaussian 92(TM) system (copyright 1992, Gaussian, Inc.),
the Gaussian 90(TM) system (copyright 1990, Gaussian, Inc.),
the Gaussian 88(TM) system (copyright 1988, Gaussian, Inc.),
the Gaussian 86(TM) system (copyright 1986, Carnegie Mellon
University), and the Gaussian 82(TM) system (copyright 1983,
Carne

# Read file line by line with context manager (with block)

> When we open a file, we need to close it too
> 
> As the code gets bigger, the chances of forgetting a close() call somewhere increase; the file object then keeps holding on to resources, and other hard-to-trace errors can occur
> 
> 
> If we forget to close it, the file will only be closed automatically when the last reference to the file handler is destroyed, e.g. at the end of a function
> 
> But what if we had a large function that isn't going to end soon, even though the file-related work is already complete?
> 
> We can use context manager to automatically cleanup things like file closure etc.

**Key Take-Away:** When control comes out of the *with* block, the file will be automatically closed. Even if it came out of the block because of an exception.

### "with statement" Assumptions

1. To use a file object, we must first open it and finally close it
2. The file exists at the given path
   - If not, get `FileNotFoundError: [Errno 2] No such file or directory:`

### "with statement" Benefits

1. When the block ends, it will automatically close the file
2. It reduces the number of lines of code
3. It reduces the probability of a bug

In [14]:
# The context manager owns the handle: it is closed as soon as the block is left
with open(file_path, "r") as fileHandler:
    # Iterating over the handle yields the file one line at a time
    for line in fileHandler:
        # strip() removes the trailing newline together with any other
        # leading or trailing whitespace before the line is shown
        stripped = line.strip()
        print(stripped)

Entering Gaussian System, Link 0=g16
Input=AllylChloride.gjf
Output=AllylChloride.log
Initial command:
/home1/apps/gaussian/16rA.03/g16/l1.exe "/scratch/08052/tg873513/Gau-53601.inp" -scrdir="/scratch/08052/tg873513/"
Entering Link 1 = /home1/apps/gaussian/16rA.03/g16/l1.exe PID=     53602.

Copyright (c) 1988,1990,1992,1993,1995,1998,2003,2009,2016,
Gaussian, Inc.  All Rights Reserved.

This is part of the Gaussian(R) 16 program.  It is based on
the Gaussian(R) 09 system (copyright 2009, Gaussian, Inc.),
the Gaussian(R) 03 system (copyright 2003, Gaussian, Inc.),
the Gaussian(R) 98 system (copyright 1998, Gaussian, Inc.),
the Gaussian(R) 94 system (copyright 1995, Gaussian, Inc.),
the Gaussian 92(TM) system (copyright 1992, Gaussian, Inc.),
the Gaussian 90(TM) system (copyright 1990, Gaussian, Inc.),
the Gaussian 88(TM) system (copyright 1988, Gaussian, Inc.),
the Gaussian 86(TM) system (copyright 1986, Carnegie Mellon
University), and the Gaussian 82(TM) system (copyright 1983,
Carne

# Get a list of lines from the file with context manager (with block)

> We can still iterate over all the lines in a file and create a list of lines

In [15]:
# Collect every line of the file (newline characters included) into a list;
# the comprehension iterates over the handle line by line inside the with block
with open(file_path, "r") as myFile:
    listOfLines = [line for line in myFile]

pp.pprint(listOfLines)

[' Entering Gaussian System, Link 0=g16\n',
 ' Input=AllylChloride.gjf\n',
 ' Output=AllylChloride.log\n',
 ' Initial command:\n',
 ' /home1/apps/gaussian/16rA.03/g16/l1.exe '
 '"/scratch/08052/tg873513/Gau-53601.inp" -scrdir="/scratch/08052/tg873513/"\n',
 ' Entering Link 1 = /home1/apps/gaussian/16rA.03/g16/l1.exe PID=     53602.\n',
 '  \n',
 ' Copyright (c) 1988,1990,1992,1993,1995,1998,2003,2009,2016,\n',
 '            Gaussian, Inc.  All Rights Reserved.\n',
 '  \n',
 ' This is part of the Gaussian(R) 16 program.  It is based on\n',
 ' the Gaussian(R) 09 system (copyright 2009, Gaussian, Inc.),\n',
 ' the Gaussian(R) 03 system (copyright 2003, Gaussian, Inc.),\n',
 ' the Gaussian(R) 98 system (copyright 1998, Gaussian, Inc.),\n',
 ' the Gaussian(R) 94 system (copyright 1995, Gaussian, Inc.),\n',
 ' the Gaussian 92(TM) system (copyright 1992, Gaussian, Inc.),\n',
 ' the Gaussian 90(TM) system (copyright 1990, Gaussian, Inc.),\n',
 ' the Gaussian 88(TM) system (copyright 1988, Gaus

# Read the contents of a file line by line using the *with* context manager and while loop

> We can combine the *with* context manager and while loop to iterate over all the lines in a file

In [16]:
# The with block closes the file for us; the while loop drives the reading
with open(file_path, "r") as fileHandler:
    while True:
        # Fetch the next line on every pass through the loop
        line = fileHandler.readline()
        # An empty string marks the end of the file, so stop reading there
        if not line:
            break
        print(line.strip())

Entering Gaussian System, Link 0=g16
Input=AllylChloride.gjf
Output=AllylChloride.log
Initial command:
/home1/apps/gaussian/16rA.03/g16/l1.exe "/scratch/08052/tg873513/Gau-53601.inp" -scrdir="/scratch/08052/tg873513/"
Entering Link 1 = /home1/apps/gaussian/16rA.03/g16/l1.exe PID=     53602.

Copyright (c) 1988,1990,1992,1993,1995,1998,2003,2009,2016,
Gaussian, Inc.  All Rights Reserved.

This is part of the Gaussian(R) 16 program.  It is based on
the Gaussian(R) 09 system (copyright 2009, Gaussian, Inc.),
the Gaussian(R) 03 system (copyright 2003, Gaussian, Inc.),
the Gaussian(R) 98 system (copyright 1998, Gaussian, Inc.),
the Gaussian(R) 94 system (copyright 1995, Gaussian, Inc.),
the Gaussian 92(TM) system (copyright 1992, Gaussian, Inc.),
the Gaussian 90(TM) system (copyright 1990, Gaussian, Inc.),
the Gaussian 88(TM) system (copyright 1988, Gaussian, Inc.),
the Gaussian 86(TM) system (copyright 1986, Carnegie Mellon
University), and the Gaussian 82(TM) system (copyright 1983,
Carne

# Opening Multiple Files in a single with statement

> We can open two files at once using a single with statement
> We will read from `file_path` and write to `outfile_path`

**Key Take-Aways:** Since we used the *with* statement, close() will automatically be called on both file objects when the execution block ends


In [17]:
# Destination of the copy, placed next to the log files in the target directory
outfile_path = os.path.join(target_dir, "SampleFile.txt")

# Read from file_path and write to outfile_path.
# The input is opened first, so a missing input file raises before the output
# file is created or truncated.
with open(file_path, "r") as infile, open(outfile_path, "w") as outfile:
    # Copy line by line instead of read()-ing everything at once, so a large
    # log file never has to fit in memory as a whole
    for line in infile:
        outfile.write(line)
    # Both files are closed automatically when the with statement block ends