# Capture and organize data in downloaded files 

We have downloaded some ```.txt``` files in our most recent scrape.

We will read those files using Python and export data to csv. (are you seeing a pattern yet?)

## First we scrape the files from a website

# Import libraries


In [None]:
## Install wget
!pip install wget

In [None]:
# import libraries
from bs4 import BeautifulSoup  ## scrape info from web pages
import requests ## get web pages from server
import time # time is required. we will use its sleep function
from random import randrange # generate random numbers
import wget # can put down documents, files from websites
import pandas as pd ## to export csv file
## in order to export our file to our computer drive, you need this only in Colab:
# from google.colab import files

# Assign url to scrape

In [None]:
# url to scrape: the page whose links we collect and download below
url = "https://sandeepmj.github.io/scrape-example-page/pages.html"

# Turn page into soup

In [None]:
## get the page, stop if the server did not return it, then print the soup in a readable (prettified) form
page = requests.get(url, timeout=30) ## give up after 30 seconds instead of waiting forever
page.raise_for_status() ## raise an error on a 4xx/5xx response so we never parse an error page
soup = BeautifulSoup(page.content,"html.parser")
print(soup.prettify())

# Get all ```txt``` files

### Find every list of txt files and store the results in ```txt_holder```

In [None]:
## find_all returns every <ul> tag whose class is "txts"; we look for links inside them next
txt_holder = soup.find_all("ul", class_="txts")
txt_holder

## Find all the ```a``` tags 

In [None]:
## collect the <a> tags from EVERY matching <ul>.
## starting with an empty list means the variable exists even when nothing matched,
## and extend() adds to it instead of replacing what earlier lists gave us.
txt_file_links = []
for txt_files in txt_holder:
  txt_file_links.extend(txt_files.find_all("a"))

txt_file_links

## What is missing from the URLs?

In [None]:
## the part that goes in front of each href to turn it into a full URL
base_url = "https://sandeepmj.github.io/scrape-example-page/"

## Create a list of the full URLs

Without all the ```html```

In [None]:
## urljoin resolves each href against base_url, so relative links get the base
## put in front of them and links that are already complete are left alone
from urllib.parse import urljoin

all_text_links = [
  urljoin(base_url, txt_file_link["href"])
  for txt_file_link in txt_file_links
  if txt_file_link.get("href") ## skip any <a> tag that has no href at all
]
all_text_links

In [None]:
## count the links so we know how many files to expect
len(all_text_links)

## Switch to os module lesson

## Download all the ```txt``` documents

In [None]:
import os
from pathlib import Path

## where am i?

In [None]:
pwd

## list what's in the directory

In [None]:
ls

In [None]:
## store downloaded files in an output folder
output_path = Path('downloaded_files/') ## the path, relative to the folder we are in right now
output_path.mkdir(exist_ok=True) ## create directory if it doesn't already exist

## confirm directory was created

In [None]:
ls

In [None]:
## cd into that folder; from here on, relative paths are resolved inside it
os.chdir(output_path)

### confirm we are in the correct directory

In [None]:
pwd

### We list its content to confirm it is empty

In [None]:
ls

## Run the downloader script

In [None]:
## download every link, announcing progress as we go
link_number = len(all_text_links)
## enumerate does the counting for us, starting at 1
for link_count, link in enumerate(all_text_links, start=1):
  print(f"Downloading link {link_count} of {link_number}")
  wget.download(link, "")
  ## pause a random 3 to 5 seconds between downloads;
  ## after the last file there is nothing left to wait for
  if link_count < link_number:
    snooze = randrange(3,6)
    print(f"Delaying for {snooze} seconds.")
    time.sleep(snooze)
 

### Confirm its contents

In [None]:
ls

In [None]:
## import the glob library for collecting specific files into a list
import glob 

In [None]:
## let's capture the files in a list
## glob gives us the NAMES of the files that match the pattern (as strings),
## not their contents; we open each name below to read what is inside.
## sorted() puts the names in alphabetical order.
myfiles = sorted(glob.glob('*.txt'))
myfiles

In [None]:
## open each file and show what kind of object open() hands back to us
for myfile in myfiles:
  with open(myfile) as textfile: ## "r" (read) is the mode open() uses when none is given
    file_object_type = type(textfile)
    print(file_object_type)

## We can use this ```<class '_io.TextIOWrapper'>``` object to read the file's actual contents

In [None]:
## let's see what the first line of each file contains

for myfile in myfiles:
  with open(myfile, "r") as textfile:
    print(f"Source: {myfile}")
    first_line = textfile.readline() ## readline() reads a single line from the file
    ## the backslash at the end of the next line continues the string,
    ## so the spaces that start the line after it are part of what gets printed
    print(f"\n{first_line} \
          \n\n**********")

In [None]:
## let's see what each entire file contains

for myfile in myfiles:
  with open(myfile, "r") as textfile:
    print(f"File number: {myfile}") ## the label says "number" but what we print is the file name
    all_text = textfile.read() ## read() returns the whole file as one string
    ## the backslash at the end of the next line continues the string,
    ## so the spaces that start the line after it are part of what gets printed
    print(f"\n{all_text} \
          \n\n**********")



In [None]:
## let's read all the lines of each file and put them into a list

for myfile in myfiles:
  with open(myfile) as textfile:
    print(f"File number: {myfile}")
    ## list() walks through the file one line at a time,
    ## which builds the same list that readlines() returns
    lines_list = list(textfile)
    print(lines_list)
    

In [None]:
## Now let's place clients and decisions into variables called client and decision

for myfile in myfiles:
  with open(myfile) as textfile:
    print(f"File number: {myfile}")
    lines_list = list(textfile)
    client = lines_list[0] ## the first line of the file
    decision = lines_list[2] ## the third line of the file
    ## one print call, with a newline between the two values
    print(client, decision, sep="\n")

In [None]:
## let's remove the word client and the extra line
for myfile in myfiles:
  with open(myfile) as textfile:
    print(f"File number: {myfile}")
    lines_list = list(textfile)
    client_line = lines_list[0]
    ## first drop the "Client: " label, then the newline character
    client = client_line.replace("Client: ","").replace("\n","")
    decision = lines_list[2]
    ## one print call: client, decision and the separator, each on its own line
    print(client, decision, "\n********************\n", sep="\n")


In [None]:
## We don't want an entire sentence – just what the decision was.
## One word is enough for the status of the lease: renew or terminate

for myfile in myfiles:
  with open(myfile) as textfile:
    print(f"File number: {myfile}")
    lines_list = list(textfile)
    client = lines_list[0].replace("Client: ","").replace("\n","")
    decision = lines_list[2]
    ## any sentence that does not contain "renew rental" is counted as terminate
    decision = "renew" if "renew rental" in decision else "terminate"
    print(client)
    print(decision)
    print("\n********************\n")

In [None]:
## We want to store in a list to export as CSV file
## each item in the list is a dict with the client, the one-word decision and the source file
decisions = []
for myfile in myfiles:
  ## naming the encoding means the result does not depend on the computer's default
  with open(myfile, "r", encoding="utf-8") as textfile:
    lines_list = textfile.readlines()

  ## we read line 1 (client) and line 3 (decision), so a shorter file
  ## is skipped with a message instead of stopping the loop with an IndexError
  if len(lines_list) < 3:
    print(f"Skipping {myfile}: expected at least 3 lines, found {len(lines_list)}")
    continue

  ## drop the "Client: " label, then any spaces or newline left around the name
  client = lines_list[0].replace("Client: ","").strip()
  ## any sentence that does not contain "renew rental" is counted as terminate
  decision = "renew" if "renew rental" in lines_list[2] else "terminate"

  decision_dict = {"client": client, "decision": decision, "source": myfile}
  decisions.append(decision_dict)
decisions

### Confirm where we are path-wise

In [None]:
pwd

In [None]:
## NOTE(review): this path looks specific to one computer. It is relative, so it is
## looked up from the folder we are currently in. We only move there when it really
## exists; otherwise we stay put instead of stopping with a FileNotFoundError.
project_dir = Path("Dropbox/coding/courses/instructor-practical-python/week_07/")
if project_dir.is_dir():
  os.chdir(project_dir)
else:
  print(f"{project_dir} not found here; staying in {Path.cwd()}")

### Create new results directory (note we come out of the downloaded_files folder first)

In [None]:
result_path = Path('../results/') ## the path: ".." means one level up from the folder we are in
result_path.mkdir(exist_ok=True) ## create directory if it doesn't already exist

In [None]:
### cd into our results folder:

In [None]:
## move into the results folder so files written with a plain file name land there
os.chdir(result_path)

### Confirm we are in the results folder

In [None]:
pwd

In [None]:
## Export as CSV

## use pandas to write to csv file
## we already imported pandas as pd
filename = "lease_decisions.csv" ## what our file name is
df = pd.DataFrame(decisions) ## we turn our list of dicts into a dataframe which we call df
print(df) ## preview the table; a bare df in the middle of a cell is not displayed
df.to_csv(filename, encoding='utf-8', index=False) ## export to csv as utf-8, leaving out the index column
print(f"{filename} is in your results folder!") ## a print out that tells us the file is ready