In [1]:

#import subprocess
#from subprocess import call


import sys # unix commands

from pathlib import Path
import zipfile

In [2]:
import math
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

%matplotlib inline

In [3]:
import urllib.request
import requests

## Download Files

Social Security Name data set
US Social Security Baby Name catalog https://www.ssa.gov/oact/babynames
This catalog records the names given to babies born since the year 1880.The datat can be downloaded through the link https://www.ssa.gov/oact/babynames/limits.html

In [4]:
# define the url
names_url = 'https://www.ssa.gov/oact/babynames/names.zip'

In [6]:
urllib.request.urlretrieve(names_url,'../Data/names.zip') 

('../Data/names.zip', <http.client.HTTPMessage at 0x7f1cfd68be20>)

In [7]:
# create HTTP response object. the url of the content to be downloaded is defined as names_url
# send a HTTP request to the server and save
# the HTTP response in a response object called r

r = requests.get(names_url) 



# we save the received content as a zip file in binary format
# the we write the contents of the response (r.content)
# to a new file on disk in binary mode

with open("../Data/names.zip",'wb') as f:
    f.write(r.content)

### Checking the Content of the Downloads
To confirm that the download has completed correctly, we check the locations where we have specified for the files to be downloaded. This can be done using normal unix commands and is made possible by the `sys` library which has been previously imported

In [8]:
ls ../Data/


[0m[01;34mcredit_card_fraud_detection[0m/  [01;34mmelbourne-housing[0m/  [01;34mnames[0m/  [01;31mnames2.zip[0m  [01;31mnames.zip[0m


### Uncompress the downloaded zip file
Since the downloaded data is contained in a zip file, we will need to uncompress the zip file to gain access to its contents. Here we use the zipfile module to uncompress the zip file.

In [9]:
zipfile.ZipFile('../Data/names.zip').extractall('../Data/names/')

Now let's check to confirm that zip file has been uncompressed and its contents are in the specified director

In [10]:
ls ../Data/names

NationalReadMe.pdf  yob1908.txt  yob1937.txt  yob1966.txt  yob1995.txt
yob1880.txt         yob1909.txt  yob1938.txt  yob1967.txt  yob1996.txt
yob1881.txt         yob1910.txt  yob1939.txt  yob1968.txt  yob1997.txt
yob1882.txt         yob1911.txt  yob1940.txt  yob1969.txt  yob1998.txt
yob1883.txt         yob1912.txt  yob1941.txt  yob1970.txt  yob1999.txt
yob1884.txt         yob1913.txt  yob1942.txt  yob1971.txt  yob2000.txt
yob1885.txt         yob1914.txt  yob1943.txt  yob1972.txt  yob2001.txt
yob1886.txt         yob1915.txt  yob1944.txt  yob1973.txt  yob2002.txt
yob1887.txt         yob1916.txt  yob1945.txt  yob1974.txt  yob2003.txt
yob1888.txt         yob1917.txt  yob1946.txt  yob1975.txt  yob2004.txt
yob1889.txt         yob1918.txt  yob1947.txt  yob1976.txt  yob2005.txt
yob1890.txt         yob1919.txt  yob1948.txt  yob1977.txt  yob2006.txt
yob1891.txt         yob1920.txt  yob1949.txt  yob1978.txt  yob2007.txt
yob1892.txt         yob1921.txt  yob1950.txt  yob1979.txt  yob20

## Reading the Data

### Read Single File

try to read one of the files so that we can preview the data and have see what it looks like

In [11]:

names2020_1 = pd.read_csv('../Data/names/yob2020.txt')
names2020_1.head()

Unnamed: 0,Olivia,F,17641
0,Emma,F,15656
1,Ava,F,13160
2,Charlotte,F,13065
3,Sophia,F,13036
4,Amelia,F,12767


##### Adding a Header Row
From the previous step we see that the data does not contain a header row. We will need to add a header row to make our analysis easier. We do this in the next step by using the `names` parameter of `read_csv` to pass a list of column names while reading the file

In [11]:

names2020_2 = pd.read_csv('../Data/names/yob2020.txt', names=['name','sex','number'])
names2020_2.head()

Unnamed: 0,name,sex,number
0,Olivia,F,17535
1,Emma,F,15581
2,Ava,F,13084
3,Charlotte,F,13003
4,Sophia,F,12976


##### Add a helper column for descriptive purposes
Although we know from the file name that this data is for the year 2020, there is nothing in the data itself to indicate this. We can therefore add a helper column to show this. We use the `assign` method of `read_csv` to do this as follows: 

In [12]:
names2020_3 = pd.read_csv('../Data/names/yob2020.txt', names=['name','sex','number']).assign(year=2020)
names2020_3.head()

Unnamed: 0,name,sex,number,year
0,Olivia,F,17535,2020
1,Emma,F,15581,2020
2,Ava,F,13084,2020
3,Charlotte,F,13003,2020
4,Sophia,F,12976,2020


### Read All Files in the Folder
We have seen how we can read each individual file in the folder into its own dataframe. In practice, when faced with a situation where we need to read several files in a folder into a data frame; using a reading each file individually is not very convenient. We want a way to read all the files into a single data frame, without requiring a `read_csv` statement for each of the files. In effect, we do not want to explicitly specify the name of each individual file, as that will require several lines of code (as many as there are files), just to read the data.

The general idea therefore is as follows:

<ol>
  <li> Create a list that contains the names of all the files in the folder </li>
  <li> Create another list that will contain data frames
  <li> Iterate over the list of file names, reading the content of each file into a data frame and then adding each data frame to the list of data frames </li>
  <li> Merge all data frames in the list of data frames into a single data frams </li>
</ol>

We will do this below:

### Using a for loop

First we specify the location/folder that contains the files that contains the files that we are interested in.

In [14]:
folder = Path('../Data/names/').absolute()
# [f for f in folder.iterdir() iff f.name.startswith('yob') ]

Next, we create a list that contains the paths to each of the files in the directory

In [16]:
list_of_files= []
for f in folder.iterdir():
    if ( f.name.startswith('yob') and f.name.endswith('txt') ):
        list_of_files.append(f)

Next, we create a **list_of_dataframes**; then iterate through the **list_of_files**, reading each file into a dataframe and adding the respective dataframes to the **list_of_dataframes**.

Recall that the data in the files does not have headers, so we use the `names` parameter of `read_csv` to pass a list of column names while reading the file.

`list_of_dfs = []
for name_of_file in list_of_files:
    list_of_dfs.append( pd.read_csv(name_of_file, names=['name','sex','number']) )`
    
The code snippet shown above will successfully read the files into dataframes and append each dataframe to the **list_of_dataframes**; however, as there is no column to tell us what year is represented by each record, we use a helper column to show this. The helper column can be added while reading the data by using the `assign` method of `read_csv`. Additionally, we can add another helper column to indicate the name of the file from which the data was read.

In [26]:
list_of_dataframes = []
for idx,name_of_file in enumerate(list_of_files,1):
    list_of_dataframes.append( pd.read_csv(name_of_file, names=['name','sex','number']).
                       assign(file_num = f'file-{idx}', file_name=name_of_file.name[0::]) )
    
len(list_of_dataframes)

141

#### Concatenate the dataframes into one dataframes

In [27]:
dfs_in_one = pd.concat(list_of_dataframes)
dfs_in_one.head()

Unnamed: 0,name,sex,number,file_num,file_name
0,Emily,F,23944,file-1,yob2005.txt
1,Emma,F,20348,file-1,yob2005.txt
2,Madison,F,19571,file-1,yob2005.txt
3,Abigail,F,15751,file-1,yob2005.txt
4,Olivia,F,15694,file-1,yob2005.txt


In [28]:
# dfs_in_one[dfs_in_one[file_name=='yob2020.txt']]

dfs_in_one.loc[dfs_in_one['file_name']=='yob2020.txt']

Unnamed: 0,name,sex,number,file_num,file_name
0,Olivia,F,17535,file-94,yob2020.txt
1,Emma,F,15581,file-94,yob2020.txt
2,Ava,F,13084,file-94,yob2020.txt
3,Charlotte,F,13003,file-94,yob2020.txt
4,Sophia,F,12976,file-94,yob2020.txt
...,...,...,...,...,...
31266,Zykell,M,5,file-94,yob2020.txt
31267,Zylus,M,5,file-94,yob2020.txt
31268,Zymari,M,5,file-94,yob2020.txt
31269,Zyn,M,5,file-94,yob2020.txt


### Using a list comprehension

Another way to create a list of files in the folder is to use a list comprehension. This is my preferred method, as it is very concise

#### Create list of files using a comprehension


In [20]:
list_of_files = [f for f in folder.iterdir() if f.name.startswith('yob')]

Below, we employ the use of a list comprehension to help in reading each of the files in the **list_of_files** created above, while creating a **list_of_dataframes**. Each dataframe in the **list_of_dataframes** corresponds to a file in the list_of_files

In [21]:

    # the assign() method adds a helper column
list_of_dfs = [
    pd.read_csv(csv_file) for csv_file in (list_of_files)
]

In [22]:
list_of_dfs

[           Emily  F  23948
 0           Emma  F  20349
 1        Madison  F  19572
 2        Abigail  F  15753
 3         Olivia  F  15698
 4       Isabella  F  15192
 ...          ... ..    ...
 32550    Zymiere  M      5
 32551     Zyrell  M      5
 32552     Zyrian  M      5
 32553     Zyshon  M      5
 32554  Zytavious  M      5
 
 [32555 rows x 3 columns],
        Jennifer  F  56779
 0      Michelle  F  33159
 1          Lisa  F  32912
 2      Kimberly  F  30703
 3           Amy  F  26238
 4        Angela  F  25899
 ...         ... ..    ...
 15292    Zawdie  M      5
 15293       Zel  M      5
 15294      Zeno  M      5
 15295     Zenon  M      5
 15296   Zigmond  M      5
 
 [15297 rows x 3 columns],
           Mary  F  17580
 0        Helen  F   7579
 1     Margaret  F   6713
 2         Anna  F   5575
 3         Ruth  F   5573
 4      Dorothy  F   4967
 ...        ... ..    ...
 3942      Wray  M      5
 3943     Wyman  M      5
 3944       Zeb  M      5
 3945      Zeke  M    

As demonstrated previously, the code snippet shown above will successfully read the files into dataframes and append each dataframe to the **list_of_dfs**; however, as there is no column to tell us what year is represented by each record, 

We use a helper column to indicate the year that is represented by each record. The helper column can be added while reading the data by using the `assign` method of `read_csv`. We also add another helper column to indicate the name of the file from which the data was read.

Create a list of files

#### Reading the files

<ol>
  <li> First item </li>
  <li> Second item </li>
  <li> Third item </li>
</ol>

as the files do not have a header row, we will add a header while reading in the file. If this were a single file we would have done: `df = pd.read_csv('../Data/names/yob2011.txt', names=['name','sex','number'])`

In [23]:

    # the assign() method adds a helper column
dfs = [
    pd.read_csv(name_of_file, names=['name','sex','number']).
    assign(file_num = f'file-{idx}', file_name=name_of_file.name[0::])
    for idx, name_of_file in enumerate(list_of_files, 1)
]

In [27]:
dfs

[            name sex  number file_num    file_name
 0          Emily   F   23948   file-1  yob2005.txt
 1           Emma   F   20349   file-1  yob2005.txt
 2        Madison   F   19572   file-1  yob2005.txt
 3        Abigail   F   15753   file-1  yob2005.txt
 4         Olivia   F   15698   file-1  yob2005.txt
 ...          ...  ..     ...      ...          ...
 32551    Zymiere   M       5   file-1  yob2005.txt
 32552     Zyrell   M       5   file-1  yob2005.txt
 32553     Zyrian   M       5   file-1  yob2005.txt
 32554     Zyshon   M       5   file-1  yob2005.txt
 32555  Zytavious   M       5   file-1  yob2005.txt
 
 [32556 rows x 5 columns],
            name sex  number file_num    file_name
 0      Jennifer   F   56779   file-2  yob1971.txt
 1      Michelle   F   33159   file-2  yob1971.txt
 2          Lisa   F   32912   file-2  yob1971.txt
 3      Kimberly   F   30703   file-2  yob1971.txt
 4           Amy   F   26238   file-2  yob1971.txt
 ...         ...  ..     ...      ...    

In [25]:
dfs_in_one = pd.concat(dfs)

In [26]:
dfs_in_one.head()

Unnamed: 0,name,sex,number,file_num,file_name
0,Emily,F,23948,file-1,yob2005.txt
1,Emma,F,20349,file-1,yob2005.txt
2,Madison,F,19572,file-1,yob2005.txt
3,Abigail,F,15753,file-1,yob2005.txt
4,Olivia,F,15698,file-1,yob2005.txt


and here we have a dataframe containing the data from all the csv file in the folder