In [1]:
# Data download

import os
import requests

# GitHub repository (owner / name) that the data files are downloaded from.
repo_owner = "primula1323"
repo_name = "forked_Bigdata_Analyst_Cert"
# Git ref passed as ?ref=... when listing folder contents.
branch = "main" 
# Top-level repository folders to download.
folders = ["part1", "part2", "part3", "part4"]
# Base of the GitHub REST "contents" endpoint for this repository.
base_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents"

def download_folder(folder_name, save_dir, timeout=30):
    """Recursively download every ``.csv`` file under a repository folder.

    Lists ``folder_name`` through the GitHub contents API (``base_url`` at
    ref ``branch``), saves each ``.csv`` file into ``save_dir`` and recurses
    into sub-directories, mirroring their names locally.

    Parameters
    ----------
    folder_name : str
        Path of the folder inside the repository.
    save_dir : str
        Local directory to write into; created if it does not exist.
    timeout : float, optional
        Seconds to wait for the listing request before giving up, so a
        stalled connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the listing request returns an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    url = f"{base_url}/{folder_name}?ref={branch}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    items = response.json()

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    for item in items:
        if item["type"] == "file" and item["name"].endswith(".csv"):
            download_file(item["download_url"], os.path.join(save_dir, item["name"]))
        elif item["type"] == "dir":
            # Sub-directory: recurse using its repository path and a matching local folder.
            download_folder(item["path"], os.path.join(save_dir, item["name"]), timeout=timeout)


def download_file(file_url, save_path, timeout=30):
    """Download ``file_url`` and write the response body to ``save_path``.

    Parameters
    ----------
    file_url : str
        URL of the file to fetch.
    save_path : str
        Local path the bytes are written to (opened in binary mode and
        overwritten if it already exists).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server returns an error status; nothing is written then,
        because the status is checked before the file is opened.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(file_url, timeout=timeout)
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)

# Download each configured folder into <repo_name>/<folder> and report progress.
for folder in folders:
    target_dir = os.path.join(repo_name, folder)
    print(f"Downloading folder: {folder}")
    download_folder(folder, target_dir)
    print('Download complete')

Downloading folder: part1
Download complete
Downloading folder: part2
Download complete
Downloading folder: part3
Download complete
Downloading folder: part4
Download complete


# Basic Python

In [8]:
# packages

import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns # not used in Bigdata Engineer
import scipy
import statsmodels.api as sm

* 0            asn1crypto           0.24.0
* 1        beautifulsoup4            4.9.3
* 2               certifi        2018.1.18
* 3               chardet            3.0.4
* 4          cryptography            2.1.4
* 5                cycler           0.10.0
* 6                cython          0.29.24
* 7               distlib            0.3.2
* 8                  idna              2.6
* 9                joblib            1.0.1
* 10              keyring           10.6.0
* 11         keyrings.alt              3.0
* 12           kiwisolver            1.3.1
* 13             `lightgbm`            3.3.2
* 14           matplotlib            3.4.2
* 15                `numpy`           1.21.1
* 16            packaging             23.0
* 17               `pandas`            1.4.2
* 18                patsy            0.5.3
* 19               pillow            8.3.1
* 20                  pip           21.1.3
* 21             pycrypto            2.6.1
* 22            pygobject           3.26.1
* 23            pyparsing            2.4.7
* 24           python-apt  1.6.5+ubuntu0.6
* 25      python-dateutil            2.8.2
* 26                 pytz           2021.1
* 27                pyxdg             0.25
* 28             requests           2.18.4
* 29         `scikit-learn`           0.24.2
* 30                `scipy`            1.7.0
* 31        secretstorage            2.3.1
* 32             selenium          3.141.0
* 33           setuptools           57.4.0
* 34                  six           1.11.0
* 35            soupsieve            2.2.1
* 36        ssh-import-id              5.7
* 37          `statsmodels`           0.13.5
* 38        threadpoolctl            2.2.0
* 39  unattended-upgrades              0.1
* 40              urllib3             1.22
* 41                wheel           0.30.0
* 42              `xgboost`            1.4.2

In [2]:
# help(function) to read the manual

help(np.array)

Help on built-in function array in module numpy:

array(...)
    array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0,
          like=None)

    Create an array.

    Parameters
    ----------
    object : array_like
        An array, any object exposing the array interface, an object whose
        ``__array__`` method returns an array, or any (nested) sequence.
        If object is a scalar, a 0-dimensional array containing object is
        returned.
    dtype : data-type, optional
        The desired data-type for the array. If not given, NumPy will try to use
        a default ``dtype`` that can represent the values (by applying promotion
        rules when necessary.)
    copy : bool, optional
        If true (default), then the object is copied.  Otherwise, a copy will
        only be made if ``__array__`` returns a copy, if obj is a nested
        sequence, or if a copy is needed to satisfy any of the other
        requirements (``dtype``, ``order``, etc.).
  

# Data Structures

## Pythonic Data Container

In [None]:
[x-1 for x in range(1,6) if x%2 == 1]

[0, 2, 4]

In [None]:
{x-1 for x in range(1,6) if x%2 == 1}

{0, 2, 4}

In [None]:
{x+10 : x-1 for x in range(1,6) if x%2 == 0}

{12: 1, 14: 3}

## lambda Operation

## Iteration

In [24]:
lst = [1, 2, 3, 4, 5, 6]
egv = {}

In [25]:
# enumerate yields (index, element) pairs; each pair is stored in egv as index -> element.
for i, item in enumerate(lst) :
    egv[i] = item
    print(i, item)

0 1
1 2
2 3
3 4
4 5
5 6


In [27]:
# zip pairs each key with its value in order; egv.items() yields the same pairs.
for k, v in zip(egv.keys(), egv.values()) :
    print(k, v)

0 1
1 2
2 3
3 4
4 5
5 6


# Series and Dataframes : pandas

## Series

In [14]:
col1 = pd.Series(['a', 'b', 'c'])
col2 = pd.Series([1, 2, 3])
col3 = pd.Series([4, 5, 6])

## Dataframes

In [5]:
# Built from a list of rows
egdF1 = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=['a', 'b', 'c'])
egdF1

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [6]:
# Built from a dictionary (column name -> list of values)
egdF2 = pd.DataFrame({'a' : [1,2,3], 'b' : [4,5,6], 'c':[7,8,9]})
egdF2

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [15]:
# Built from Series (column name -> pd.Series)
egdF3 = pd.DataFrame({'a' : col1, 'b' : col2, 'c' : col3})
egdF3

Unnamed: 0,a,b,c
0,a,1,4
1,b,2,5
2,c,3,6


* `df['col']`은 pd.Series를 반환한다.
* `df[['col']]`은  pd.DataFrame을 반환한다.

# Functions and Classes

## Functions

In [None]:
def myfunc(x: int) -> int:
    """Return ``x`` increased by 3.

    The return annotation is ``int`` because the function returns a value,
    not ``None``.
    """
    return x + 3

## Classes

# Data import

## txt

In [2]:
# Read the example text file line by line. The with-block closes the file
# even if reading or printing raises, unlike a manual open()/close() pair.
with open('Data/textread_example.txt', 'r') as f:
    for line in f:
        # Each line keeps its trailing newline, so print() leaves a blank
        # line between rows in the output.
        print(line)

DO VOTERS AFFECT OR ELECT POLICIES? EVIDENCE FROM THE U. S. HOUSE



David S. Lee

Enrico Moretti

Matthew J. Butler


## csv

In [9]:
pd.read_csv('forked_Bigdata_Analyst_Cert/part4/ch4/data4-3.csv', parse_dates = ['date_added'])

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries"
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
3,s4,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV"
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ..."
...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,"Cult Movies, Dramas, Thrillers"
8803,s8804,TV Show,Zombie Dumb,,,,2019-07-01,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies"
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,"Comedies, Horror Movies"
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,"Children & Family Movies, Comedies"


In [10]:
pd.read_csv('forked_Bigdata_Analyst_Cert/part4/ch4/data4-3.csv', index_col = 'title', parse_dates = ['date_added'])

Unnamed: 0_level_0,show_id,type,director,cast,country,date_added,release_year,rating,duration,listed_in
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Dick Johnson Is Dead,s1,Movie,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries
Blood & Water,s2,TV Show,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries"
Ganglands,s3,TV Show,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
Jailbirds New Orleans,s4,TV Show,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV"
Kota Factory,s5,TV Show,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ..."
...,...,...,...,...,...,...,...,...,...,...
Zodiac,s8803,Movie,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,"Cult Movies, Dramas, Thrillers"
Zombie Dumb,s8804,TV Show,,,,2019-07-01,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies"
Zombieland,s8805,Movie,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,"Comedies, Horror Movies"
Zoom,s8806,Movie,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,"Children & Family Movies, Comedies"


# Data Export

In [16]:
egdF3.to_csv('result.csv')
pd.read_csv('result.csv')

Unnamed: 0.1,Unnamed: 0,a,b,c
0,0,a,1,4
1,1,b,2,5
2,2,c,3,6


In [17]:
egdF3.to_csv('result.csv', index = False) # to erase 'Unnamed: 0'
pd.read_csv('result.csv')

Unnamed: 0,a,b,c
0,a,1,4
1,b,2,5
2,c,3,6
