# Reading Files With Pandas
This session i will try to demonstrate reading file from spreadsheet or database into Pandas DataFrame

In [1]:
# Import library
import pandas as pd
import numpy as np


# Reading CSV with Pandas

in pandas, reading file csv can be done with pd.read_csv() function

In [2]:
# Filepath variable
filepath = 'data/csv/btc-eth-prices.csv'



In [3]:
# assign to df variable
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,Timestamp,Bitcoin,Ether
0,2017-04-02,1099.169125,48.55
1,2017-04-03,1141.813,44.13
2,2017-04-04,1141.600363,44.43
3,2017-04-05,1133.079314,44.9
4,2017-04-06,1196.307937,43.23


In [4]:
# Check null value
df.isnull().sum()

Timestamp    0
Bitcoin      0
Ether        3
dtype: int64

In [5]:
# See null value
df[df['Ether'].isnull()]

Unnamed: 0,Timestamp,Bitcoin,Ether
250,2017-12-08,16007.436667,
251,2017-12-09,15142.834152,
252,2017-12-10,14869.805,


In [6]:
# check basic stats before fill null value
df['Ether'].describe() 

count      362.000000
mean       506.203757
std       1112.117333
min         42.310000
25%        241.945000
50%        309.730000
75%        683.777500
max      20089.420000
Name: Ether, dtype: float64

In [7]:
# fill null value with ffill method (top nearest value)
df[df['Ether'].isnull()] = df.fillna(method='ffill')

In [8]:
# check data after fill null value
df.iloc[249 :254]

Unnamed: 0,Timestamp,Bitcoin,Ether
249,2017-12-07,16501.971667,421.15
250,2017-12-08,16007.436667,421.15
251,2017-12-09,15142.834152,421.15
252,2017-12-10,14869.805,421.15
253,2017-12-11,16762.116667,513.29


In [9]:
# check basic stats after fill null value
df['Ether'].describe()

count      365.000000
mean       505.504685
std       1107.551645
min         42.310000
25%        244.960000
50%        311.260000
75%        682.300000
max      20089.420000
Name: Ether, dtype: float64

In [10]:
# Saving output file
out_filepath = 'data out/csv/btc-eth-prices.csv'

df.to_csv(out_filepath, index=False)

In [11]:
# Read csv file with separator/delimiter, and new header
col_names = ['color', 'director_name', 'num_critic_for_reviews', 'duration',
                'gross', 'movie_title', 'num_user_for_reviews', 'country',
                'content_rating', 'budget', 'title_year', 'imdb_score', 'genre']
movie_df = pd.read_csv('data/csv/movies.csv', sep='|', header=None, names=col_names, na_values='?', index_col='movie_title', thousands=',')
movie_df.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,gross,num_user_for_reviews,country,content_rating,budget,title_year,imdb_score,genre
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Avatar,Color,James Cameron,723.0,178.0,760505847.0,3054.0,USA,PG-13,237000000.0,2009.0,7.9,Action
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,309404152.0,1238.0,USA,PG-13,300000000.0,2007.0,7.1,Action
Spectre,Color,Sam Mendes,602.0,148.0,200074175.0,994.0,UK,PG-13,245000000.0,2015.0,6.8,Action
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,448130642.0,2701.0,USA,PG-13,250000000.0,2012.0,8.5,Action
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,,,,,,,7.1,Documentary


In [12]:
movie_df.isnull().sum()

color                     1
director_name             0
num_critic_for_reviews    1
duration                  1
gross                     3
num_user_for_reviews      1
country                   1
content_rating            2
budget                    4
title_year                1
imdb_score                0
genre                     0
dtype: int64

In [13]:
movie_df[movie_df['color'].isnull()] = movie_df.fillna(value={'color':'Color'})
movie_df.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,gross,num_user_for_reviews,country,content_rating,budget,title_year,imdb_score,genre
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Avatar,Color,James Cameron,723.0,178.0,760505847.0,3054.0,USA,PG-13,237000000.0,2009.0,7.9,Action
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,309404152.0,1238.0,USA,PG-13,300000000.0,2007.0,7.1,Action
Spectre,Color,Sam Mendes,602.0,148.0,200074175.0,994.0,UK,PG-13,245000000.0,2015.0,6.8,Action
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,448130642.0,2701.0,USA,PG-13,250000000.0,2012.0,8.5,Action
Star Wars: Episode VII - The Force Awakens,Color,Doug Walker,,,,,,,,,7.1,Documentary


In [14]:
movie_df = movie_df.fillna(value={'gross':movie_df['gross'].mean()})

In [15]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, Avatar  to The Hobbit: An Unexpected Journey 
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   color                   100 non-null    object 
 1   director_name           100 non-null    object 
 2   num_critic_for_reviews  99 non-null     float64
 3   duration                99 non-null     float64
 4   gross                   100 non-null    float64
 5   num_user_for_reviews    99 non-null     float64
 6   country                 99 non-null     object 
 7   content_rating          98 non-null     object 
 8   budget                  96 non-null     float64
 9   title_year              99 non-null     float64
 10  imdb_score              100 non-null    float64
 11  genre                   100 non-null    object 
dtypes: float64(7), object(5)
memory usage: 7.8+ KB


In [16]:
movie_df = movie_df.fillna(value={'budget':movie_df['budget'].mean()})

In [17]:
movie_df[movie_df['country'].isnull()] = movie_df.fillna(value={'country':movie_df['country'].mode()})
movie_df[movie_df['content_rating'].isnull()] = movie_df.fillna(value={'content_rating':movie_df['content_rating'].mode()})

In [18]:
movie_df = movie_df.fillna(value={'content_rating':movie_df['content_rating'].mode()})

In [19]:
movie_df.isnull().sum()

color                     0
director_name             0
num_critic_for_reviews    1
duration                  1
gross                     0
num_user_for_reviews      1
country                   1
content_rating            2
budget                    0
title_year                1
imdb_score                0
genre                     0
dtype: int64

In [20]:
movie_df = movie_df.fillna(value={'num_user_for_reviews':movie_df['num_user_for_reviews'].max()})

In [21]:
movie_df = movie_df.fillna(value={'num_critic_for_reviews':movie_df['num_critic_for_reviews'].max()})

In [22]:
movie_df = movie_df.fillna(value={'duration':movie_df['duration'].mean()})

In [23]:
movie_df = movie_df.fillna(value={'title_year':movie_df['title_year'].mode()})

In [24]:
movie_df['content_rating'] = movie_df['content_rating'].fillna('PG-13')

In [25]:
movie_df['title_year'] = movie_df['title_year'].fillna(2013)

In [26]:
movie_df['country'] = movie_df['country'].fillna('USA')

In [27]:
movie_df.isnull().sum()

color                     0
director_name             0
num_critic_for_reviews    0
duration                  0
gross                     0
num_user_for_reviews      0
country                   0
content_rating            0
budget                    0
title_year                0
imdb_score                0
genre                     0
dtype: int64

In [28]:
movie_df.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,gross,num_user_for_reviews,country,content_rating,budget,title_year,imdb_score,genre
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Avatar,Color,James Cameron,723.0,178.0,760505800.0,3054.0,USA,PG-13,237000000.0,2009.0,7.9,Action
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,309404200.0,1238.0,USA,PG-13,300000000.0,2007.0,7.1,Action
Spectre,Color,Sam Mendes,602.0,148.0,200074200.0,994.0,UK,PG-13,245000000.0,2015.0,6.8,Action
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,448130600.0,2701.0,USA,PG-13,250000000.0,2012.0,8.5,Action
Star Wars: Episode VII - The Force Awakens,Color,Doug Walker,813.0,131.535354,238347700.0,4667.0,USA,PG-13,195996875.0,2013.0,7.1,Documentary


In [29]:
# Saving output file
movie_df.to_csv('data out/csv/movies.csv')

In [30]:
# reading tsv (tab separated value) with pd.read_csv
tsv = 'data/tsv/simpsons-episodes.tsv'
col_names = ['Title', 'Air date', 'Production code', 'Season', 'Number in season',
             'Number in series', 'US viewers (million)', 'Views', 'IMDB rating']

simpsons_df = pd.read_csv(
    tsv, sep='\t', names=col_names, index_col='Production code', na_values=['no_val'], parse_dates=['Air date'],skiprows=4)
simpsons_df.head()

Unnamed: 0_level_0,Title,Air date,Season,Number in season,Number in series,US viewers (million),Views,IMDB rating
Production code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7F01,Two Cars in Every Garage and Three Eyes on Eve...,1990-01-11,2.0,4,17,26.1,64959.0,8.1
7F08,,1990-11-15,2.0,6,19,25.4,50691.0,8.0
7F06,Bart the Daredevil,1990-06-12,2.0,8,21,26.2,57605.0,
,Bart Gets Hit by a Car,1991-10-01,2.0,10,23,24.8,56486.0,7.8
7F13,Homer vs. Lisa and the 8th Commandment,1991-07-02,2.0,13,26,26.2,58277.0,8.0


In [31]:
# Saving output file
simpsons_df.to_csv('data out/csv/simpsons.csv')

## Reading XSLX with Pandas

with read_excel method, reading excel file relatively easy because usually excel file already organized

In [32]:
pathfile = 'data/xlsx/playstore.xlsx'

playstore_df = pd.read_excel(pathfile,
                   parse_dates=['Last_Updated'],
                   usecols=['App', 'Rating', 'Installs', 'Rating', 'Genres', 'Last_Updated'])

playstore_df.head()

Unnamed: 0,App,Rating,Installs,Genres,Last_Updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,"10,000+",Art & Design,2018-01-07
1,Coloring book moana,3.9,"500,000+",Art & Design;Pretend Play,2018-01-15
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,"5,000,000+",Art & Design,2018-08-01
3,Sketch - Draw & Paint,4.5,"50,000,000+",Art & Design,2018-06-08
4,Pixel Draw - Number Art Coloring Book,4.3,"100,000+",Art & Design;Creativity,2018-06-20


In [33]:
# Saving output file
playstore_df.to_excel('data out/xlsx/playstore.xlsx')

# Reading JSON file with Pandas

Read json file can be done with read_json() function and to write it with to_json() function

In [34]:
# import library
import json

In [35]:
filepath = 'data/json/games.json'

games_df = pd.read_json(filepath)
games_df.head()

Unnamed: 0,title,price,content_rating_img,release_date,provider,genre,image
0,Call of Duty®: WWII + Destiny 2 - Lote,"129,99 €",https://cdn-a.sonyentertainmentnetwork.com/grc...,Dic 21 2018,Activision Blizzard Int'l BV,Shooter,https://store.playstation.com/store/api/chihir...
1,God of War® Digital Deluxe Edition,"69,99 €",https://cdn-a.sonyentertainmentnetwork.com/grc...,Abr 20 2018,Sony Interactive Entertainment Europe,Acción,https://store.playstation.com/store/api/chihir...
2,Far Cry 5,"69,99 €",https://cdn-a.sonyentertainmentnetwork.com/grc...,Mar 27 2018,UBISOFT ENTERTAINMENT SA,Aventura,https://store.playstation.com/store/api/chihir...
3,Far Cry 5 Edición Deluxe,"79,99 €",https://cdn-a.sonyentertainmentnetwork.com/grc...,Mar 27 2018,UBISOFT ENTERTAINMENT SA,Aventura,https://store.playstation.com/store/api/chihir...
4,Far Cry 5 Edición Oro,"89,99 €",https://cdn-a.sonyentertainmentnetwork.com/grc...,Mar 27 2018,UBISOFT ENTERTAINMENT SA,Aventura,https://store.playstation.com/store/api/chihir...


In [36]:
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               193 non-null    object
 1   price               193 non-null    object
 2   content_rating_img  193 non-null    object
 3   release_date        193 non-null    object
 4   provider            193 non-null    object
 5   genre               189 non-null    object
 6   image               193 non-null    object
dtypes: object(7)
memory usage: 5.3+ KB


In [37]:
# not a valid json data
filepath = 'data/json/users.json'

user_df = pd.read_json(filepath)
user_df.head()

Unnamed: 0,info
0,"{'id': 1, 'name': 'Leanne Graham', 'username':..."
1,"{'id': 2, 'name': 'Ervin Howell', 'username': ..."
2,"{'id': 3, 'name': 'Clementine Bauch', 'usernam..."
3,"{'id': 4, 'name': 'Patricia Lebsack', 'usernam..."
4,"{'id': 5, 'name': 'Chelsey Dietrich', 'usernam..."


In [38]:
# read json file without pandas (with python library only)
with open(filepath) as file:
    json_dict = json.load(file)

In [39]:
json_dict # dictionaries data types

{'info': [{'id': 1,
   'name': 'Leanne Graham',
   'username': 'Bret',
   'email': 'Sincere@april.biz',
   'address': [{'street': 'Kulas Light',
     'suite': 'Apt. 556',
     'city': 'Gwenborough',
     'zipcode': '92998-3874',
     'geo': {'lat': '-37.3159', 'lng': '81.1496'}}],
   'phone': '1-770-736-8031 x56442',
   'website': 'hildegard.org',
   'company': {'name': 'Romaguera-Crona',
    'catchPhrase': 'Multi-layered client-server neural-net',
    'bs': 'harness real-time e-markets'}},
  {'id': 2,
   'name': 'Ervin Howell',
   'username': 'Antonette',
   'email': 'Shanna@melissa.tv',
   'address': [{'street': 'Victor Plains',
     'suite': 'Suite 879',
     'city': 'Wisokyburgh',
     'zipcode': '90566-7771',
     'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}],
   'phone': '010-692-6593 x09125',
   'website': 'anastasia.net',
   'company': {'name': 'Deckow-Crist',
    'catchPhrase': 'Proactive didactic contingency',
    'bs': 'synergize scalable supply-chains'}},
  {'id': 3,
   '

In [40]:
# still not valid
user_df = pd.DataFrame.from_dict(json_dict)
user_df.head()

Unnamed: 0,info
0,"{'id': 1, 'name': 'Leanne Graham', 'username':..."
1,"{'id': 2, 'name': 'Ervin Howell', 'username': ..."
2,"{'id': 3, 'name': 'Clementine Bauch', 'usernam..."
3,"{'id': 4, 'name': 'Patricia Lebsack', 'usernam..."
4,"{'id': 5, 'name': 'Chelsey Dietrich', 'usernam..."


In [41]:
# valid json file but in company column still not normalize
user_df = pd.DataFrame.from_dict(json_dict['info'])
user_df.head()

Unnamed: 0,id,name,username,email,address,phone,website,company
0,1,Leanne Graham,Bret,Sincere@april.biz,"[{'street': 'Kulas Light', 'suite': 'Apt. 556'...",1-770-736-8031 x56442,hildegard.org,"{'name': 'Romaguera-Crona', 'catchPhrase': 'Mu..."
1,2,Ervin Howell,Antonette,Shanna@melissa.tv,"[{'street': 'Victor Plains', 'suite': 'Suite 8...",010-692-6593 x09125,anastasia.net,"{'name': 'Deckow-Crist', 'catchPhrase': 'Proac..."
2,3,Clementine Bauch,Samantha,Nathan@yesenia.net,"[{'street': 'Douglas Extension', 'suite': 'Sui...",1-463-123-4447,ramiro.info,"{'name': 'Romaguera-Jacobson', 'catchPhrase': ..."
3,4,Patricia Lebsack,Karianne,Julianne.OConner@kory.org,"[{'street': 'Hoeger Mall', 'suite': 'Apt. 692'...",493-170-9623 x156,kale.biz,"{'name': 'Robel-Corkery', 'catchPhrase': 'Mult..."
4,5,Chelsey Dietrich,Kamren,Lucio_Hettinger@annie.ca,"[{'street': 'Skiles Walks', 'suite': 'Suite 35...",(254)954-1289,demarco.info,"{'name': 'Keebler LLC', 'catchPhrase': 'User-c..."


In [42]:
# import normalize library
from pandas.io.json import json_normalize

user_df = json_normalize(json_dict['info'])
user_df.head() # now company column is separated

Unnamed: 0,id,name,username,email,address,phone,website,company.name,company.catchPhrase,company.bs
0,1,Leanne Graham,Bret,Sincere@april.biz,"[{'street': 'Kulas Light', 'suite': 'Apt. 556'...",1-770-736-8031 x56442,hildegard.org,Romaguera-Crona,Multi-layered client-server neural-net,harness real-time e-markets
1,2,Ervin Howell,Antonette,Shanna@melissa.tv,"[{'street': 'Victor Plains', 'suite': 'Suite 8...",010-692-6593 x09125,anastasia.net,Deckow-Crist,Proactive didactic contingency,synergize scalable supply-chains
2,3,Clementine Bauch,Samantha,Nathan@yesenia.net,"[{'street': 'Douglas Extension', 'suite': 'Sui...",1-463-123-4447,ramiro.info,Romaguera-Jacobson,Face to face bifurcated interface,e-enable strategic applications
3,4,Patricia Lebsack,Karianne,Julianne.OConner@kory.org,"[{'street': 'Hoeger Mall', 'suite': 'Apt. 692'...",493-170-9623 x156,kale.biz,Robel-Corkery,Multi-tiered zero tolerance productivity,transition cutting-edge web services
4,5,Chelsey Dietrich,Kamren,Lucio_Hettinger@annie.ca,"[{'street': 'Skiles Walks', 'suite': 'Suite 35...",(254)954-1289,demarco.info,Keebler LLC,User-centric fault-tolerant solution,revolutionize end-to-end systems


In [43]:
# write and save json file
user_df.to_json('data out/json/users.json')
user_df.to_csv('data out/csv/users.csv')

In [44]:
# another json file
path = 'data/json/artists.json'

with open(path) as file:
    json_dict = json.load(file)

In [45]:
artist_df = json_normalize(json_dict)
artist_df.head()

Unnamed: 0,name,years,genre,nationality,bio
0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,"[{'full name': 'Amedeo Clemente Modigliani', '..."
1,Vasiliy Kandinskiy,1866 - 1944,"Expressionism,Abstractionism",Russian,[{'full name': 'Wassily Wassilyevich Kandinsky...
2,Diego Rivera,1886 - 1957,"Social Realism,Muralism",Mexican,[{'full name': 'Diego María de la Concepción J...
3,Claude Monet,1840 - 1926,Impressionism,French,"[{'full name': 'Oscar-Claude Monet', 'pronunci..."
4,Rene Magritte,1898 - 1967,"Surrealism,Impressionism",Belgian,[{'full name': 'René François Ghislain Magritt...


In [46]:
# same result
df_a = pd.read_json(path)
df_a.head()

Unnamed: 0,name,years,genre,nationality,bio
0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,"[{'full name': 'Amedeo Clemente Modigliani', '..."
1,Vasiliy Kandinskiy,1866 - 1944,"Expressionism,Abstractionism",Russian,[{'full name': 'Wassily Wassilyevich Kandinsky...
2,Diego Rivera,1886 - 1957,"Social Realism,Muralism",Mexican,[{'full name': 'Diego María de la Concepción J...
3,Claude Monet,1840 - 1926,Impressionism,French,"[{'full name': 'Oscar-Claude Monet', 'pronunci..."
4,Rene Magritte,1898 - 1967,"Surrealism,Impressionism",Belgian,[{'full name': 'René François Ghislain Magritt...


In [47]:
bio_df = json_normalize(json_dict, record_path='bio', meta=['name'])
bio_df.head()

Unnamed: 0,full name,pronunciation,life span,info,wikipedia,paintings,name
0,Amedeo Clemente Modigliani,[ameˈdɛːo modiʎˈʎaːni],12 July 1884 – 24 January 1920,was an Italian Jewish painter and sculptor who...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193,Amedeo Modigliani
1,Wassily Wassilyevich Kandinsky,"Васи́лий Васи́льевич Канди́нский, tr. Vasíliy ...",16 December [O.S. 4 December] 1866 – 13 Decemb...,was a Russian painter and art theorist.,http://en.wikipedia.org/wiki/Wassily_Kandinsky,88,Vasiliy Kandinskiy
2,Diego María de la Concepción Juan Nepomuceno E...,[ˈdjeɣo riˈβeɾa],"December 8, 1886 – November 24, 1957",was a prominent Mexican painter. His large fre...,http://en.wikipedia.org/wiki/Diego_Rivera,70,Diego Rivera
3,Oscar-Claude Monet,[klod mɔnɛ],14 November 1840 – 5 December 1926,"was a French painter, a founder of French Impr...",http://en.wikipedia.org/wiki/Claude_Monet,73,Claude Monet
4,René François Ghislain Magritte,[ʁəne fʁɑ̃swa ɡilɛ̃ maɡʁit],21 November 1898 – 15 August 1967,Was a Belgian Surrealist artist. He became wel...,http://en.wikipedia.org/wiki/René_Magritte,194,Rene Magritte


# Reading data from realational databases

Database that i used = https://www.sqlitetutorial.net/sqlite-sample-database/

In pandas there is no way to read a relational databases file, but pandas can read sql file. So the best possible way is using python sqlite3 library and assign it sql then to pandas DataFrame

In [48]:
# import library 
import sqlite3

In [49]:
# make a database connection
db_path = 'data/db/chinook.db'
conn = sqlite3.connect(db_path)

In [50]:
# make a cursor in order to do executing query
cur = conn.cursor()

In [51]:
# Simple query execute
cur.execute('SELECT * FROM employees  LIMIT 5') # select first 5 data for all column from employees table

<sqlite3.Cursor at 0xa60f7e0>

In [52]:
# using fetchall method in order to view the query result
results = cur.fetchall()
results

[(1,
  'Adams',
  'Andrew',
  'General Manager',
  None,
  '1962-02-18 00:00:00',
  '2002-08-14 00:00:00',
  '11120 Jasper Ave NW',
  'Edmonton',
  'AB',
  'Canada',
  'T5K 2N1',
  '+1 (780) 428-9482',
  '+1 (780) 428-3457',
  'andrew@chinookcorp.com'),
 (2,
  'Edwards',
  'Nancy',
  'Sales Manager',
  1,
  '1958-12-08 00:00:00',
  '2002-05-01 00:00:00',
  '825 8 Ave SW',
  'Calgary',
  'AB',
  'Canada',
  'T2P 2T3',
  '+1 (403) 262-3443',
  '+1 (403) 262-3322',
  'nancy@chinookcorp.com'),
 (3,
  'Peacock',
  'Jane',
  'Sales Support Agent',
  2,
  '1973-08-29 00:00:00',
  '2002-04-01 00:00:00',
  '1111 6 Ave SW',
  'Calgary',
  'AB',
  'Canada',
  'T2P 5M5',
  '+1 (403) 262-3443',
  '+1 (403) 262-6712',
  'jane@chinookcorp.com'),
 (4,
  'Park',
  'Margaret',
  'Sales Support Agent',
  2,
  '1947-09-19 00:00:00',
  '2003-05-03 00:00:00',
  '683 10 Street SW',
  'Calgary',
  'AB',
  'Canada',
  'T2P 5G3',
  '+1 (403) 263-4423',
  '+1 (403) 263-4289',
  'margaret@chinookcorp.com'),
 (5,


In [53]:
# assign it to employee_df
employee_df = pd.DataFrame(results)

In [54]:
employee_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,Adams,Andrew,General Manager,,1962-02-18 00:00:00,2002-08-14 00:00:00,11120 Jasper Ave NW,Edmonton,AB,Canada,T5K 2N1,+1 (780) 428-9482,+1 (780) 428-3457,andrew@chinookcorp.com
1,2,Edwards,Nancy,Sales Manager,1.0,1958-12-08 00:00:00,2002-05-01 00:00:00,825 8 Ave SW,Calgary,AB,Canada,T2P 2T3,+1 (403) 262-3443,+1 (403) 262-3322,nancy@chinookcorp.com
2,3,Peacock,Jane,Sales Support Agent,2.0,1973-08-29 00:00:00,2002-04-01 00:00:00,1111 6 Ave SW,Calgary,AB,Canada,T2P 5M5,+1 (403) 262-3443,+1 (403) 262-6712,jane@chinookcorp.com
3,4,Park,Margaret,Sales Support Agent,2.0,1947-09-19 00:00:00,2003-05-03 00:00:00,683 10 Street SW,Calgary,AB,Canada,T2P 5G3,+1 (403) 263-4423,+1 (403) 263-4289,margaret@chinookcorp.com
4,5,Johnson,Steve,Sales Support Agent,2.0,1965-03-03 00:00:00,2003-10-17 00:00:00,7727B 41 Ave,Calgary,AB,Canada,T3B 1Y7,1 (780) 836-9987,1 (780) 836-9543,steve@chinookcorp.com


In [55]:
# before move on, don't forget to close cursor and connection
cur.close()
conn.close()

# Read sql with pandas

read sql using read_sql function

In [56]:
# make a database connection
db_path = 'data/db/chinook.db'
conn = sqlite3.connect(db_path)

In [57]:
# make DataFrame using read_sql

# select all data for all row and column from employees table
employee_df = pd.read_sql('SELECT * FROM employees;', conn, index_col='EmployeeId', parse_dates=['BirthDate','HireDate']) 
employee_df.head()

Unnamed: 0_level_0,LastName,FirstName,Title,ReportsTo,BirthDate,HireDate,Address,City,State,Country,PostalCode,Phone,Fax,Email
EmployeeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Adams,Andrew,General Manager,,1962-02-18,2002-08-14,11120 Jasper Ave NW,Edmonton,AB,Canada,T5K 2N1,+1 (780) 428-9482,+1 (780) 428-3457,andrew@chinookcorp.com
2,Edwards,Nancy,Sales Manager,1.0,1958-12-08,2002-05-01,825 8 Ave SW,Calgary,AB,Canada,T2P 2T3,+1 (403) 262-3443,+1 (403) 262-3322,nancy@chinookcorp.com
3,Peacock,Jane,Sales Support Agent,2.0,1973-08-29,2002-04-01,1111 6 Ave SW,Calgary,AB,Canada,T2P 5M5,+1 (403) 262-3443,+1 (403) 262-6712,jane@chinookcorp.com
4,Park,Margaret,Sales Support Agent,2.0,1947-09-19,2003-05-03,683 10 Street SW,Calgary,AB,Canada,T2P 5G3,+1 (403) 263-4423,+1 (403) 263-4289,margaret@chinookcorp.com
5,Johnson,Steve,Sales Support Agent,2.0,1965-03-03,2003-10-17,7727B 41 Ave,Calgary,AB,Canada,T3B 1Y7,1 (780) 836-9987,1 (780) 836-9543,steve@chinookcorp.com


In [58]:
employee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 1 to 8
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   LastName    8 non-null      object        
 1   FirstName   8 non-null      object        
 2   Title       8 non-null      object        
 3   ReportsTo   7 non-null      float64       
 4   BirthDate   8 non-null      datetime64[ns]
 5   HireDate    8 non-null      datetime64[ns]
 6   Address     8 non-null      object        
 7   City        8 non-null      object        
 8   State       8 non-null      object        
 9   Country     8 non-null      object        
 10  PostalCode  8 non-null      object        
 11  Phone       8 non-null      object        
 12  Fax         8 non-null      object        
 13  Email       8 non-null      object        
dtypes: datetime64[ns](2), float64(1), object(11)
memory usage: 608.0+ bytes


In [59]:
employee_df.describe()

Unnamed: 0,ReportsTo
count,7.0
mean,2.857143
std,2.193063
min,1.0
25%,1.5
50%,2.0
75%,4.0
max,6.0


In [60]:
employee_df.isnull().sum()

LastName      0
FirstName     0
Title         0
ReportsTo     1
BirthDate     0
HireDate      0
Address       0
City          0
State         0
Country       0
PostalCode    0
Phone         0
Fax           0
Email         0
dtype: int64

In [61]:
employee_df[employee_df['ReportsTo'].isnull()]

Unnamed: 0_level_0,LastName,FirstName,Title,ReportsTo,BirthDate,HireDate,Address,City,State,Country,PostalCode,Phone,Fax,Email
EmployeeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Adams,Andrew,General Manager,,1962-02-18,2002-08-14,11120 Jasper Ave NW,Edmonton,AB,Canada,T5K 2N1,+1 (780) 428-9482,+1 (780) 428-3457,andrew@chinookcorp.com


In [62]:
employee_df = employee_df.fillna(0)

In [63]:
employee_df.isnull().sum()

LastName      0
FirstName     0
Title         0
ReportsTo     0
BirthDate     0
HireDate      0
Address       0
City          0
State         0
Country       0
PostalCode    0
Phone         0
Fax           0
Email         0
dtype: int64

In [64]:
employee_df

Unnamed: 0_level_0,LastName,FirstName,Title,ReportsTo,BirthDate,HireDate,Address,City,State,Country,PostalCode,Phone,Fax,Email
EmployeeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Adams,Andrew,General Manager,0.0,1962-02-18,2002-08-14,11120 Jasper Ave NW,Edmonton,AB,Canada,T5K 2N1,+1 (780) 428-9482,+1 (780) 428-3457,andrew@chinookcorp.com
2,Edwards,Nancy,Sales Manager,1.0,1958-12-08,2002-05-01,825 8 Ave SW,Calgary,AB,Canada,T2P 2T3,+1 (403) 262-3443,+1 (403) 262-3322,nancy@chinookcorp.com
3,Peacock,Jane,Sales Support Agent,2.0,1973-08-29,2002-04-01,1111 6 Ave SW,Calgary,AB,Canada,T2P 5M5,+1 (403) 262-3443,+1 (403) 262-6712,jane@chinookcorp.com
4,Park,Margaret,Sales Support Agent,2.0,1947-09-19,2003-05-03,683 10 Street SW,Calgary,AB,Canada,T2P 5G3,+1 (403) 263-4423,+1 (403) 263-4289,margaret@chinookcorp.com
5,Johnson,Steve,Sales Support Agent,2.0,1965-03-03,2003-10-17,7727B 41 Ave,Calgary,AB,Canada,T3B 1Y7,1 (780) 836-9987,1 (780) 836-9543,steve@chinookcorp.com
6,Mitchell,Michael,IT Manager,1.0,1973-07-01,2003-10-17,5827 Bowness Road NW,Calgary,AB,Canada,T3B 0C5,+1 (403) 246-9887,+1 (403) 246-9899,michael@chinookcorp.com
7,King,Robert,IT Staff,6.0,1970-05-29,2004-01-02,590 Columbia Boulevard West,Lethbridge,AB,Canada,T1K 5N8,+1 (403) 456-9986,+1 (403) 456-8485,robert@chinookcorp.com
8,Callahan,Laura,IT Staff,6.0,1968-01-09,2004-03-04,923 7 ST NW,Lethbridge,AB,Canada,T1H 1Y8,+1 (403) 467-3351,+1 (403) 467-8772,laura@chinookcorp.com


In [65]:
# write to csv and json file
csv_path = 'data out/csv/employees.csv'
json_path = 'data out/json/employees.json'

employee_df.to_csv(csv_path)
employee_df.to_json(json_path)

In [66]:
# close connection
conn.close()

# Read SQL with .sql file
In order to read SQL file, the database connection is required. But, since i am not using any relational database so i will make database connection to database in memory

In [67]:
# make a connection to database with memory
conn = sqlite3.connect(':memory:')

# create new cursor
cursor = conn.cursor()

In [68]:
# open sql file
path = 'data/sql/cryptos.sql'

cursor.executescript(open(path, 'r').read())

<sqlite3.Cursor at 0xa8b5de0>

In [69]:
# make data frame from cryptos.sql
crypto_df = pd.read_sql('''SELECT cryptocoins_cryptocurrency.name AS coin_name, cryptocoins_exchange.name AS exchange, symbol, price_usd, percent_change_7d
                            FROM cryptocoins_cryptocurrency
                            JOIN cryptocoins_exchange
                            ON cryptocoins_cryptocurrency.exchange_id = cryptocoins_exchange.id''', conn)

In [70]:
crypto_df.head()

Unnamed: 0,coin_name,exchange,symbol,price_usd,percent_change_7d
0,Bitcoin,Bitstamp,BTC,8707.37,-5.91
1,Ethereum,Bitstamp,ETH,186.5,-0.66
2,XRP,Bitstamp,XRP,0.27,-8.22
3,Bitcoin Cash,Binance,BCH,278.92,-4.76
4,Tether,Bitstamp,USDT,1.01,0.45


In [71]:
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   coin_name          100 non-null    object 
 1   exchange           100 non-null    object 
 2   symbol             100 non-null    object 
 3   price_usd          100 non-null    float64
 4   percent_change_7d  100 non-null    float64
dtypes: float64(2), object(3)
memory usage: 2.8+ KB


In [72]:
crypto_df.describe()

Unnamed: 0,price_usd,percent_change_7d
count,100.0,100.0
mean,122.4742,0.7523
std,885.728397,14.05091
min,0.0,-17.84
25%,0.0875,-5.165
50%,0.74,-1.765
75%,3.1875,2.47
max,8707.37,89.14
