## SQLite + Python: data-cleaning functions in a Jupyter notebook

### Load and read data

In [1]:
# import libraries
import pandas as pd

# to create database
import sqlite3 as db

In [2]:
# Read the raw CSV file into a pandas DataFrame.
df = pd.read_csv("trim.csv")

# Leave the DataFrame as the cell's last expression so the notebook displays it.
df

Unnamed: 0,first_name,last_name,age,weight
0,Anna,go,5.0,30.3
1,Bill,jk,10.0,49.2
2,Carl,kkl,15.0,59.4
3,Dig,mkn,20.0,70.8
4,fig,xza,0.0,0.0
5,gos,gos,,112.134
6,gos,gos,,112.134


### Connect to the SQLite database

In [3]:
# Open a sqlite3 connection to the database file trim.db
# (the file is created if it does not exist yet).

connection = db.connect('trim.db')

In [4]:
# df.to_sql('table_name', connection_name) writes the DataFrame to a database table
# so that it can be queried with SQL.
# if_exists decides what happens when the table is already present:
#   'fail' (the pandas default) raises ValueError, 'replace' drops and recreates
#   the table, 'append' adds the rows to the existing table.
# trim.db persists on disk, so 'replace' keeps this cell re-runnable; with the
# default, a second run of the notebook would stop here with an error.

df.to_sql('trim_table', connection, if_exists='replace')

7

In [5]:
# Load the `sql` IPython extension; it provides the %sql and %%sql magics used in the cells below.

%load_ext sql

In [6]:
# Connect the %sql magic to trim.db, the SQLite file that trim_table was written to.

%sql sqlite:///trim.db

'Connected: @trim.db'

In [7]:
%%sql

/* Preview every row and column of the freshly loaded table. */
SELECT *
FROM trim_table;

 * sqlite:///trim.db
Done.


index,first_name,last_name,age,weight
0,Anna,go,5.0,30.3
1,Bill,jk,10.0,49.2
2,Carl,kkl,15.0,59.4
3,Dig,mkn,20.0,70.8
4,fig,xza,0.0,0.0
5,gos,gos,,112.134
6,gos,gos,,112.134


### Trim

In [8]:
%%sql

/* TRIM with a single argument strips spaces from both ends of the value. */
SELECT
    TRIM(first_name) AS trimmed_name
FROM trim_table;

 * sqlite:///trim.db
Done.


trimmed_name
Anna
Bill
Carl
Dig
fig
gos
gos


### Upper & Lower

In [9]:
%%sql

/* Case conversion on literals: UPPER gives 'FIG', LOWER gives 'fig'. */
SELECT
    UPPER('fig'),
    LOWER('FIG');

 * sqlite:///trim.db
Done.


UPPER('fig'),LOWER('FIG')
FIG,fig


### Replace

In [10]:
%%sql

/* REPLACE is a text function: it swaps every occurrence of the substring
   '30.3' for '50' and returns TEXT. The number-to-text conversions SQLite
   would otherwise perform implicitly are written out here. Because matching
   is by substring, a weight such as 130.3 would also be rewritten (to 150). */
SELECT
    REPLACE(CAST(weight AS TEXT), '30.3', '50') AS updated_weight
FROM trim_table;

 * sqlite:///trim.db
Done.


updated_weight
50.0
49.2
59.4
70.8
0.0
112.134
112.134


### NULLIF

In [11]:
%%sql

/* NULLIF(age, 0) yields NULL when age equals 0 and age itself otherwise,
   so a placeholder age of 0 is reported as missing. */
SELECT
    age,
    NULLIF(age, 0) AS exact_age
FROM trim_table;

 * sqlite:///trim.db
Done.


age,exact_age
5.0,5.0
10.0,10.0
15.0,15.0
20.0,20.0
0.0,
,
,


### Substring

> extract the first five characters from first_name

In [12]:
%%sql

/* Keep at most the first five characters of first_name (positions are 1-based).
   SUBSTR is SQLite's native name; SUBSTRING is only accepted as an alias from
   SQLite 3.34 onwards, so SUBSTR also runs on older SQLite builds. */
SELECT
    SUBSTR(first_name, 1, 5) AS extract_name
FROM trim_table;

 * sqlite:///trim.db
Done.


extract_name
An
Carl
Dig
fig
gos
gos


### Length

In [13]:
%%sql

/* Keep only the last names that are longer than two characters. */
SELECT
    last_name
FROM
    trim_table
WHERE
    LENGTH(last_name) > 2;

 * sqlite:///trim.db
Done.


last_name
kkl
mkn
xza
gos
gos


### Round

In [14]:
%%sql

/* Round weight to the nearest whole number. SQLite treats a negative digit
   count as 0 (it does not round to tens as some other engines do), so 0 is
   written explicitly to match what the query actually returns. */
SELECT
    first_name,
    weight,
    ROUND(weight, 0) AS rounded_weight
FROM trim_table;

 * sqlite:///trim.db
Done.


first_name,weight,rounded_weight
Anna,30.3,30.0
Bill,49.2,49.0
Carl,59.4,59.0
Dig,70.8,71.0
fig,0.0,0.0
gos,112.134,112.0
gos,112.134,112.0


### Identify missing values: IS NULL & IS NOT NULL

> the rows with index 5 and 6 have a blank (NULL) age

In [15]:
%%sql

/* Rows whose age is missing (NULL). */
SELECT *
FROM trim_table
WHERE age IS NULL;

 * sqlite:///trim.db
Done.


index,first_name,last_name,age,weight
5,gos,gos,,112.134
6,gos,gos,,112.134


In [16]:
%%sql

/* Rows that do have an age value. */
SELECT *
FROM trim_table
WHERE age IS NOT NULL;

 * sqlite:///trim.db
Done.


index,first_name,last_name,age,weight
0,Anna,go,5.0,30.3
1,Bill,jk,10.0,49.2
2,Carl,kkl,15.0,59.4
3,Dig,mkn,20.0,70.8
4,fig,xza,0.0,0.0


### Remove duplicates

In [17]:
%%sql

/* Return each (first_name, last_name) pair once; the table itself is not modified. */
SELECT DISTINCT
    first_name,
    last_name
FROM trim_table;

 * sqlite:///trim.db
Done.


first_name,last_name
Anna,go
Bill,jk
Carl,kkl
Dig,mkn
fig,xza
gos,gos
