# 1 Info 

This notebook explains how to query pandas DataFrames using SQL syntax.

# 2 Examples

## 2.1 Install and import the library "pandasql"

In [None]:
# Install the package (uncomment and run once if pandasql is not already installed).
# %pip is used instead of !pip so the install targets the running kernel's environment.
# %pip install pandasql

In [1]:
from pandasql import sqldf

Import the other libraries.

In [2]:
import pandas as pd
import numpy as np

## 2.2 Generate sample data

In [3]:
# Seed NumPy's global random generator so the sample data is the same on every run.
np.random.seed(seed=10)

In [4]:
# Sample data: 20 random rows. The dict entries are evaluated top to bottom, so
# the draws from NumPy's global random generator happen in column order.
my_df = pd.DataFrame(
    {
        'gender': np.random.choice(['m', 'f', 'u'], size=20, p=[0.6, 0.3, 0.1]),
        'age': np.random.poisson(lam=25, size=20),
        'score_a': np.random.randint(0, 100, size=20),
        'score_b': np.random.randint(0, 100, size=20),
        'score_c': np.random.randint(0, 100, size=20),
    }
)

# Lookup table: one row per gender code together with its full label.
gender = pd.DataFrame(
    [('m', 'male'), ('f', 'female'), ('u', 'unknown')],
    columns=['gender', 'full'],
)

In [6]:
# Display the gender lookup table.
gender

Unnamed: 0,gender,full
0,m,male
1,f,female
2,u,unknown


## 2.3 Main function - sqldf.sqldf 

The main function in pandasql, `sqldf`, accepts two parameters:
* An SQL query string
* A set of session/environment variables (`locals()` or `globals()`)

The following helper can be defined to avoid specifying the environment every time you want to run a query:

In [8]:
def pysqldf(q):
    """Run the SQL query ``q`` against the variables of this notebook.

    Thin wrapper around ``sqldf`` that always passes ``globals()`` as the
    environment, so callers only have to supply the query string.

    Parameters
    ----------
    q : str
        SQL query string. Table names in the query refer to the names of
        the global variables holding the DataFrames (e.g. ``my_df``).

    Returns
    -------
    The value returned by ``sqldf(q, globals())``; for the ``select``
    queries in this notebook that is a pandas DataFrame.
    """
    return sqldf(q, globals())

## 2.4  Select query to get an average score

Inside the query, the table name is the name of the variable that holds the DataFrame, and the columns are referenced by their DataFrame column names.

In [9]:
# Per-gender summary: number of observations plus the average age and scores.
# The column is spelled `gender` everywhere (matching the DataFrame column), and
# `order by` pins the row order, which SQL otherwise leaves unspecified.
pysqldf("""
    select
        gender, count(*) as obs, avg(age) as avg_age, avg(score_a) as avg_score_a,
        avg(score_b) as avg_score_b, avg(score_c) as avg_score_c
    from
        my_df
    group by
        gender
    order by
        gender
""")

Unnamed: 0,gender,obs,avg_age,avg_score_a,avg_score_b,avg_score_c
0,f,9,23.888889,46.666667,66.555556,68.888889
1,m,9,25.666667,25.333333,35.777778,58.222222
2,u,2,24.5,55.5,79.5,38.0


## 2.5 Join the two tables

In [13]:
# Join the per-gender aggregates (subquery `b`) onto the `gender` lookup table
# to attach the full label to each code. `select *` keeps the join key from
# both sides, so the result has both a `gender` and a `Gender` column.
pysqldf("""
    select 
        * 
    from 
        gender
    inner join 
        (
            select 
                Gender, count(*) as obs, avg(age) as avg_age, avg(score_a) as avg_score_a,
                avg(score_b) as avg_score_b, avg(score_c) as avg_score_c
            from 
                my_df
            group by 
                gender
        ) b
    on 
        gender.gender = b.gender
""")

Unnamed: 0,gender,full,Gender,obs,avg_age,avg_score_a,avg_score_b,avg_score_c
0,m,male,m,9,25.666667,25.333333,35.777778,58.222222
1,f,female,f,9,23.888889,46.666667,66.555556,68.888889
2,u,unknown,u,2,24.5,55.5,79.5,38.0


## 2.6 Save the query result into a variable as a DataFrame

In [15]:
# Same join as in section 2.5, but the result is bound to `data` instead of
# being displayed, so it can be reused in later cells.
data = pysqldf("""
    select 
        * 
    from 
        gender
    inner join 
        (
            select 
                Gender, count(*) as obs, avg(age) as avg_age, avg(score_a) as avg_score_a,
                avg(score_b) as avg_score_b, avg(score_c) as avg_score_c
            from 
                my_df
            group by 
                gender
        ) b
    on 
        gender.gender = b.gender
""")

In [16]:
# Display the query result stored in `data`.
data

Unnamed: 0,gender,full,Gender,obs,avg_age,avg_score_a,avg_score_b,avg_score_c
0,m,male,m,9,25.666667,25.333333,35.777778,58.222222
1,f,female,f,9,23.888889,46.666667,66.555556,68.888889
2,u,unknown,u,2,24.5,55.5,79.5,38.0
