In [1]:
import pandas as pd
from pandasql import sqldf, load_meat, load_births
def pysqldf(q):
    """Run the SQL query string ``q`` through pandasql's ``sqldf`` and return its result."""
    return sqldf(q)
import numpy as np

# Defining the sample tables

In [2]:
# Base table: five people with their age, city and income.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['Munich', 'London', 'Amsterdam', 'Paris', 'Berlin'],
    'Income': [55000, 80000, 80000, 85000, 60000]
}
df = pd.DataFrame(data)  # For one table queries
df1 = pd.DataFrame(data)  # For two table queries (separate frame, same content as df)


# Lookup table: shares only 'Alice' and 'David' with the base table,
# so the JOIN examples produce both matched and unmatched rows.
data2 = {
    'Name': ['Alice', 'David', 'Fiona', 'George'],
    'Profession': ['Engineer', 'Doctor', 'Professor', 'Policeman']
}
df2 = pd.DataFrame(data2)  # For two table queries


# Second people table with the same columns as `data`, used for UNION queries.
data3 = {
    'Name': ['Jon', 'Laura', 'Jordi', 'Taylor', 'Rosalia'],
    'Age': [33, 21, 55, 40, 25],
    'City': ['Chicago', 'Milano', 'Barcelona', 'Taipei', 'Madrid'],
    'Income': [110000, 80000, 55000, 60000, 50000]
}

df_rest = pd.DataFrame(data3)  # For UNION queries

# POST 1 - 19/02/2024 
You can find the post [here!](https://x.com/rfeers/status/1759502326593953816?s=20)
1. **SIMPLE UNION:** Allows us to unify two tables with the same structure. Duplicate records are removed.
2. **UNION WITH MULTIPLE COLUMNS:** We can first select the columns of interest and then unify the corresponding outputs. 

In [3]:
# SIMPLE UNION

pysqldf("""
SELECT * FROM df
UNION
SELECT * FROM df_rest
""")

# pandas counterpart: stack both frames, discard repeated rows, renumber the index
result = (
    pd.concat([df, df_rest])
    .drop_duplicates()
    .reset_index(drop=True)
)



# UNION with Multiple Columns

pysqldf("""
SELECT Name, City, Income FROM df
UNION
SELECT Name, City, Income FROM df_rest

""")

# pandas counterpart: project the same three columns from each frame,
# stack the projections, discard repeated rows, renumber the index
result = (
    pd.concat([frame[['Name', 'City', 'Income']] for frame in (df, df_rest)])
    .drop_duplicates()
    .reset_index(drop=True)
)

# POST 2 - 26/02/2024
You can find the post [here!](https://x.com/rfeers/status/1762043714170769615?s=20)
1. **UNION ALL**: Allows us to unify all the records of two tables with the same structure. Duplicate records are kept.
2. **UNION WITH SORTING**: Once we unify all the records of two tables with the same structure, we can sort the final output.

In [4]:
# UNION ALL (keeps all duplicates)

pysqldf("""
SELECT * FROM df
UNION ALL
SELECT * FROM df_rest

""")

# pandas counterpart: plain concatenation with a fresh 0..n-1 index
result = pd.concat([df, df_rest], ignore_index=True)



# UNION with Sorting

pysqldf("""
SELECT * FROM df
UNION ALL
SELECT * FROM df_rest
ORDER BY Income

""")

# pandas counterpart of UNION ALL + ORDER BY: concatenate, sort by Income,
# then renumber the index. No drop_duplicates() here: UNION ALL keeps every
# row, so de-duplicating would reproduce plain UNION instead.
result = (
    pd.concat([df, df_rest])
    .sort_values('Income')
    .reset_index(drop=True)
)

# POST 3 - 4/03/2024
You can find the post [here!](https://x.com/rfeers/status/1764620049980203174?s=20)
1. **INNER JOIN**: Merges the overlapping records of two tables (records that exist in both tables) 
2. **LEFT JOIN**: Merges the information of a second table (right table) into a first one (left table). Only those records that exist in the first (LEFT) table are kept.

In [5]:
# INNER JOIN

pysqldf("""
        SELECT *
        FROM df1
        INNER JOIN df2 
             ON df1.Name = df2.Name
""")

# pandas counterpart: keep only the rows whose Name appears in both frames
result_inner_join = pd.merge(
    left=df1,
    right=df2,
    how='inner',
    on='Name',
)




# LEFT JOIN

pysqldf("""
        SELECT 
                df1.Name, 
                df1.Age, 
                df1.City, 
                df1.Income, 
                df2.Profession
        FROM df1
        LEFT JOIN df2 
             ON df1.Name = df2.Name
""")

# pandas counterpart: every row of df1, with Profession attached where the Name matches
result_left_join = pd.merge(
    left=df1,
    right=df2[['Name', 'Profession']],
    how='left',
    on='Name',
)

# POST 4 - 11/03/2024
You can find the post [here!](https://x.com/rfeers/status/1767096817785921879?s=20)
1. **RIGHT JOIN**: Merges the information of a first table (left table) into a second one (right table). Only those records that exist in the second (RIGHT) table are kept.
2. **FULL JOIN**: Merges all the info of two tables (all existing records)

In [6]:
# RIGHT JOIN

pysqldf("""
        SELECT 
                df2.Name, 
                df1.Age, 
                df1.City, 
                df1.Income, 
                df2.Profession
        FROM df1
        RIGHT JOIN df2 
             ON df1.Name = df2.Name
""")

# pandas counterpart: every row of df2, with the df1 columns attached where the Name matches
result_right_join = pd.merge(
    left=df1[['Name', 'Age', 'City', 'Income']],
    right=df2,
    how='right',
    on='Name',
)


# FULL JOIN

pysqldf("""
        SELECT 
                COALESCE(df1.Name, df2.Name) AS Name, 
                df1.Age, df1.City, df1.Income, df2.Profession
        FROM df1
        FULL OUTER JOIN df2 
             ON df1.Name = df2.Name;
""")

# pandas counterpart: all rows from both frames, matched on Name where possible
result_full_join = pd.merge(
    left=df1,
    right=df2,
    how='outer',
    on='Name',
)


# POST 5 - 02/04/2024
You can find the post [here!](https://x.com/rfeers/status/1775095868749951075?s=20)
1. **Querying a Table:** We can call all the records of an existing table.
2. **Selecting specific columns:** We can just select those columns of interest.


In [7]:
# Querying a Table
# pandas: the DataFrame itself is the full table
df

# SQL: select every column of every row
pysqldf("""
        SELECT 
        *
        FROM df 
""")

# Selecting specific columns
# pandas: index with the list of wanted column names
df[["Name", "Age", "City"]]

# SQL: name the wanted columns in the SELECT list (last expression, so it is displayed)
pysqldf("""
        SELECT 
        Name, Age, City
        FROM df 
""")

Unnamed: 0,Name,Age,City
0,Alice,25,Munich
1,Bob,30,London
2,Charlie,35,Amsterdam
3,David,40,Paris
4,Eva,45,Berlin


# POST 6 - 08/04/2024
Yet to come...
1. **GROUP BY single column:** We can group by a specific column
2. **SORT BY single column:** We can sort by a specific column

In [8]:
# GROUP BY
# pandas: mean Income per City (result is indexed by City)
avg_income_by_city = (
    df.groupby(by="City")["Income"]
    .mean()
)

# SQL: same aggregation; `GROUP BY 1` refers to the first selected column (City)
pysqldf("""
        SELECT 
        City,
        AVG(Income) AS AVG_income
        FROM df 
        GROUP BY 1
""")

# SORT BY SINGLE COLUMN
# pandas: ascending order is the default
df.sort_values(by="Income")

# SQL: explicit ascending order on Income
pysqldf("""
        SELECT 
        *
        FROM df 
        ORDER BY Income ASC
""")


Unnamed: 0,Name,Age,City,Income
0,Alice,25,Munich,55000
1,Eva,45,Berlin,60000
2,Bob,30,London,80000
3,Charlie,35,Amsterdam,80000
4,David,40,Paris,85000


# POST 7 - 15/04/2024
Yet to come...
1. **Filtering - WHERE:** Allows us to filter the output of a table given a static condition with WHERE. 
2. **Filtering and selecting - WHERE with a single condition and selecting columns:** Allows us to filter while selecting columns.

In [9]:
# FILTER
# pandas: boolean mask keeps only the rows whose City is London
df[df["City"]=="London"]


# SQL: same filter. The string literal is single-quoted: double quotes denote
# identifiers in SQL and are only treated as strings by a SQLite legacy fallback.
pysqldf("""
        SELECT 
        * 
        FROM df 
        WHERE CITY = 'London' 
""")

# SELECT SPECIFIC COLUMNS AND FILTER
# pandas: filter the rows first, then keep only the Name and Income columns
df[df["City"]=="London"][["Name","Income"]]


# SQL: same filter and projection (last expression, so it is displayed).
# The string literal is single-quoted: double quotes denote identifiers in SQL
# and are only treated as strings by a SQLite legacy fallback.
pysqldf("""
        SELECT 
        Name, 
        Income
        FROM df 
        WHERE CITY = 'London' 
        """)


Unnamed: 0,Name,Income
0,Bob,80000


# POST 8 - 22/04/2024
Yet to come...

3. **Filtering multiple conditions - WHERE with multiple conditions and selecting columns:** Allows us to filter on several static conditions with the WHERE command while selecting the columns of interest. 
4. **Filtering after computing - HAVING:** Allows us to filter on a condition after computing an aggregate, using the HAVING command. 

In [10]:

# MULTIPLE FILTERS
# Creating a condition for filtering: both comparisons must hold for a row to be kept
# NOTE(review): in the sample data David's city is Paris, so this combined filter
# presumably matches no rows - confirm that an empty result is the intended demo.
condition = (df["City"] == "London") & (df["Name"] == "David")

# Applying the condition and selecting specific columns
filtered_df = df[condition][["Name", "City", "Income"]]


# SQL: same two conditions joined with AND. String literals are single-quoted:
# double quotes denote identifiers in SQL and are only treated as strings by a
# SQLite legacy fallback.
pysqldf("""
        SELECT 
        Name, 
        City,
        Income
        FROM df 
        WHERE CITY = 'London' 
          AND Name = 'David'
        """)

# HAVING

# Computing: mean Income per City, with City turned back into a regular column
grouped_df = df.groupby('City')['Income'].mean().reset_index()
# Keeping only the records for London (filter applied after the aggregation)
london_avg_income = grouped_df[grouped_df['City'] == 'London']

# SQL: aggregate per City, then filter the groups with HAVING (last expression,
# so it is displayed). The string literal is single-quoted: double quotes denote
# identifiers in SQL and are only treated as strings by a SQLite legacy fallback.
pysqldf("""
        SELECT 
        City,
        AVG(Income) AS AVG_income
        FROM df 
        GROUP BY City
        HAVING CITY = 'London'
        """)



Unnamed: 0,City,AVG_income
0,London,80000.0
