In [13]:
import pandas as pd
from pandasql import sqldf, load_meat, load_births
def pysqldf(q):
    """Run the SQL query string `q` with pandasql and return its result.

    DataFrames referenced in the query are looked up by name in this
    notebook's global namespace, which is passed to `sqldf` explicitly.
    """
    return sqldf(q, globals())
import numpy as np

# Defining the sample tables

In [14]:
# Sample tables used throughout the notebook.
# Income is stored as integers (not strings) so that pandas aggregations
# (mean/sum/min/max) and sorting operate numerically.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['Munich', 'London', 'Amsterdam', 'Paris', 'Berlin'],
    'Income': [55000, 80000, 80000, 85000, 60000]
}
df = pd.DataFrame(data)  #For one table queries
df1 = pd.DataFrame(data) #For two table queries


data2 = {
    'Name': ['Alice', 'David', 'Fiona', 'George'],
    'Profession': ['Engineer', 'Doctor', 'Professor', 'Policeman']
}
df2 = pd.DataFrame(data2) #For two table queries


data3 = {
    'Name': ['Jon', 'Laura', 'Jordi', 'Taylor', 'Rosalia'],
    'Age': [33, 21, 55, 40, 25],
    'City': ['Chicago', 'Milano', 'Barcelona', 'Taipei', 'Madrid'],
    'Income': [110000, 80000, 55000, 60000, 50000]
}

df_rest = pd.DataFrame(data3) #For UNION queries

# Week 54 - 2023
1. Querying a Table
2. Selecting specific columns
3. GROUP BY
4. SORT BY

In [15]:
# Each section shows the pandas expression first and the SQL equivalent second.
# Only the last expression of the cell is rendered as the cell output.

#1__________________________________________________________________ Querying a Table
df

pysqldf("""
        SELECT 
        *
        FROM df 
""")

#2__________________________________________________________________ Selecting specific columns
df[["Name","Age","City"]]

pysqldf("""
        SELECT 
        Name, Age, City
        FROM df 
""")

#3__________________________________________________________________ GROUP BY 
# Named aggregation + as_index=False yields the same two columns
# (City, AVG_income) that the SQL query returns.
df.groupby("City", as_index=False).agg(AVG_income=("Income", "mean"))

pysqldf("""
        SELECT 
        City,
        AVG(Income) AS AVG_income
        FROM df 
        GROUP BY 1
""")

#4__________________________________________________________________ SORT BY SINGLE COLUMN
df.sort_values(by=["Income"], ascending=True)

pysqldf("""
        SELECT 
        *
        FROM df 
        ORDER BY Income ASC
""")


Unnamed: 0,Name,Age,City,Income
0,Alice,25,Munich,55000
1,Eva,45,Berlin,60000
2,Bob,30,London,80000
3,Charlie,35,Amsterdam,80000
4,David,40,Paris,85000


# Week 1 - 2024
1. Filtering - WHERE
2. Filtering and selecting - WHERE with a single condition and selecting columns
3. Filtering multiple conditions - WHERE with multiple single conditions and selecting columns
4. Filtering after computing - HAVING (filtering on the aggregated result)

In [16]:
# SQL string literals are written with single quotes: in SQLite a
# double-quoted token is an identifier first and is only treated as a string
# through a legacy fallback.

#__________________________________________________________________ FILTER
df[df["City"]=="London"]


pysqldf("""
        SELECT 
        * 
        FROM df 
        WHERE CITY = 'London' 
""")

#__________________________________________________________________ SELECT SPECIFIC COLUMNS AND FILTER
df[df["City"]=="London"][["Name","Income"]]


pysqldf("""
        SELECT 
        Name, 
        Income
        FROM df 
        WHERE CITY = 'London' 
        """)

#__________________________________________________________________ MULTIPLE FILTERS
# Creating a condition for filtering (both conditions must hold)
condition = (df["City"] == "London") & (df["Name"] == "David")

# Applying the condition and selecting specific columns
filtered_df = df[condition][["Name", "City", "Income"]]


pysqldf("""
        SELECT 
        Name, 
        City,
        Income
        FROM df 
        WHERE CITY = 'London' 
          AND Name = 'David'
        """)

#__________________________________________________________________ HAVING

#Computing
grouped_df = df.groupby('City')['Income'].mean().reset_index()
# Keeping only the records for London
london_avg_income = grouped_df[grouped_df['City'] == 'London']

pysqldf("""
        SELECT 
        City,
        AVG(Income) AS AVG_income
        FROM df 
        GROUP BY City
        HAVING CITY = 'London'
        """)



Unnamed: 0,City,AVG_income
0,London,80000.0


# Week 2 - 2024
1. Limiting Results
2. Data Type Conversion
3. Updating Rows
4. Deleting Rows

In [17]:
#__________________________________________________________________ LIMIT
df.head(2)


pysqldf("""
        SELECT 
        * 
        FROM df 
        LIMIT 2 
""")


#__________________________________________________________________ Data Type Conversion
# astype(str) actually converts the values to text (astype(object) only
# changes the container dtype). The result goes into a new frame so that df
# keeps its original columns and the SQL below does not return a second
# `age_text` column.
df_with_age_text = df.assign(age_text=df['Age'].astype(str))


# SQLite has no STRING type: a CAST to "string" gets NUMERIC affinity,
# so TEXT is required to obtain a text value.
pysqldf("""
        SELECT
        *,
        CAST(Age AS TEXT) AS age_text
        FROM df
""")

#__________________________________________________________________ Updating Rows

# SQL equivalent, left commented out (presumably because pandasql cannot
# apply an UPDATE to the DataFrame -- TODO confirm):
#pysqldf("""
#        UPDATE df
#        SET age = 70
#        WHERE name = 'David'
#        """
#       )

df.loc[df['Name'] == "David", 'Age'] = 70

#__________________________________________________________________ Deleting Rows

df = df.drop(df[df['Name'] == "David"].index)

# NOTE(review): the SQL statement is shown for syntax only; it does not
# modify the pandas DataFrame `df` (the drop above is what removes the row).
pysqldf("""
        DELETE FROM df WHERE name = 'David'
        """
       )

# Week 3 - 2024
1. INNER JOIN
2. LEFT JOIN
3. RIGHT JOIN
4. FULL JOIN

In [18]:
# INNER JOIN
# Select df1.* plus the extra df2 column: a plain SELECT * would return the
# join key `Name` twice, unlike pd.merge which keeps a single `Name` column.

pysqldf("""
        SELECT df1.*, df2.Profession
        FROM df1
        INNER JOIN df2 
             ON df1.Name = df2.Name
""")

result_inner_join = pd.merge(df1, df2, on='Name', how='inner')




# LEFT JOIN

pysqldf("""
        SELECT 
                df1.Name, 
                df1.Age, 
                df1.City, 
                df1.Income, 
                df2.Profession
        FROM df1
        LEFT JOIN df2 
             ON df1.Name = df2.Name
""")

result_left_join = pd.merge(df1, df2[['Name', 'Profession']], on='Name', how='left')



# RIGHT JOIN
# NOTE: RIGHT JOIN and FULL OUTER JOIN need SQLite 3.39.0 or newer.

pysqldf("""
        SELECT 
                df2.Name, 
                df1.Age, 
                df1.City, 
                df1.Income, 
                df2.Profession
        FROM df1
        RIGHT JOIN df2 
             ON df1.Name = df2.Name
""")

result_right_join = pd.merge(df1[['Name', 'Age', 'City', 'Income']], df2, on='Name', how='right')


# FULL JOIN
# COALESCE picks the name from whichever side has a match, mirroring the
# single `Name` key column produced by an outer pd.merge.

pysqldf("""
        SELECT 
                COALESCE(df1.Name, df2.Name) AS Name, 
                df1.Age, df1.City, df1.Income, df2.Profession
        FROM df1
        FULL OUTER JOIN df2 
             ON df1.Name = df2.Name;
""")

result_full_join = pd.merge(df1, df2, on='Name', how='outer')
result_full_join







Unnamed: 0,Name,Age,City,Income,Profession
0,Alice,25.0,Munich,55000.0,Engineer
1,Bob,30.0,London,80000.0,
2,Charlie,35.0,Amsterdam,80000.0,
3,David,40.0,Paris,85000.0,Doctor
4,Eva,45.0,Berlin,60000.0,
5,Fiona,,,,Professor
6,George,,,,Policeman


# Week 4 - 2024
1. Simple UNION
2. UNION with Multiple Columns
3. UNION ALL (keeps all duplicates)
4. UNION with Sorting

In [19]:
# Re-create the sample tables: earlier cells update and delete rows of df.
# Income is stored as integers (not strings) so that pandas aggregations
# and sorting operate numerically.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['Munich', 'London', 'Amsterdam', 'Paris', 'Berlin'],
    'Income': [55000, 80000, 80000, 85000, 60000]
}
df = pd.DataFrame(data)  #For one table queries
df1 = pd.DataFrame(data) #For two table queries


data2 = {
    'Name': ['Alice', 'David', 'Fiona', 'George'],
    'Profession': ['Engineer', 'Doctor', 'Professor', 'Policeman']
}
df2 = pd.DataFrame(data2) #For two table queries


data3 = {
    'Name': ['Jon', 'Laura', 'Jordi', 'Taylor', 'Rosalia'],
    'Age': [33, 21, 55, 40, 25],
    'City': ['Chicago', 'Milano', 'Barcelona', 'Taipei', 'Madrid'],
    'Income': [110000, 80000, 55000, 60000, 50000]
}

df_rest = pd.DataFrame(data3) #For UNION queries

In [20]:
# SIMPLE UNION
# UNION removes duplicate rows, hence drop_duplicates() on the pandas side.

pysqldf("""
SELECT * FROM df
UNION
SELECT * FROM df_rest
""")

result = pd.concat([df, df_rest]).drop_duplicates().reset_index(drop=True)



# UNION with Multiple Columns

pysqldf("""
SELECT Name, City, Income FROM df
UNION
SELECT Name, City, Income FROM df_rest

""")

result = pd.concat([df[['Name', 'City', 'Income']], df_rest[['Name', 'City', 'Income']]]).drop_duplicates().reset_index(drop=True)




# UNION ALL (keeps all duplicates)

pysqldf("""
SELECT * FROM df
UNION ALL
SELECT * FROM df_rest

""")

result = pd.concat([df, df_rest]).reset_index(drop=True)



# UNION with Sorting
# Plain UNION (de-duplicating) so the SQL matches the pandas version below,
# which drops duplicates before sorting.

pysqldf("""
SELECT * FROM df
UNION
SELECT * FROM df_rest
ORDER BY Income

""")

result = pd.concat([df, df_rest]).drop_duplicates().sort_values('Income').reset_index(drop=True)








# Week 5 - 2024
1. COUNT DISTINCT VALUES
2. COUNT TOTAL VALUES
3. GROUP BY MULTIPLE COLUMNS
4. SORT BY MULTIPLE COLUMNS

In [26]:
# Sample table with repeated names and cities for the COUNT / GROUP BY /
# window-function examples. Income is numeric so that aggregations and
# rankings are computed on numbers rather than on strings.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice'],
    'Age': [25, 30, 35, 40, 25],
    'City': ['Munich', 'London', 'London', 'Munich', 'Munich'],
    'Income': [55000, 80000, 90000, 85000, 60000]
}
df = pd.DataFrame(data)  #For one table queries
df

Unnamed: 0,Name,Age,City,Income
0,Alice,25,Munich,55000
1,Bob,30,London,80000
2,Charlie,35,London,90000
3,David,40,Munich,85000
4,Alice,25,Munich,60000


In [22]:
#1__________________________________________________________________ COUNT DISTINCT VALUES
df.City.nunique()

pysqldf("""
        SELECT 
        COUNT(DISTINCT City) 
        FROM df 
""")

#2__________________________________________________________________ COUNT TOTAL VALUES
# Series.count() counts the non-null values of one column, like COUNT(City).
# (df.size would return rows * columns instead.)
df["City"].count()

pysqldf("""
        SELECT 
        COUNT(City) 
        FROM df 
""")

#3__________________________________________________________________ GROUP BY MULTIPLE COLUMNS

df.groupby(['City', 'Age']).agg(
    AVG_income=('Income', 'mean')
).reset_index()

pysqldf("""
        SELECT 
        City,
        Age,
        AVG(Income) AS AVG_income
        FROM df 
        GROUP BY 1, 2
""")

#4__________________________________________________________________ SORT BY MULTIPLE COLUMNS
# Income (4th column) ascending, then City (3rd column) descending.
df.sort_values(by=[df.columns[3], df.columns[2]], ascending=[True, False])

pysqldf("""
        SELECT 
        *
        FROM df 
        ORDER BY 4 ASC, 3 DESC
""")


Unnamed: 0,Name,Age,City,Income
0,Alice,25,Munich,55000
1,Alice,25,Munich,55000
2,Bob,30,London,80000
3,Charlie,35,London,80000
4,David,40,Munich,85000


# Week 6 - 2024
1. CALCULATING COLUMN SUM
2. COUNTING ROWS
3. FINDING MAXIMUM AND MINIMUM
4. USING DISTINCT

In [23]:
#1__________________________________________________________________ 1. CALCULATING COLUMN SUM
df['Income'].sum()

pysqldf("""
    SELECT 
        SUM(Income) 
    FROM df
""")

#2__________________________________________________________________ 2. COUNTING ROWS
# Two equivalent ways of getting the number of rows.
len(df)
df.shape[0]

pysqldf("""
SELECT 
    COUNT(*) 
FROM df
""")

#3__________________________________________________________________ 3. FINDING MAXIMUM AND MINIMUM

df['Income'].max()
df['Income'].min()

pysqldf("""
    SELECT 
            MAX(Income) AS max_income, 
            MIN(Income) AS min_income
    FROM df
""")

#4__________________________________________________________________ 4. USING DISTINCT
# As an array of the distinct values:
df['Name'].unique()

# As a one-column DataFrame, the same shape SELECT DISTINCT Name returns
# (drop_duplicates(subset='Name') on the full frame would keep every column).
df[['Name']].drop_duplicates().reset_index(drop=True)

pysqldf("""  
SELECT DISTINCT 
        Name 
FROM df  
""")


Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,David


# Week 7 - 2024
1. RANK
2. DENSE_RANK()
3. ROW_NUMBER
4. NTILE

In [24]:
def sql_ntile(row_number, partition_size, n_tiles):
    """Return the SQL NTILE(n_tiles) bucket for each row.

    Parameters
    ----------
    row_number : pd.Series
        1-based position of each row inside its partition, in window order.
    partition_size : pd.Series
        Number of rows in the partition each row belongs to (same index).
    n_tiles : int
        Number of buckets.

    Returns
    -------
    pd.Series
        Bucket number (1..n_tiles). Buckets are filled in window order and,
        when the partition does not divide evenly, the first buckets get one
        extra row each.
    """
    position = row_number - 1
    base = partition_size // n_tiles      # rows in the smaller buckets
    extra = partition_size % n_tiles      # how many buckets get one more row
    boundary = extra * (base + 1)         # rows covered by the larger buckets
    in_large = position // (base + 1)
    # clip avoids dividing by zero when a partition has fewer rows than buckets
    in_small = extra + (position - boundary) // base.clip(lower=1)
    return in_large.where(position < boundary, in_small) + 1


#1__________________________________________________________________ 1.RANK
# RANK(): tied rows share the lowest rank of the tie and a gap follows -> method='min'.
df['IncomeRank'] = (
    df.groupby('City')['Income']
      .rank(method='min', ascending=False)
      .astype('Int64')
)


pysqldf("""
SELECT Name, Age, City, Income,
       RANK() OVER (PARTITION BY City ORDER BY Income DESC) AS IncomeRank
FROM df;
""")

#2__________________________________________________________________ 2. DENSE_RANK
# DENSE_RANK(): tied rows share a rank and no gap follows -> method='dense'.
df['IncomeDenseRank'] = (
    df.groupby('City')['Income']
      .rank(method='dense', ascending=False)
      .astype('Int64')
)



pysqldf("""
SELECT Name, Age, City, Income,
       DENSE_RANK() OVER (PARTITION BY City ORDER BY Income DESC) AS IncomeDenseRank
FROM df;
""")

#3__________________________________________________________________ 3. ROW_NUMBER
# Sort inside each city by income (highest first) and number the rows from 1;
# the assignment realigns the numbers with df by index.
df['IncomeRowNumber'] = df.sort_values(by=['City', 'Income'], ascending=[True, False])\
                          .groupby('City')\
                          .cumcount() + 1



pysqldf("""
SELECT Name, Age, City, Income,
       ROW_NUMBER() OVER (PARTITION BY City ORDER BY Income DESC) AS IncomeRowNumber
FROM df;
""")

#4__________________________________________________________________ 4. NTILE
# NTILE splits the ordered rows of each partition by position (not by value
# quantiles), so it is derived from the row number and the partition size.
df['IncomeNtile'] = sql_ntile(
    df['IncomeRowNumber'],
    df.groupby('City')['Income'].transform('size'),
    2,
)


pysqldf("""  
SELECT Name, Age, City, Income,
       NTILE(2) OVER (PARTITION BY City ORDER BY Income DESC) AS IncomeNtile
FROM df;
""")


Unnamed: 0,Name,Age,City,Income,IncomeNtile
0,Bob,30,London,80000,1
1,Charlie,35,London,80000,2
2,David,40,Munich,85000,1
3,Alice,25,Munich,55000,1
4,Alice,25,Munich,55000,2
