In [7]:
import psycopg2
import random
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from faker import Faker

### Set up a connection to the PostgreSQL database

In [8]:
f = open("credentials\credentials_aws.txt")
lines=f.readlines()
host=lines[0][7:-1].strip()
database=lines[1][11:].strip()
user=lines[2][7:].strip()
password=lines[3][11:].strip()
port=lines[4][7:].strip()
url=lines[5][6:].strip()
f.close()

In [9]:
print("host:", host)
print("database:", database)
print("user:", user)
print("password:", password)
print("port:", port)
print("url:", url)

host: database-1.chaf71z5ycev.eu-north-1.rds.amazonaws.com
database: database-1 # Don't use this line for AWS
user: postgres
password: edemdb1234
port: 5432
url: postgresql://postgres:edemdb1234@database-1.chaf71z5ycev.eu-north-1.rds.amazonaws.com:5432/


In [10]:
def update_db(query):
    conn = psycopg2.connect(
    host=host,
    user=user,
    password=password,
    port=port
    )

    cur = conn.cursor()

    cur.execute(query)

    conn.commit()
    cur.close()
    conn.close()

    return "Database has been updated"

In [11]:
def check_db(query):
    engine = create_engine(url)
    
    return pd.read_sql_query(text(query), con=engine.connect())

### Create "categories" table

In [14]:
# update_db("DROP TABLE if exists categories cascade")

'Database has been updated'

In [15]:
# Create table
query = """
    CREATE TABLE categories(
        category_id SERIAL PRIMARY KEY,
        name VARCHAR(50)
    )
"""
update_db(query)

# Insert data
query = """
    INSERT INTO categories (name)
    VALUES
        ('Marketing'),
        ('Finance and Investment'),
        ('Management Skills'),
        ('Business Management'),
        ('Tech'),
        ('Entrepreneurship'),
        ('Artificial Intelligence'),
        ('Crypto'),
        ('Sport'),
        ('Economy'),
        ('Networking'),
        ('Sustainability'),
        ('Design'),
        ('Music'),
        ('HR'),
        ('Employment')
"""
update_db(query)

'Database has been updated'

### Check "categories" table

In [16]:
query = "SELECT * FROM categories"
check_db(query)

Unnamed: 0,category_id,name
0,1,Marketing
1,2,Finance and Investment
2,3,Management Skills
3,4,Business Management
4,5,Tech
5,6,Entrepreneurship
6,7,Artificial Intelligence
7,8,Crypto
8,9,Sport
9,10,Economy


### Create "events" table

In [None]:
# update_db("DROP TABLE if exists events cascade")

In [None]:
# Create table
query = """
    CREATE TABLE events(
        event_id SERIAL PRIMARY KEY,
        title VARCHAR(100),
        description TEXT,
        date TIMESTAMP,
        category_id INTEGER,
        CONSTRAINT fk_categories FOREIGN KEY (category_id) REFERENCES categories(category_id)
    )
"""
update_db(query)

# Insert data
query = """
    INSERT INTO events (title, description, date, category_id)
    VALUES
        ('HR INTERNATIONAL SUMMER SCHOOL', 'empty', '2023-06-22 00:00:00', '2'),
        ('OPEN DAY BOOTCAMPS EDEM POWERED BY THE BRIDGE', 'empty', '2023-07-04 19:00:00', '3'),
        ('ENCUENTRO CON ERIC MASKIN, PREMIO NOBEL DE ECONOMÍA 2007', 'empty', '2023-06-05 11:30:00', '1')
"""
update_db(query)

### Check "events" table

In [17]:
query = "SELECT * FROM events"
check_db(query)

Unnamed: 0,event_id,title,description,date,category_id
0,1,HR INTERNATIONAL SUMMER SCHOOL,empty,2023-06-22 00:00:00,2
1,2,OPEN DAY BOOTCAMPS EDEM POWERED BY THE BRIDGE,empty,2023-07-04 19:00:00,3
2,3,"ENCUENTRO CON ERIC MASKIN, PREMIO NOBEL DE ECO...",empty,2023-06-05 11:30:00,1


### Create "students_df" dataframe first and then convert the dataframe into SQL table

Column "gender"

In [27]:
# Set the seed value
seed_value = 42

# Set the seed for random module
random.seed(seed_value)

male_count = 481
female_count = 253

genders = []
for _ in range(male_count):
    genders.append("male")

for _ in range(female_count):
    genders.append("female")

random.shuffle(genders)

students_df = pd.DataFrame(genders, columns=["gender"])
students_df

Unnamed: 0,gender
0,female
1,female
2,female
3,male
4,male
...,...
729,male
730,male
731,male
732,male


Column "name"

In [28]:
fake = Faker('es_ES')

names = []
surnames = []

for gen in students_df["gender"]:
    if gen == "male":
        names.append(fake.first_name_male())
        surnames.append(fake.last_name_male())
    elif gen == "female":
        names.append(fake.first_name_female())
        surnames.append(fake.last_name_female())

students_df["name"] = names
students_df["surname"] = surnames

students_df

Unnamed: 0,gender,name,surname
0,female,Zaira,Verdugo
1,female,Purificación,Llanos
2,female,Soledad,Palomares
3,male,Jesús,Alberola
4,male,Vidal,Baena
...,...,...,...
729,male,Guillermo,Guillén
730,male,Teodosio,Isern
731,male,Raúl,Mercader
732,male,Conrado,Oliva


Columns "programme" and "year"

In [29]:
prog_year = {
    ('BSc in Engineering and Management', '1'): 65, ('BSc in Engineering and Management', '2'): 42, ('BSc in Engineering and Management', '3'): 28, ('BSc in Engineering and Management', '4'): 29,
    ('BBA in Business Administration', '1'): 128, ('BBA in Business Administration', '2'): 92, ('BBA in Business Administration', '3'): 83, ('BBA in Business Administration', '4'): 72,
    ('Master Marketing and Digital Sales', '1'): 61, ('Master Marketing and Digital Sales', '2'): 0, ('Master Marketing and Digital Sales', '3'): 0, ('Master Marketing and Digital Sales', '4'): 0,
    ('Master Data Analytics', '1'): 30, ('Master Data Analytics', '2'): 0, ('Master Data Analytics', '3'): 0, ('Master Data Analytics', '4'): 0,
    ('Master Finance', '1'): 19, ('Master Finance', '2'): 0, ('Master Finance', '3'): 0, ('Master Finance', '4'): 0,
    ('MBA Junior', '1'): 41, ('MBA Junior', '2'): 0, ('MBA Junior', '3'): 0, ('MBA Junior', '4'): 0,
    ('Bootcamp Data Science', '1'): 17, ('Bootcamp Data Science', '2'): 0, ('Bootcamp Data Science', '3'): 0, ('Bootcamp Data Science', '4'): 0,
    ('Bootcamp Full Stack', '1'): 14, ('Bootcamp Full Stack', '2'): 0, ('Bootcamp Full Stack', '3'): 0, ('Bootcamp Full Stack', '4'): 0,
    ('Bootcamp UX/ UI', '1'): 7, ('Bootcamp UX/ UI', '2'): 0, ('Bootcamp UX/ UI', '3'): 0, ('Bootcamp UX/ UI', '4'): 0,
    ('Bootcamp Cybersecurity', '1'): 6, ('Bootcamp Cybersecurity', '2'): 0, ('Bootcamp Cybersecurity', '3'): 0, ('Bootcamp Cybersecurity', '4'): 0,
}

# Set the seed for random module
random.seed(seed_value)

students_df["prog_year"] = students_df.apply(
    lambda row: random.choices(list(prog_year.keys()), weights=list(prog_year.values()))[0],
    axis=1
)

# Create the "programme" column
students_df["programme"] = students_df["prog_year"].apply(lambda row: row[0])

# Create the "year_of_study" column
students_df["year"] = students_df["prog_year"].apply(lambda row: row[1])

students_df

Unnamed: 0,gender,name,surname,prog_year,programme,year
0,female,Zaira,Verdugo,"(BBA in Business Administration, 4)",BBA in Business Administration,4
1,female,Purificación,Llanos,"(BSc in Engineering and Management, 1)",BSc in Engineering and Management,1
2,female,Soledad,Palomares,"(BBA in Business Administration, 1)",BBA in Business Administration,1
3,male,Jesús,Alberola,"(BSc in Engineering and Management, 4)",BSc in Engineering and Management,4
4,male,Vidal,Baena,"(Master Marketing and Digital Sales, 1)",Master Marketing and Digital Sales,1
...,...,...,...,...,...,...
729,male,Guillermo,Guillén,"(Master Marketing and Digital Sales, 1)",Master Marketing and Digital Sales,1
730,male,Teodosio,Isern,"(BBA in Business Administration, 2)",BBA in Business Administration,2
731,male,Raúl,Mercader,"(BBA in Business Administration, 4)",BBA in Business Administration,4
732,male,Conrado,Oliva,"(MBA Junior, 1)",MBA Junior,1


In [30]:
# Delete "prog_year" column
students_df = students_df.drop("prog_year", axis=1)
students_df.head()

Unnamed: 0,gender,name,surname,programme,year
0,female,Zaira,Verdugo,BBA in Business Administration,4
1,female,Purificación,Llanos,BSc in Engineering and Management,1
2,female,Soledad,Palomares,BBA in Business Administration,1
3,male,Jesús,Alberola,BSc in Engineering and Management,4
4,male,Vidal,Baena,Master Marketing and Digital Sales,1


In [31]:
students_df.groupby("programme")["name"].count()

programme
BBA in Business Administration        373
BSc in Engineering and Management     167
Bootcamp Cybersecurity                 10
Bootcamp Data Science                  20
Bootcamp Full Stack                    12
Bootcamp UX/ UI                         7
MBA Junior                             31
Master Data Analytics                  29
Master Finance                         27
Master Marketing and Digital Sales     58
Name: name, dtype: int64

Column "email" (school_email)

In [32]:
# Generate school_email based on "first_name" and "last_name"
students_df["email"] = (students_df["name"].str.lower().str.replace(" ", "") +
                     students_df["surname"].str.lower().str.replace(" ", "") +
                     "@edem.es")

students_df

Unnamed: 0,gender,name,surname,programme,year,email
0,female,Zaira,Verdugo,BBA in Business Administration,4,zairaverdugo@edem.es
1,female,Purificación,Llanos,BSc in Engineering and Management,1,purificaciónllanos@edem.es
2,female,Soledad,Palomares,BBA in Business Administration,1,soledadpalomares@edem.es
3,male,Jesús,Alberola,BSc in Engineering and Management,4,jesúsalberola@edem.es
4,male,Vidal,Baena,Master Marketing and Digital Sales,1,vidalbaena@edem.es
...,...,...,...,...,...,...
729,male,Guillermo,Guillén,Master Marketing and Digital Sales,1,guillermoguillén@edem.es
730,male,Teodosio,Isern,BBA in Business Administration,2,teodosioisern@edem.es
731,male,Raúl,Mercader,BBA in Business Administration,4,raúlmercader@edem.es
732,male,Conrado,Oliva,MBA Junior,1,conradooliva@edem.es


Column "age"

In [33]:
# Set the seed for the random module
random.seed(seed_value)

age_range_1 = [18, 19, 20, 21, 22]
age_range_2 = [23, 24, 25, 26, 27]
age_range_3 = [x for x in range(20, 51)]

mask_1_2 = students_df["programme"].isin(['BSc in Engineering and Management', 'BBA in Business Administration'])
mask_3_to_7 = students_df["programme"].isin(['Master Marketing and Digital Sales', 'Master Data Analytics', 'Master Finance', 'MBA Junior'])
mask_8_to_13 = students_df["programme"].isin(['Bootcamp Data Science', 'Bootcamp Full Stack', 'Bootcamp UX/ UI', 'Bootcamp Cybersecurity'])

students_df.loc[mask_1_2, "age"] = random.choices(age_range_1, weights=[5, 3, 1, 1, 0], k=mask_1_2.sum())
students_df.loc[mask_3_to_7, "age"] = random.choices(age_range_2, weights=[2, 2, 2, 2, 2], k=mask_3_to_7.sum())

weights_3_to_13 = [1 / (x - 19) for x in range(20, 51)]
students_df.loc[mask_8_to_13, "age"] = random.choices(age_range_3, weights=weights_3_to_13, k=mask_8_to_13.sum())

students_df

Unnamed: 0,gender,name,surname,programme,year,email,age
0,female,Zaira,Verdugo,BBA in Business Administration,4,zairaverdugo@edem.es,19.0
1,female,Purificación,Llanos,BSc in Engineering and Management,1,purificaciónllanos@edem.es,18.0
2,female,Soledad,Palomares,BBA in Business Administration,1,soledadpalomares@edem.es,18.0
3,male,Jesús,Alberola,BSc in Engineering and Management,4,jesúsalberola@edem.es,18.0
4,male,Vidal,Baena,Master Marketing and Digital Sales,1,vidalbaena@edem.es,27.0
...,...,...,...,...,...,...,...
729,male,Guillermo,Guillén,Master Marketing and Digital Sales,1,guillermoguillén@edem.es,23.0
730,male,Teodosio,Isern,BBA in Business Administration,2,teodosioisern@edem.es,19.0
731,male,Raúl,Mercader,BBA in Business Administration,4,raúlmercader@edem.es,19.0
732,male,Conrado,Oliva,MBA Junior,1,conradooliva@edem.es,23.0


In [34]:
# Change "age" column dtype
students_df = students_df.astype({"age": "int64"})

students_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   gender     734 non-null    object
 1   name       734 non-null    object
 2   surname    734 non-null    object
 3   programme  734 non-null    object
 4   year       734 non-null    object
 5   email      734 non-null    object
 6   age        734 non-null    int64 
dtypes: int64(1), object(6)
memory usage: 40.3+ KB


Arrange the "students_df" columns

In [35]:
students_df.columns

Index(['gender', 'name', 'surname', 'programme', 'year', 'email', 'age'], dtype='object')

In [36]:
students_df = students_df[["name", "surname", "gender", "age", "year", "email", "programme"]]
students_df

Unnamed: 0,name,surname,gender,age,year,email,programme
0,Zaira,Verdugo,female,19,4,zairaverdugo@edem.es,BBA in Business Administration
1,Purificación,Llanos,female,18,1,purificaciónllanos@edem.es,BSc in Engineering and Management
2,Soledad,Palomares,female,18,1,soledadpalomares@edem.es,BBA in Business Administration
3,Jesús,Alberola,male,18,4,jesúsalberola@edem.es,BSc in Engineering and Management
4,Vidal,Baena,male,27,1,vidalbaena@edem.es,Master Marketing and Digital Sales
...,...,...,...,...,...,...,...
729,Guillermo,Guillén,male,23,1,guillermoguillén@edem.es,Master Marketing and Digital Sales
730,Teodosio,Isern,male,19,2,teodosioisern@edem.es,BBA in Business Administration
731,Raúl,Mercader,male,19,4,raúlmercader@edem.es,BBA in Business Administration
732,Conrado,Oliva,male,23,1,conradooliva@edem.es,MBA Junior


Double check whether the "users_df" data is aligned with the data provided by EDEM

In [37]:
students_df.groupby("programme")["email"].count()

programme
BBA in Business Administration        373
BSc in Engineering and Management     167
Bootcamp Cybersecurity                 10
Bootcamp Data Science                  20
Bootcamp Full Stack                    12
Bootcamp UX/ UI                         7
MBA Junior                             31
Master Data Analytics                  29
Master Finance                         27
Master Marketing and Digital Sales     58
Name: email, dtype: int64

In [38]:
students_df.groupby(["programme", "year"])["email"].count()

programme                           year
BBA in Business Administration      1       122
                                    2        89
                                    3        85
                                    4        77
BSc in Engineering and Management   1        64
                                    2        40
                                    3        24
                                    4        39
Bootcamp Cybersecurity              1        10
Bootcamp Data Science               1        20
Bootcamp Full Stack                 1        12
Bootcamp UX/ UI                     1         7
MBA Junior                          1        31
Master Data Analytics               1        29
Master Finance                      1        27
Master Marketing and Digital Sales  1        58
Name: email, dtype: int64

In [39]:
students_df.groupby("gender")["email"].count()

gender
female    253
male      481
Name: email, dtype: int64

In [40]:
students_df.groupby(["programme", "year", "gender"])["email"].count()

programme                           year  gender
BBA in Business Administration      1     female    32
                                          male      90
                                    2     female    30
                                          male      59
                                    3     female    29
                                          male      56
                                    4     female    30
                                          male      47
BSc in Engineering and Management   1     female    24
                                          male      40
                                    2     female    14
                                          male      26
                                    3     female     9
                                          male      15
                                    4     female    13
                                          male      26
Bootcamp Cybersecurity              1     female     2
                

Comment:
- The number of students per gender that was randomly generated, is the same as the one provided by EDEM, but when we tried to break it down based on the programmes/majors and year of study, there is a slight difference in the figures generated randomly (distribution of the data). 
- Overall, the random data is quite aligned with the data provided by EDEM. 

Create "category_id" column

In [41]:
query = "SELECT * FROM categories"
category_id_df = check_db(query)
category_id_df

Unnamed: 0,category_id,name
0,1,Marketing
1,2,Finance and Investment
2,3,Management Skills
3,4,Business Management
4,5,Tech
5,6,Entrepreneurship
6,7,Artificial Intelligence
7,8,Crypto
8,9,Sport
9,10,Economy


In [42]:
category_id_df["category_id"].to_list()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [43]:
interests = category_id_df["category_id"].to_list()

random.seed(seed_value)

def generate_categories(row):
    num_categories = random.randint(1, len(interests))
    categories = random.sample(interests, k=num_categories)
    return categories

students_df["category_id"] = students_df.apply(generate_categories, axis=1)

students_df

Unnamed: 0,name,surname,gender,age,year,email,programme,category_id
0,Zaira,Verdugo,female,19,4,zairaverdugo@edem.es,BBA in Business Administration,"[1, 12, 5, 4]"
1,Purificación,Llanos,female,18,1,purificaciónllanos@edem.es,BSc in Engineering and Management,"[5, 12, 2, 11, 15, 9, 14, 7]"
2,Soledad,Palomares,female,18,1,soledadpalomares@edem.es,BBA in Business Administration,"[1, 2]"
3,Jesús,Alberola,male,18,4,jesúsalberola@edem.es,BSc in Engineering and Management,"[8, 9, 10, 1, 15, 4, 12]"
4,Vidal,Baena,male,27,1,vidalbaena@edem.es,Master Marketing and Digital Sales,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]"
...,...,...,...,...,...,...,...,...
729,Guillermo,Guillén,male,23,1,guillermoguillén@edem.es,Master Marketing and Digital Sales,"[16, 12, 2, 3]"
730,Teodosio,Isern,male,19,2,teodosioisern@edem.es,BBA in Business Administration,"[15, 9, 2, 16, 10, 14, 4, 8, 11]"
731,Raúl,Mercader,male,19,4,raúlmercader@edem.es,BBA in Business Administration,"[2, 15, 6, 9, 1, 8, 11, 13, 10, 5]"
732,Conrado,Oliva,male,23,1,conradooliva@edem.es,MBA Junior,"[14, 6, 13, 5, 16, 3, 1, 12, 9, 10, 15]"


In [44]:
# Check whether or not there is a repetition in category list per each row
def check_category_repetition(row):
    categories = row["category_id"]
    return len(categories) != len(set(categories))

students_df["repeated_categories"] = students_df.apply(check_category_repetition, axis=1)

students_df

Unnamed: 0,name,surname,gender,age,year,email,programme,category_id,repeated_categories
0,Zaira,Verdugo,female,19,4,zairaverdugo@edem.es,BBA in Business Administration,"[1, 12, 5, 4]",False
1,Purificación,Llanos,female,18,1,purificaciónllanos@edem.es,BSc in Engineering and Management,"[5, 12, 2, 11, 15, 9, 14, 7]",False
2,Soledad,Palomares,female,18,1,soledadpalomares@edem.es,BBA in Business Administration,"[1, 2]",False
3,Jesús,Alberola,male,18,4,jesúsalberola@edem.es,BSc in Engineering and Management,"[8, 9, 10, 1, 15, 4, 12]",False
4,Vidal,Baena,male,27,1,vidalbaena@edem.es,Master Marketing and Digital Sales,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]",False
...,...,...,...,...,...,...,...,...,...
729,Guillermo,Guillén,male,23,1,guillermoguillén@edem.es,Master Marketing and Digital Sales,"[16, 12, 2, 3]",False
730,Teodosio,Isern,male,19,2,teodosioisern@edem.es,BBA in Business Administration,"[15, 9, 2, 16, 10, 14, 4, 8, 11]",False
731,Raúl,Mercader,male,19,4,raúlmercader@edem.es,BBA in Business Administration,"[2, 15, 6, 9, 1, 8, 11, 13, 10, 5]",False
732,Conrado,Oliva,male,23,1,conradooliva@edem.es,MBA Junior,"[14, 6, 13, 5, 16, 3, 1, 12, 9, 10, 15]",False


In [45]:
students_df["repeated_categories"].unique() 

array([False])

In [46]:
# Remove the "repeated_categories" column
students_df = students_df.drop("repeated_categories", axis=1)

In [47]:
students_df

Unnamed: 0,name,surname,gender,age,year,email,programme,category_id
0,Zaira,Verdugo,female,19,4,zairaverdugo@edem.es,BBA in Business Administration,"[1, 12, 5, 4]"
1,Purificación,Llanos,female,18,1,purificaciónllanos@edem.es,BSc in Engineering and Management,"[5, 12, 2, 11, 15, 9, 14, 7]"
2,Soledad,Palomares,female,18,1,soledadpalomares@edem.es,BBA in Business Administration,"[1, 2]"
3,Jesús,Alberola,male,18,4,jesúsalberola@edem.es,BSc in Engineering and Management,"[8, 9, 10, 1, 15, 4, 12]"
4,Vidal,Baena,male,27,1,vidalbaena@edem.es,Master Marketing and Digital Sales,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]"
...,...,...,...,...,...,...,...,...
729,Guillermo,Guillén,male,23,1,guillermoguillén@edem.es,Master Marketing and Digital Sales,"[16, 12, 2, 3]"
730,Teodosio,Isern,male,19,2,teodosioisern@edem.es,BBA in Business Administration,"[15, 9, 2, 16, 10, 14, 4, 8, 11]"
731,Raúl,Mercader,male,19,4,raúlmercader@edem.es,BBA in Business Administration,"[2, 15, 6, 9, 1, 8, 11, 13, 10, 5]"
732,Conrado,Oliva,male,23,1,conradooliva@edem.es,MBA Junior,"[14, 6, 13, 5, 16, 3, 1, 12, 9, 10, 15]"


In [48]:
# Test category mapping
category_map = list(category_id_df.set_index("category_id").to_dict().values())[0]
category_map

{1: 'Marketing',
 2: 'Finance and Investment',
 3: 'Management Skills',
 4: 'Business Management',
 5: 'Tech',
 6: 'Entrepreneurship',
 7: 'Artificial Intelligence',
 8: 'Crypto',
 9: 'Sport',
 10: 'Economy',
 11: 'Networking',
 12: 'Sustainability',
 13: 'Design',
 14: 'Music',
 15: 'HR',
 16: 'Employment'}

In [49]:
students_df['category'] = students_df['category_id'].apply(lambda x: [category_map.get(category) for category in x])
students_df

Unnamed: 0,name,surname,gender,age,year,email,programme,category_id,category
0,Zaira,Verdugo,female,19,4,zairaverdugo@edem.es,BBA in Business Administration,"[1, 12, 5, 4]","[Marketing, Sustainability, Tech, Business Man..."
1,Purificación,Llanos,female,18,1,purificaciónllanos@edem.es,BSc in Engineering and Management,"[5, 12, 2, 11, 15, 9, 14, 7]","[Tech, Sustainability, Finance and Investment,..."
2,Soledad,Palomares,female,18,1,soledadpalomares@edem.es,BBA in Business Administration,"[1, 2]","[Marketing, Finance and Investment]"
3,Jesús,Alberola,male,18,4,jesúsalberola@edem.es,BSc in Engineering and Management,"[8, 9, 10, 1, 15, 4, 12]","[Crypto, Sport, Economy, Marketing, HR, Busine..."
4,Vidal,Baena,male,27,1,vidalbaena@edem.es,Master Marketing and Digital Sales,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]","[Crypto, Employment, Economy, Tech, Marketing,..."
...,...,...,...,...,...,...,...,...,...
729,Guillermo,Guillén,male,23,1,guillermoguillén@edem.es,Master Marketing and Digital Sales,"[16, 12, 2, 3]","[Employment, Sustainability, Finance and Inves..."
730,Teodosio,Isern,male,19,2,teodosioisern@edem.es,BBA in Business Administration,"[15, 9, 2, 16, 10, 14, 4, 8, 11]","[HR, Sport, Finance and Investment, Employment..."
731,Raúl,Mercader,male,19,4,raúlmercader@edem.es,BBA in Business Administration,"[2, 15, 6, 9, 1, 8, 11, 13, 10, 5]","[Finance and Investment, HR, Entrepreneurship,..."
732,Conrado,Oliva,male,23,1,conradooliva@edem.es,MBA Junior,"[14, 6, 13, 5, 16, 3, 1, 12, 9, 10, 15]","[Music, Entrepreneurship, Design, Tech, Employ..."


Create "student_id" column

In [50]:
students_df.columns

Index(['name', 'surname', 'gender', 'age', 'year', 'email', 'programme',
       'category_id', 'category'],
      dtype='object')

In [51]:
# Add "student_id" column into "students_df" dataframe. This column will be a primary key in SQL and will be filled in automatically in SQL
students_df["student_id"] = [x for x in range (1, 735)]

# Put the "student_id" in the first column
students_df = students_df[['student_id', 'email', 'name', 'surname', 'gender', 'age', 'programme', 'year', 'category_id', 
                           'category']]

students_df.head()

Unnamed: 0,student_id,email,name,surname,gender,age,programme,year,category_id,category
0,1,zairaverdugo@edem.es,Zaira,Verdugo,female,19,BBA in Business Administration,4,"[1, 12, 5, 4]","[Marketing, Sustainability, Tech, Business Man..."
1,2,purificaciónllanos@edem.es,Purificación,Llanos,female,18,BSc in Engineering and Management,1,"[5, 12, 2, 11, 15, 9, 14, 7]","[Tech, Sustainability, Finance and Investment,..."
2,3,soledadpalomares@edem.es,Soledad,Palomares,female,18,BBA in Business Administration,1,"[1, 2]","[Marketing, Finance and Investment]"
3,4,jesúsalberola@edem.es,Jesús,Alberola,male,18,BSc in Engineering and Management,4,"[8, 9, 10, 1, 15, 4, 12]","[Crypto, Sport, Economy, Marketing, HR, Busine..."
4,5,vidalbaena@edem.es,Vidal,Baena,male,27,Master Marketing and Digital Sales,1,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]","[Crypto, Employment, Economy, Tech, Marketing,..."


In [67]:
students_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   student_id   734 non-null    int64 
 1   email        734 non-null    object
 2   name         734 non-null    object
 3   surname      734 non-null    object
 4   gender       734 non-null    object
 5   age          734 non-null    int64 
 6   programme    734 non-null    object
 7   year         734 non-null    object
 8   category_id  734 non-null    object
 9   category     734 non-null    object
dtypes: int64(2), object(8)
memory usage: 57.5+ KB


In [68]:
# Save the dataframe in CSV for ML training purpose
students_df.to_csv("data_ds/students_df.csv")

Convert the "students_df" dataframe into "students" table in PostgreSQL

In [58]:
# update_db("DROP TABLE if exists students cascade")

'Database has been updated'

In [59]:
# Create table
query = """
	CREATE TABLE students(
		student_id SERIAL PRIMARY KEY,
        email VARCHAR(255),
		name VARCHAR(255),
		surname VARCHAR(255),
		gender VARCHAR(10),
		age INTEGER,
		programme VARCHAR(255),
        year VARCHAR(20),
        category_id INTEGER[],
        category TEXT[]
	)
"""
update_db(query)

'Database has been updated'

In [60]:
students_df.columns

Index(['student_id', 'email', 'name', 'surname', 'gender', 'age', 'programme',
       'year', 'category_id', 'category'],
      dtype='object')

In [61]:
def insert_users_db(query, values):
    conn = psycopg2.connect(
        host=host,
        user=user,
        password=password,
        port=port
        )

    cur = conn.cursor()

    cur.execute(query, values)

    conn.commit()
    cur.close()
    conn.close()

    return "Database has been updated"

# Iterate over each row in the DataFrame and insert data into the "students" table
for index, row in students_df.iterrows():
    student_id = row['student_id']
    email = row['email']
    name = row['name']
    surname = row['surname']
    gender = row['gender']
    age = row['age']
    programme = row['programme']
    year = row['year']
    category_id = row['category_id']
    category = row['category']

    # Generate the SQL query with placeholders for parameters
    query = "INSERT INTO students (student_id, email, name, surname, gender, age, programme, year, category_id, category) " \
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

    # Create a tuple of values to be inserted into the query
    values = (student_id, email, name, surname, gender, age, programme, year, category_id, category)

    # Call the update_db function to execute the query with the parameter values
    insert_users_db(query, values)

print("Data insertion into the 'students' table is complete.")

Data insertion into the 'students' table is complete.


In [69]:
check_db("SELECT * FROM students")

Unnamed: 0,student_id,email,name,surname,gender,age,programme,year,category_id,category
0,1,zairaverdugo@edem.es,Zaira,Verdugo,female,19,BBA in Business Administration,4,"[1, 12, 5, 4]","[Marketing, Sustainability, Tech, Business Man..."
1,2,purificaciónllanos@edem.es,Purificación,Llanos,female,18,BSc in Engineering and Management,1,"[5, 12, 2, 11, 15, 9, 14, 7]","[Tech, Sustainability, Finance and Investment,..."
2,3,soledadpalomares@edem.es,Soledad,Palomares,female,18,BBA in Business Administration,1,"[1, 2]","[Marketing, Finance and Investment]"
3,4,jesúsalberola@edem.es,Jesús,Alberola,male,18,BSc in Engineering and Management,4,"[8, 9, 10, 1, 15, 4, 12]","[Crypto, Sport, Economy, Marketing, HR, Busine..."
4,5,vidalbaena@edem.es,Vidal,Baena,male,27,Master Marketing and Digital Sales,1,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]","[Crypto, Employment, Economy, Tech, Marketing,..."
...,...,...,...,...,...,...,...,...,...,...
729,730,guillermoguillén@edem.es,Guillermo,Guillén,male,23,Master Marketing and Digital Sales,1,"[16, 12, 2, 3]","[Employment, Sustainability, Finance and Inves..."
730,731,teodosioisern@edem.es,Teodosio,Isern,male,19,BBA in Business Administration,2,"[15, 9, 2, 16, 10, 14, 4, 8, 11]","[HR, Sport, Finance and Investment, Employment..."
731,732,raúlmercader@edem.es,Raúl,Mercader,male,19,BBA in Business Administration,4,"[2, 15, 6, 9, 1, 8, 11, 13, 10, 5]","[Finance and Investment, HR, Entrepreneurship,..."
732,733,conradooliva@edem.es,Conrado,Oliva,male,23,MBA Junior,1,"[14, 6, 13, 5, 16, 3, 1, 12, 9, 10, 15]","[Music, Entrepreneurship, Design, Tech, Employ..."


In [73]:
# # Filter top 5 students
# check_db("SELECT * FROM students ORDER BY student_id LIMIT 5")