# Step 1: Explore and Collect the Dataset (Week 1)

Download and understand the job market dataset and prepare it for further processing.

Tasks:
Download Dataset:

Download the dataset from Kaggle.
Explore the CSV file to understand its structure: job title, company, location, job description, post date, etc.
Understand the Data:

Examine key features such as:
Job Title: What roles are being posted?
Company: Which companies are posting the most jobs?
Location: Where are most jobs located?
Job Function: What are the most common job functions (e.g., IT, HR, Marketing)?
Employment Type: Full-time, part-time, contract, etc.
Skills Learned:
Exploratory Data Analysis (EDA): Learn how to inspect a dataset's structure, column types, and value distributions before transforming or loading it.
Technologies:
Pandas (for data exploration in Python).

In [4]:
import pandas as pd 

# Number of rows pulled from the CSV per read; the file is consumed piecewise
# instead of through a single read_csv call.
chunk_size = 10000

# With chunksize set, read_csv returns an iterator of DataFrames; materialise
# every piece in file order.
chunks = list(pd.read_csv('data/postings.csv', chunksize=chunk_size))

# Stitch the pieces back together under one fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)

# Exploration snippets kept for reference (uncomment to run):
# df.head()
# df.info()
# df.describe()
# df[['views','job_id']].head(10)
# df.head(5)
# print(df.iloc[0])
# print(df.loc[0])
# df[(df['min_salary'] > 35) & (df['pay_period'] == 'HOURLY')]
# df.groupby('pay_period').sum()

# Peek at the first row.
df.head(1)

# Largest title by string comparison (not the most frequent one).
df['title'].max()

'🚀Principal Geotechnical Engineer (ESOP)(Bonus)(Fully Remote)🚀'

In [7]:
import pandas as pd
from sqlalchemy import create_engine,  Column, Integer, String, DateTime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import  declarative_base
import os
from dotenv import load_dotenv


# Define the base class for SQLAlchemy models; every model subclassing it is
# registered on Base.metadata, which create_table() uses to emit the tables.
Base = declarative_base()

# Define your table as a model
# NOTE(review): the columns below (name, age) look like leftovers from a
# generic "user" template and do not match the job-posting fields explored
# earlier in this notebook (e.g. 'title', 'views', 'job_id', 'min_salary',
# 'pay_period') -- confirm the intended schema before loading data.
class Posting(Base):
    """ORM model mapped to the ``postings`` table."""
    __tablename__ = 'postings'  # Table name in the database
    id = Column(Integer, primary_key=True)  # Primary key
    name = Column(String, nullable=False)     # Required text column (NOT NULL)
    age = Column(Integer)                      # Optional integer column
    created_at = Column(DateTime)             # Optional timestamp; no default is set here

def create_table(engine):
    """Emit CREATE TABLE for every model registered on ``Base``.

    Args:
        engine: SQLAlchemy engine bound to the target database.
    """
    # create_all checks for each table first, so re-running this is harmless.
    metadata = Base.metadata
    metadata.create_all(engine)
    print("Table created successfully or already exists.")

def refresh_env_variables():
    """Reload environment variables from the .env file.

    The database settings are removed from the process environment first and
    then every value found in the .env file is (re)applied, so edits made to
    .env while the notebook kernel is running take effect.

    Prints a confirmation line; returns nothing.
    """
    # Drop the connection settings up front so that a key which has been
    # deleted from .env does not survive with its stale value.
    for key in ('DB_USERNAME', 'DB_PASSWORD', 'DB_NAME', 'DB_HOST', 'DB_PORT'):
        os.environ.pop(key, None)

    # load_dotenv() leaves already-set variables untouched by default, which
    # would make this "refresh" a no-op for everything except the keys popped
    # above. override=True lets the values in .env win.
    load_dotenv(override=True)

    print("Environment variables refreshed.")
    
def setup_database_connection():
    """Create a SQLAlchemy engine for PostgreSQL from ``DB_*`` environment variables.

    Reads DB_USERNAME, DB_PASSWORD and DB_NAME (required) plus DB_HOST and
    DB_PORT (defaulting to 'localhost' and '5432'), builds the connection URL
    and opens one test connection.

    Returns:
        The engine when the test connection succeeds, otherwise ``None``.

    Raises:
        ValueError: if DB_USERNAME, DB_PASSWORD or DB_NAME is missing or empty.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    # Retrieve PostgreSQL connection parameters from environment variables
    username = os.getenv('DB_USERNAME')
    password = os.getenv('DB_PASSWORD')
    database = os.getenv('DB_NAME')
    host = os.getenv('DB_HOST', 'localhost')  # Default to 'localhost' if not set
    port = os.getenv('DB_PORT', '5432')        # Default to '5432' if not set

    # Check if required environment variables are set
    if not (username and password and database):
        raise ValueError("Database connection parameters are not set in the environment variables.")

    # Percent-encode the credentials: characters such as '@', ':' or '/' in a
    # user name or password would otherwise be parsed as URL delimiters.
    safe_user = quote(username, safe='')
    safe_password = quote(password, safe='')
    location = f'{host}:{port}/{database}'
    connection_string = f'postgresql://{safe_user}:{safe_password}@{location}'

    # Show where we are connecting, but never echo the password to the output.
    print(f'postgresql://{safe_user}:***@{location}')

    # Create a SQLAlchemy engine
    engine = create_engine(connection_string)

    # Check the database connection
    try:
        with engine.connect():
            print("Database connection successful!")
    except SQLAlchemyError as e:
        print(f"Database connection failed: {e}")
        # Release the pool of the unusable engine before giving up.
        engine.dispose()
        return None  # Return None if the connection fails

    return engine  # Return the engine if the connection is successful





def transfer_data_to_postgres(engine, csv_file_path='path/to/your/file.csv',
                              table_name='your_table_name', chunksize=None):
    """Append the rows of a CSV file to a database table.

    Args:
        engine: Connectable handed to ``DataFrame.to_sql``.
        csv_file_path: CSV file to load. The default is the original
            placeholder, kept only for backward compatibility; pass the real
            path.
        table_name: Destination table. The default is likewise the original
            placeholder.
        chunksize: Rows written per batch by ``to_sql``; ``None`` (the
            default) writes all rows at once, as before.

    A failed write is reported on stdout rather than raised. Errors while
    reading the CSV (e.g. a missing file) propagate to the caller.
    """
    # Read the CSV file into a pandas DataFrame
    frame = pd.read_csv(csv_file_path)

    # Transfer the data to the database; existing rows are kept (append mode)
    # and the DataFrame index is not written as a column.
    try:
        frame.to_sql(table_name, engine, if_exists='append', index=False,
                     chunksize=chunksize)
    except SQLAlchemyError as e:
        print(f"Data transfer failed: {e}")
    else:
        print("Data transferred successfully!")

def read_data_from_table(engine, table_name):
    """Read all rows of ``table_name`` into a DataFrame, print it and return it.

    Args:
        engine: SQLAlchemy connectable passed to ``pd.read_sql_table``.
        table_name: Name of the table to read.

    Returns:
        The DataFrame on success, or ``None`` when the read failed (the
        failure is reported on stdout instead of being raised).
    """
    try:
        # Read data from the specified table into a pandas DataFrame
        df = pd.read_sql_table(table_name, con=engine)
    # pandas signals a missing table with ValueError, not a SQLAlchemy error,
    # so both have to be caught for the failure message to be printed.
    except (SQLAlchemyError, ValueError) as e:
        print(f"Failed to read data from table '{table_name}': {e}")
        return None

    print(f"Data retrieved successfully from table '{table_name}':")
    print(df)
    return df
        
        
def main():
    """Connect to the database, make sure the tables exist, then dump ``postings``."""
    engine = setup_database_connection()
    if not engine:
        # No usable connection was returned; nothing else to do.
        return

    create_table(engine)
    read_data_from_table(engine, 'postings')

# Entry point when this cell/script is executed directly.
if __name__ == "__main__":
    # Reload .env first so main() reads the current DB_* connection settings.
    refresh_env_variables()
    main()


Environment variables refreshed.
postgresql://postgres:root@localhost:5432/data_engineering_job_postings
Connected to database: data_engineering_job_postings
Database connection successful!
Table created successfully or already exists.
Data retrieved successfully from table 'postings':
Empty DataFrame
Columns: [id, name, age, created_at]
Index: []
