# Pandas – Part 1: Data Loading & Initial Exploration

Objective:
- Load datasets
- Inspect schema
- Validate data types
- Identify initial analytical questions


In [1]:
#Imports
import pandas as pd


In [2]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


In [None]:
# Read the three source CSV files (paths are relative to the notebook's
# working directory) into one DataFrame each.
employees, departments, sales = (
    pd.read_csv("../data/employees.csv"),
    pd.read_csv("../data/departments.csv"),
    pd.read_csv("../data/sales.csv"),
)


In [5]:
# Peek at the first five employee rows to sanity-check the load.
employees.head(n=5)


Unnamed: 0,emp_id,name,department,salary,joining_date
0,1,Amit,Engineering,80000,2021-06-15
1,2,Neha,Engineering,75000,2022-03-10
2,3,Ravi,HR,50000,2020-01-20
3,4,Pooja,HR,55000,2021-11-05
4,5,Karan,Sales,60000,2023-02-01


In [6]:
# Peek at the first five sales rows to sanity-check the load.
sales.head(n=5)

Unnamed: 0,sale_id,emp_id,amount,sale_date,region
0,1,1,12000,2023-01-10,North
1,2,2,15000,2023-01-12,East
2,3,5,8000,2023-02-01,West
3,4,1,7000,2023-02-05,North
4,5,3,4000,2023-02-10,South


Observations:
- Columns appear correctly loaded
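- Date columns look like ISO-formatted strings; they are probably stored as `object` dtype (to be confirmed with `.info()` below)
- Department column is categorical in nature (a small set of repeated labels)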

In [10]:
# Dimensions of each frame as (rows, columns), one frame per line.
print(employees.shape, sales.shape, sep="\n")


(20, 5)
(20, 5)


In [11]:
# Schema & Data Types
# info() prints the index range, each column's non-null count and dtype,
# and the frame's memory usage.
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   emp_id        20 non-null     int64 
 1   name          20 non-null     object
 2   department    20 non-null     object
 3   salary        20 non-null     int64 
 4   joining_date  20 non-null     object
dtypes: int64(2), object(3)
memory usage: 928.0+ bytes


In [12]:
# Same schema summary for the sales frame: non-null count and dtype per column.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sale_id    20 non-null     int64 
 1   emp_id     20 non-null     int64 
 2   amount     20 non-null     int64 
 3   sale_date  20 non-null     object
 4   region     20 non-null     object
dtypes: int64(3), object(2)
memory usage: 928.0+ bytes


Observations:
- joining_date is object (needs conversion)
- sale_date is object (needs conversion)
- No missing values observed


In [14]:
# Parse the string date columns into pandas datetimes, overwriting each
# column in place on its existing frame.
employees["joining_date"] = employees["joining_date"].pipe(pd.to_datetime)
sales["sale_date"] = sales["sale_date"].pipe(pd.to_datetime)


In [15]:
# Re-check the schema after the date conversion.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
employees.info()
sales.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   emp_id        20 non-null     int64         
 1   name          20 non-null     object        
 2   department    20 non-null     object        
 3   salary        20 non-null     int64         
 4   joining_date  20 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 928.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   sale_id    20 non-null     int64         
 1   emp_id     20 non-null     int64         
 2   amount     20 non-null     int64         
 3   sale_date  20 non-null     datetime64[ns]
 4   region     20 non-null     object        
dtypes: datetime64[ns](1), int64(3), o

Dates successfully converted to datetime64.


In [16]:
# Descriptive Statistics
# describe() with default arguments reports count, mean, std, min, quartiles
# and max. The recorded output covers emp_id and salary; emp_id looks like an
# identifier (1-20), so salary is presumably the column of real interest.
employees.describe()

Unnamed: 0,emp_id,salary
count,20.0,20.0
mean,10.5,67850.0
std,5.91608,13168.043455
min,1.0,50000.0
25%,5.75,57250.0
50%,10.5,66000.0
75%,15.25,78500.0
max,20.0,90000.0


In [18]:
# Descriptive statistics for the sales frame. sale_id and emp_id appear to be
# identifier columns, so `amount` is presumably the column of real interest.
sales.describe()

Unnamed: 0,sale_id,emp_id,amount
count,20.0,20.0,20.0
mean,10.5,8.35,12825.0
std,5.91608,6.158221,4950.478446
min,1.0,1.0,4000.0
25%,5.75,2.75,8875.0
50%,10.5,7.0,12500.0
75%,15.25,12.25,16375.0
max,20.0,20.0,22000.0


In [21]:
# Categorical inspection: list the distinct labels found in each
# low-cardinality text column, one array per line.
print(employees["department"].unique(), sales["region"].unique(), sep="\n")

['Engineering' 'HR' 'Sales' 'Finance']
['North' 'East' 'West' 'South']
