In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
print(np.__version__)
print(pd.__version__)

1.26.4
2.2.2


# Pandas
## Table of Contents:
- Series
- DataFrame
- Create a Series from Dictionary Elements
- Create a DataFrame from a Dictionary of Lists
- Create a DataFrame from a List of Dictionaries
- Reading data from .csv (Excel) file
- Accessing Data from DataFrame
- Accessing a specific element
- Data manipulation with DataFrames
- Statistical Methods

## Series
- A Pandas Series is a one-dimensional, labeled array capable of holding data of any type (integers, floats, strings, etc.).
- One-dimensional labeled array.
- Can hold data of any type (integers, floats, strings, etc.).
- Consists of two main components: values and index.
- Each element is associated with a unique index label.
- Supports fast and efficient element access and slicing.
- More flexible than NumPy arrays due to labeled indexing.
- Often used to represent a single column in a DataFrame.
- Supports vectorized operations and alignment by index.
- Automatically handles missing data (e.g., NaN values).

## DataFrame
- Two-dimensional, size-mutable, and heterogeneous data structure.
- Consists of rows and columns, similar to a table or spreadsheet.
- Each column is a Pandas Series with its own data type and label.
- Labeled axes: row index (axis=0) and column labels (axis=1).
- Can hold different data types in different columns (e.g., int, float, string).
- Supports rich data manipulation: filtering, sorting, grouping, aggregation, etc.
- Offers built-in handling for missing data using NaN.
- Allows easy data import/export from formats like CSV, Excel, SQL, JSON, etc.
- Enables vectorized operations and broadcasting for high performance.
- Ideal for structured data analysis, cleaning, and transformation.

## Pandas Series

### Creating a Pandas Series 

In [5]:
data = [1,2,3,4,5]
series = pd.Series(data)
print("Series:\n",series) # Gives index along with the element
print(type(series))

Series:
 0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


### Create a Series from Dictionary Elements

In [63]:
data = {'a':1 , 'b':2, 'c':3}
series_dict = pd.Series(data)
print(series_dict)

a    1
b    2
c    3
dtype: int64


In [7]:
data = [10, 20, 30]
index = ['a','b','c']
pd.Series(data, index = index)

a    10
b    20
c    30
dtype: int64

## DataFrame

### Create a DataFrame from a dictionary of Lists

In [9]:
data = {
    'Name':['Krish','Shubh','Jack'],
    'Age':[42,23,25],
    'City':['Bangalore','Singapore','NYC']
}
df = pd.DataFrame(data)
print(df)
print(type(df))

    Name  Age       City
0  Krish   42  Bangalore
1  Shubh   23  Singapore
2   Jack   25        NYC
<class 'pandas.core.frame.DataFrame'>


### Convert the DataFrame into a NumPy array

In [10]:
np.array(df)

array([['Krish', 42, 'Bangalore'],
       ['Shubh', 23, 'Singapore'],
       ['Jack', 25, 'NYC']], dtype=object)

### Create a DataFrame from a List of Dictionaries

In [12]:
data = [
    {'Name':'Shubh','Age':23,'City':'Singapore'},
    {'Name':'Krish','Age':24,'City':'Singa'},
    {'Name':'Jack','Age':25,'City':'Sing'},
    {'Name':'John','Age':26,'City':'Sin'}
]
df = pd.DataFrame(data)
print(df)
print(type(df))

    Name  Age       City
0  Shubh   23  Singapore
1  Krish   24      Singa
2   Jack   25       Sing
3   John   26        Sin
<class 'pandas.core.frame.DataFrame'>


### Reading Data from a .csv (Excel) file
- The csv file is in the same folder as this (.ipynb file) Jupyter Notebook
- .head(x) = gets the first 'x' rows
- .tail(x) = gets the last 'x' rows

In [13]:
df = pd.read_csv('data.csv')
df.head(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [14]:
df.tail(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


### Accessing Data from DataFrame

In [15]:
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North
5,2023-01-06,B,54.0,Product3,192.0,West
6,2023-01-07,A,16.0,Product1,936.0,East
7,2023-01-08,C,89.0,Product1,488.0,West
8,2023-01-09,C,37.0,Product3,772.0,West
9,2023-01-10,A,22.0,Product2,834.0,West


In [18]:
data = [
    {'Name':'Shubh','Age':23,'City':'Singapore'},
    {'Name':'Krish','Age':24,'City':'Bangalore'},
    {'Name':'Jack','Age':25,'City':'NYC'},
    {'Name':'John','Age':26,'City':'London'}
]
df = pd.DataFrame(data)
print(df)

    Name  Age       City
0  Shubh   23  Singapore
1  Krish   24  Bangalore
2   Jack   25        NYC
3   John   26     London


In [19]:
df

Unnamed: 0,Name,Age,City
0,Shubh,23,Singapore
1,Krish,24,Bangalore
2,Jack,25,NYC
3,John,26,London


### Fetches the entire column from the DataFrame
- Picking only 1 column from the DataFrame makes it a Series

In [22]:
df['Name']

0    Shubh
1    Krish
2     Jack
3     John
Name: Name, dtype: object

In [24]:
type(df['Name']) # Picking 1 column makes it a series

pandas.core.series.Series

### DataFrame Operations
- loc[x] : Row Index Operation
- iloc[x] : Column Index Operation
- loc[x][y] : Row and Column index to access element
- iloc[x][y] : # Row and Column index to access element

In [25]:
df.loc[0] # Row Index Operation

Name        Shubh
Age            23
City    Singapore
Name: 0, dtype: object

In [26]:
df.iloc[0] # Column Index Operation

Name        Shubh
Age            23
City    Singapore
Name: 0, dtype: object

In [27]:
df.loc[0][0] # Row and Column index to access element

  df.loc[0][0]


'Shubh'

In [29]:
df.iloc[0][1] # Row and Column index to access element

  df.iloc[0][1] # Row and Column index to access element


23

In [31]:
df.iloc[0][2] # Row and Column index to access element

  df.iloc[0][2] # Row and Column index to access element


'Singapore'

### Accessing a specific element 
- using .at[ ]
- .at[Column-Index , Column-Name]

In [32]:
df

Unnamed: 0,Name,Age,City
0,Shubh,23,Singapore
1,Krish,24,Bangalore
2,Jack,25,NYC
3,John,26,London


In [33]:
df.at[1,'Age']

24

In [34]:
df.at[0,'Age']

23

In [35]:
df.at[0,'City']

'Singapore'

In [36]:
df.at[0,'Name']

'Shubh'

### Accessing a specific element 
- using .iat()
- User can give Row and Column Index

In [39]:
df

Unnamed: 0,Name,Age,City
0,Shubh,23,Singapore
1,Krish,24,Bangalore
2,Jack,25,NYC
3,John,26,London


In [38]:
df.iat[0,0]

'Shubh'

In [41]:
df.iat[0,2]

'Singapore'

In [42]:
df.iat[2,2]

'NYC'

### Data Manipulation with DataFrames
- Adding a Column
- Removing a Column
- By default, axis = 0, only checks for Rows, not Columns
- To get access to columns, axis = 1
- To make changes permanent (save state), inplace = True
- df[Column-Name] = df[Column-Name] + 1

In [43]:
df['Salary'] = [150000, 125000, 100000, 75000]
df

Unnamed: 0,Name,Age,City,Salary
0,Shubh,23,Singapore,150000
1,Krish,24,Bangalore,125000
2,Jack,25,NYC,100000
3,John,26,London,75000


In [44]:
# By default, axis = 0, only checks for Rows, not Columns
# This will throw an error if executed

df.drop('Salary', axis = 0)

KeyError: "['Salary'] not found in axis"

In [58]:
# To get access to columns, axis = 1

df.drop('City',axis = 1)

KeyError: "['City'] not found in axis"

In [46]:
# But it is not a permanent operation, the state of the DataFrame does not change

df

Unnamed: 0,Name,Age,City,Salary
0,Shubh,23,Singapore,150000
1,Krish,24,Bangalore,125000
2,Jack,25,NYC,100000
3,John,26,London,75000


In [47]:
# To save the changes, use inplace = True

df.drop('City',axis = 1, inplace = True)

In [48]:
df

Unnamed: 0,Name,Age,Salary
0,Shubh,23,150000
1,Krish,24,125000
2,Jack,25,100000
3,John,26,75000


In [49]:
df['Age'] = df['Age'] + 1
df

Unnamed: 0,Name,Age,Salary
0,Shubh,24,150000
1,Krish,25,125000
2,Jack,26,100000
3,John,27,75000


In [55]:
# Removes Row with index = 1
# Not a permanent operation

df.drop(1) 

KeyError: '[1] not found in axis'

In [56]:
# Removes Row with index = 1
# Permanent operation

df.drop(1, inplace = True)

KeyError: '[1] not found in axis'

In [57]:
df

Unnamed: 0,Name,Age,Salary
0,Shubh,24,150000
2,Jack,26,100000
3,John,27,75000


In [59]:
df = pd.read_csv('data.csv')
df.head(3)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East


## Statistical Methods
- Data types of Elements in each column of csv file
- Statistical Summary

In [60]:
# Display the datatypes of each column
print("Data types:\n", df.dtypes)

Data types:
 Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object


In [61]:
# Describe the DataFrame
print("Statistical Summary:\n", df.describe())

Statistical Summary:
            Value       Sales
count  47.000000   46.000000
mean   51.744681  557.130435
std    29.050532  274.598584
min     2.000000  108.000000
25%    27.500000  339.000000
50%    54.000000  591.500000
75%    70.000000  767.500000
max    99.000000  992.000000


In [62]:
# 25,50,75 percentiles
# Mean, Standard Deviation, Min, Max
# Count - Number of records

df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0
