# Pandas

- Pandas is an open-source library that is made mainly for working with relational or labeled data both easily and intuitively. 
- It provides various data structures and operations for manipulating numerical data and time series. 
- This library is built on top of the NumPy library. 
- Pandas is fast and it has high performance & productivity for users.
- Pandas generally provide two data structures for manipulating data, They are: 
   - Series
   - DataFrame

In [1]:
#Importing necessary libraries

In [2]:
import pandas as pd

# Series
- Series is a one-dimensional labeled array and capable of holding data of any type (integer, string, float, python objects, etc.)

In [3]:
#Series creation.
#List method
#Dictionary method

In [4]:
lst = [10,20,'Python','True']
index = ['a','b','c','d']
type(lst)

list

In [5]:
series = pd.Series(lst, index=index)
type(series)
series

a        10
b        20
c    Python
d      True
dtype: object

In [6]:
d = {10:'Python',20:'Machine_learning',30:'Deep_learning'}
type(d)

dict

In [7]:
d = pd.Series(d)
type(d)

pandas.core.series.Series

In [8]:
s = pd.Series(data=[10,20,50,62,52,85], index=[1,2,3,4,5,6],name='Series_1')
s

1    10
2    20
3    50
4    62
5    52
6    85
Name: Series_1, dtype: int64

In [9]:
#Accessing elements from a series using index labels.

In [10]:
s[4]

62

In [11]:
series

a        10
b        20
c    Python
d      True
dtype: object

In [12]:
series['d']

'True'

In [13]:
series[2:]

c    Python
d      True
dtype: object

# DataFrame
- A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns

**Ways to create pandas DataFrame.**
1. By creating a nested lists and colname lists.
2. By creating a dictionary

In [14]:
df = pd.DataFrame({
    'Name' : ['A','B','C'],
    'Marks' : [85,98,96],
    'Subject' : ['Python','Ml','Dl']
})

In [15]:
df

Unnamed: 0,Name,Marks,Subject
0,A,85,Python
1,B,98,Ml
2,C,96,Dl


In [16]:
#Check number of rows and columns in dataframe

In [17]:
df.shape

(3, 3)

In [18]:
#Dimesion of the dataframe

In [19]:
df.ndim

2

In [20]:
#Data type of each column

In [21]:
df.describe()

Unnamed: 0,Marks
count,3.0
mean,93.0
std,7.0
min,85.0
25%,90.5
50%,96.0
75%,97.0
max,98.0


In [22]:
df.describe(include=object)

Unnamed: 0,Name,Subject
count,3,3
unique,3,3
top,A,Python
freq,1,1


In [23]:
#Extracting column from dataframe

In [24]:
df.dtypes

Name       object
Marks       int64
Subject    object
dtype: object

In [25]:
df.describe(include='all')

Unnamed: 0,Name,Marks,Subject
count,3,3.0,3
unique,3,,3
top,A,,Python
freq,1,,1
mean,,93.0,
std,,7.0,
min,,85.0,
25%,,90.5,
50%,,96.0,
75%,,97.0,


In [26]:
#Getting information about the dataframe.

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     3 non-null      object
 1   Marks    3 non-null      int64 
 2   Subject  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [28]:
#Checking null values

In [29]:
df.isna().sum()

Name       0
Marks      0
Subject    0
dtype: int64

# Importing External Dataset 

In [30]:
#Load the csv file

df = pd.read_csv("Salaries.csv")

In [31]:
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [32]:
#Reading first 5 records

In [33]:
df.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800


In [34]:
#Reading last 5 records

In [35]:
df.tail()

Unnamed: 0,rank,discipline,phd,service,sex,salary
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954
77,Prof,A,23,15,Female,109646


In [36]:
df.head(15)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
5,Prof,A,20,20,Male,122400
6,AssocProf,A,20,17,Male,81285
7,Prof,A,18,18,Male,126300
8,Prof,A,29,19,Male,94350
9,Prof,A,51,51,Male,57800


In [37]:
#Reading all the records.

In [38]:
# pd.set_option('display.max_rows',None)

In [39]:
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [40]:
#Describe the data

In [41]:
df.describe()

Unnamed: 0,phd,service,salary
count,78.0,78.0,78.0
mean,19.705128,15.051282,108023.782051
std,12.498425,12.139768,28293.661022
min,1.0,0.0,57800.0
25%,10.25,5.25,88612.5
50%,18.5,14.5,104671.0
75%,27.75,20.75,126774.75
max,56.0,51.0,186960.0


In [42]:
df.describe(include=object)

Unnamed: 0,rank,discipline,sex
count,78,78,78
unique,3,2,2
top,Prof,B,Male
freq,46,42,39


In [43]:
#Extracting only column names

In [44]:
df.rank

<bound method NDFrame.rank of          rank discipline  phd  service     sex  salary
0        Prof          B   56       49    Male  186960
1        Prof          A   12        6    Male   93000
2        Prof          A   23       20    Male  110515
3        Prof          A   40       31    Male  131205
4        Prof          B   20       18    Male  104800
..        ...        ...  ...      ...     ...     ...
73       Prof          B   18       10  Female  105450
74  AssocProf          B   19        6  Female  104542
75       Prof          B   17       17  Female  124312
76       Prof          A   28       14  Female  109954
77       Prof          A   23       15  Female  109646

[78 rows x 6 columns]>

In [45]:
df['rank']

0          Prof
1          Prof
2          Prof
3          Prof
4          Prof
        ...    
73         Prof
74    AssocProf
75         Prof
76         Prof
77         Prof
Name: rank, Length: 78, dtype: object

In [46]:
df[['rank']]

Unnamed: 0,rank
0,Prof
1,Prof
2,Prof
3,Prof
4,Prof
...,...
73,Prof
74,AssocProf
75,Prof
76,Prof


In [47]:
#Selecting single and multiple columns

In [52]:
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [53]:
df[['rank','phd']]

Unnamed: 0,rank,phd
0,Prof,56
1,Prof,12
2,Prof,23
3,Prof,40
4,Prof,20
...,...,...
73,Prof,18
74,AssocProf,19
75,Prof,17
76,Prof,28


In [48]:
#Selecting columns with loc and iloc

In [54]:
df.iloc[50:56,1:4]

Unnamed: 0,discipline,phd,service
50,B,10,5
51,B,20,14
52,A,12,0
53,A,5,3
54,A,25,22
55,A,2,0


In [61]:
df.iloc[[50,55],1:4]

Unnamed: 0,discipline,phd,service
50,B,10,5
55,A,2,0


In [49]:
#loc function

In [56]:
df.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800


In [58]:
df.loc[10:15,'rank':'discipline']

Unnamed: 0,rank,discipline
10,Prof,B
11,Prof,B
12,AsstProf,B
13,Prof,B
14,Prof,B
15,Prof,B


In [59]:
df.loc[:,'rank':'phd']

Unnamed: 0,rank,discipline,phd
0,Prof,B,56
1,Prof,A,12
2,Prof,A,23
3,Prof,A,40
4,Prof,B,20
...,...,...,...
73,Prof,B,18
74,AssocProf,B,19
75,Prof,B,17
76,Prof,A,28


In [50]:
#Value counts

In [62]:
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [63]:
df['rank'].unique()

array(['Prof', 'AssocProf', 'AsstProf'], dtype=object)

In [64]:
df['rank'].nunique()

3

In [65]:
df['rank'].value_counts()

Prof         46
AsstProf     19
AssocProf    13
Name: rank, dtype: int64

In [66]:
df['discipline'].value_counts()

B    42
A    36
Name: discipline, dtype: int64

In [67]:
df['salary'].value_counts()

92000     3
77500     2
72500     2
91000     2
62884     1
         ..
99418     1
91100     1
85000     1
73000     1
109646    1
Name: salary, Length: 73, dtype: int64

In [51]:
#Renaming column names

In [70]:
df.rename(columns={'rank':'Rank'}, inplace=True)

In [71]:
df

Unnamed: 0,Rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [72]:
df

Unnamed: 0,Rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [73]:
df = pd.DataFrame({
    'Name' : ['A','B','C'],
    'Machine Learning' : [10,20,30],
    'Python' : [74,78,74]
})
df

Unnamed: 0,Name,Machine Learning,Python
0,A,10,74
1,B,20,78
2,C,30,74


In [74]:
df.Name

0    A
1    B
2    C
Name: Name, dtype: object

In [75]:
df.Machine Learning

SyntaxError: invalid syntax (1100684828.py, line 1)

In [76]:
df['Machine Learning']

0    10
1    20
2    30
Name: Machine Learning, dtype: int64