### pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series.

### 1. Series

##### A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.

In [1]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd

In [3]:
# pandas (imported above) supports DataFrame and Series; numpy supplies the sample data used to build them
import numpy as np

In [None]:
pd.Series()

In [9]:
s = pd.Series(np.random.random(10)) # creating series object

In [10]:
s

0    0.560658
1    0.059011
2    0.996500
3    0.709677
4    0.094532
5    0.854995
6    0.344631
7    0.967426
8    0.492310
9    0.667409
dtype: float64

In [11]:
s.shape

(10,)

In [14]:
# Passing a 2-D array (5 rows x 3 columns) to pd.Series raises ValueError,
# because a Series holds one-dimensional data only.
x1 =pd.Series(np.random.rand(5, 3)) 
x1

ValueError: ignored

In [13]:
x1.shape

(5, 3)

In [None]:
s.shape

(10,)

In [None]:
b = [1,2,3,4,5]

In [None]:
a = pd.Series(b)
a

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [None]:
a[1]

2

In [None]:
a[4]

5

In [None]:
s

0    0.784136
1    0.104314
2    0.995944
3    0.323785
4    0.671742
dtype: float64

In [None]:
s[0]   # indexing

0.851484584099524

In [None]:
c = pd.Series(np.random.random(5))
c

0    0.033636
1    0.783574
2    0.841589
3    0.118782
4    0.135174
dtype: float64

In [None]:
c[0:3]  # slicing

0    0.033636
1    0.783574
2    0.841589
dtype: float64

In [None]:
c[0] = 600 # update the value

In [None]:
c

0    600.000000
1      0.783574
2      0.841589
3      0.118782
4      0.135174
dtype: float64

In [None]:
c.index = ["x","y","z","a","b"] # change the index names

In [None]:
c

x    600.000000
y      0.783574
z      0.841589
a      0.118782
b      0.135174
dtype: float64

In [None]:
c['x']

600.0

In [None]:
c[0]

600.0

In [None]:
# creating a series object with index names
s = pd.Series(np.random.random(4),index = ["a","b","c","d"])

In [None]:
s

a    0.039665
b    0.457980
c    0.352058
d    0.993504
dtype: float64

In [None]:
c1 = [1,2,5,7,8,9,10,50,60]

In [None]:
c2 = pd.Series(c1)

In [None]:
# argmax returns the integer position of the largest value
c2.argmax()

8

In [None]:
# argsort returns the integer positions that would sort the values, not the sorted values themselves
c2.argsort()

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int64

In [None]:
# cumulative summation of the data
c2.cumsum()

0      1
1      3
2      8
3     15
4     23
5     32
6     42
7     92
8    152
dtype: int64

In [None]:
# checking the mean
c2.mean()

16.88888888888889

In [None]:
# checking the median
c2.median()

8.0

In [None]:
# checking the max value
c2.max()

60

In [None]:
# checking the minimum value
c2.min()

1

In [None]:
# checking the descriptive stats of the data
s.describe()

count    4.000000
mean     0.460802
std      0.397057
min      0.039665
25%      0.273960
50%      0.405019
75%      0.591861
max      0.993504
dtype: float64

## Dataframe

In [15]:
s1 = pd.Series(np.random.random(5))

In [16]:
s2 = pd.Series(np.random.random(5))

In [17]:
s1,s2

(0    0.673029
 1    0.320586
 2    0.684241
 3    0.253314
 4    0.818490
 dtype: float64, 0    0.491424
 1    0.251627
 2    0.681156
 3    0.100073
 4    0.522883
 dtype: float64)

In [18]:
# creating a dataframe using series object
a= pd.DataFrame([s1,s2]) 

In [19]:
a

Unnamed: 0,0,1,2,3,4
0,0.673029,0.320586,0.684241,0.253314,0.81849
1,0.491424,0.251627,0.681156,0.100073,0.522883


In [20]:
# create a dataframe with the help of dictionary with age and salary
d = {"age":[22,23,24],"salary":[1000,1020,1040]}

In [21]:
d

{'age': [22, 23, 24], 'salary': [1000, 1020, 1040]}

In [22]:
type(d)

dict

In [23]:
# converting the dict to the dataframe
data = pd.DataFrame(d)

In [24]:
data

Unnamed: 0,age,salary
0,22,1000
1,23,1020
2,24,1040


In [25]:
type(data)

pandas.core.frame.DataFrame

In [26]:
# creating data frame using list
a = [1,2,3,4]
b = [10,11,12,13]

In [27]:
pd.DataFrame([a,b])

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,10,11,12,13


In [28]:
# giving the column names to the data
pd.DataFrame([a,b],columns=['col1','col2','col3','col4'])

Unnamed: 0,col1,col2,col3,col4
0,1,2,3,4
1,10,11,12,13


In [None]:
data

Unnamed: 0,age,salary
0,22,1000
1,23,1020
2,24,1040


In [29]:
# adding column to the data frame - gender
data['gender'] = ["male","female","other"]

In [30]:
data

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other


### iloc and loc indexers

In [32]:
# iloc selects by integer position; loc selects by index label
data.iloc[0]

age         22
salary    1000
gender    male
Name: 0, dtype: object

In [None]:
data.loc[2]

age         24
salary    1040
gender    male
Name: 2, dtype: object

In [None]:
data.iloc[:,2] # extract column information

0      male
1    female
2      male
Name: gender, dtype: object

In [None]:
data.iloc[3] # raises an error: position 3 is past the last row (valid positions here are 0-2)

In [None]:
data['salary'] # retrieving the values using the column name

0    1000
1    1020
2    1040
Name: salary, dtype: int64

In [None]:
data.salary

0    1000
1    1020
2    1040
Name: salary, dtype: int64

In [35]:
# iloc cannot create a new row: it only addresses existing positions, so this raises IndexError
data.iloc[4] =[25,334,"male"]

IndexError: ignored

In [33]:
data.loc[3] =[28,335,"male"] # loc adds a new row when assigning to an index label that does not exist yet

In [34]:
data

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,28,335,male


In [37]:
data.iloc[3,2] # extract the specified value using iloc

'male'

In [None]:
data.shape # displays the rows and columns in the dataset

(5, 3)

In [38]:
data.columns # gives the column names

Index(['age', 'salary', 'gender'], dtype='object')

In [39]:
data.values # gives values of the dataset

array([[22, 1000, 'male'],
       [23, 1020, 'female'],
       [24, 1040, 'other'],
       [28, 335, 'male']], dtype=object)

In [40]:
data.dtypes # gives the data types

age        int64
salary     int64
gender    object
dtype: object

In [41]:
# combine two datasets
pd.concat([data,data])

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,28,335,male
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,28,335,male


In [45]:
#drop column from the dataset
d2 = data.drop(columns=['salary'])

In [46]:
d2

Unnamed: 0,age,gender
0,22,male
1,23,female
2,24,other
3,28,male


In [None]:
data

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,22,333,male
4,28,335,male


In [None]:
#drop column from the dataset
data.drop(columns=['gender'],axis=1,inplace=True)
data

Unnamed: 0,age,salary
0,22,1000
1,23,1020
2,24,1040
3,22,333


In [None]:
data.loc[4] = [22,1000] # creating duplicate values

ValueError: ignored

In [None]:
data

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,22,333,male
4,28,335,male


In [47]:
data.loc[5] = [28, 335 , 'male']

In [48]:
data

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,28,335,male
5,28,335,male


In [49]:
data.drop_duplicates(inplace=True)

In [50]:
data

Unnamed: 0,age,salary,gender
0,22,1000,male
1,23,1020,female
2,24,1040,other
3,28,335,male


In [None]:
import numpy as np

In [51]:
# creating missing values
data['Experience'] = [1,np.nan,3,4]

In [52]:
data

Unnamed: 0,age,salary,gender,Experience
0,22,1000,male,1.0
1,23,1020,female,
2,24,1040,other,3.0
3,28,335,male,4.0


In [53]:
# fill the missing values
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place call operates on an intermediate object
# (chained assignment), which recent pandas warns about and which does not
# update `data` when Copy-on-Write is enabled.
data['Experience'] = data['Experience'].fillna(100)

In [54]:
data

Unnamed: 0,age,salary,gender,Experience
0,22,1000,male,1.0
1,23,1020,female,100.0
2,24,1040,other,3.0
3,28,335,male,4.0


In [55]:
# replacing the values in the dataset
# Assign the result back rather than using inplace=True on the selected
# column: an in-place call on `data["Experience"]` is chained assignment and
# is not guaranteed to modify `data` itself (it does not under Copy-on-Write).
data["Experience"] = data["Experience"].replace({100.0: 2.0})

In [56]:
data

Unnamed: 0,age,salary,gender,Experience
0,22,1000,male,1.0
1,23,1020,female,2.0
2,24,1040,other,3.0
3,28,335,male,4.0


In [57]:
# checking for missing values
data.isnull() 

Unnamed: 0,age,salary,gender,Experience
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
