## Pandas: DataFrame and Series

Pandas is a powerful data manipulation library for Python that is used for data analysis and data cleaning.
It provides two primary data structures: Series and DataFrame.

In [2]:
import pandas as pd

### Series


In [20]:
## A Series is a one-dimensional, array-like object that can hold any data type.
## It is comparable to a single column of a table.

data = list(range(1, 6))
series = pd.Series(data)
## Show the Series itself, then its arithmetic mean on the following line.
print(series, series.mean(), sep="\n")

0    1
1    2
2    3
3    4
4    5
dtype: int64
3.0


In [4]:
## A Series can also be built from a dictionary: the keys become the index labels.
data = dict(a="hello", b=2)
series = pd.Series(data)
print(series)

a    hello
b        2
dtype: object


In [19]:
## Custom index labels can be supplied in place of the default integer positions.

data = list(range(1, 5))
index = list("efgh")

s = pd.Series(index=index, data=data)
print(s)

e    1
f    2
g    3
h    4
dtype: int64


### DataFrames

In [6]:
## A DataFrame is a two-dimensional, size-mutable and potentially heterogeneous tabular data structure.
## Here one is created from a list of dictionaries (one dictionary per row).

data = [
    dict(name=name, age=21, dept=dept)
    for name, dept in [
        ("satwik", "IT"),
        ("modak", "ETCE"),
        ("diptarshi", "ETCE"),
        ("priyam", "bekar"),
    ]
]

df = pd.DataFrame(data)
## Show the frame, then its Python type on the following line.
print(df, type(df), sep="\n")

        name  age   dept
0     satwik   21     IT
1      modak   21   ETCE
2  diptarshi   21   ETCE
3     priyam   21  bekar
<class 'pandas.core.frame.DataFrame'>


In [7]:
## Load a CSV file into a DataFrame.

df = pd.read_csv("data.csv")
df.head()  ## head() defaults to n=5, so this shows only the first five rows


Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [8]:
## Rebuild the small example frame from a list of row dictionaries.
data = [
    {"name": name, "age": 21, "dept": dept}
    for name, dept in zip(
        ["satwik", "modak", "diptarshi", "priyam"],
        ["IT", "ETCE", "ETCE", "bekar"],
    )
]

df = pd.DataFrame(data)
## Accessing a DataFrame
print(df["name"])  ## selecting a single column directly by name gives back a Series

0       satwik
1        modak
2    diptarshi
3       priyam
Name: name, dtype: object


In [9]:
print(df.loc[0, :])  ## accessing a single row by its index label (all columns)

name    satwik
age         21
dept        IT
Name: 0, dtype: object


In [10]:
## .loc takes row labels and column labels together to select a subset of the data
print(
    df.loc[
        [0, 1],           ## row labels to keep
        ["name", "age"],  ## column labels to keep
    ]
)

     name  age
0  satwik   21
1   modak   21


In [11]:
## .at is used to access one particular element by row label and column label
print(df.at[1, "name"])

modak


### Data manipulation

In [12]:
## Show the current contents of the frame before it is modified in the cells below.
print(df)

        name  age   dept
0     satwik   21     IT
1      modak   21   ETCE
2  diptarshi   21   ETCE
3     priyam   21  bekar


In [13]:
## Assigning a list to a new column name adds one more column to the frame.
## The list supplies one value per row; here it mixes numbers with a string.
df["marks"] = [1800, 500, 300, "jali"]
print(df)

        name  age   dept marks
0     satwik   21     IT  1800
1      modak   21   ETCE   500
2  diptarshi   21   ETCE   300
3     priyam   21  bekar  jali


In [14]:
## drop() returns a new DataFrame, and a notebook cell only renders its last
## expression, so the result of the first call below is not displayed.
df.drop(columns="marks")  ## same as drop("marks", axis=1): drops the column named marks
df.drop(index=3)  ## same as drop(3, axis=0): drops the row labelled 3

## axis=0 means the label refers to the row index; axis=1 means it refers to the columns

Unnamed: 0,name,age,dept,marks
0,satwik,21,IT,1800
1,modak,21,ETCE,500
2,diptarshi,21,ETCE,300


In [15]:
print(df) ## df is unchanged: the drop() calls above did not modify it
## drop() makes a copy, applies the change to the copy and returns that copy

        name  age   dept marks
0     satwik   21     IT  1800
1      modak   21   ETCE   500
2  diptarshi   21   ETCE   300
3     priyam   21  bekar  jali


In [16]:
## For the change to persist, rebind df to the frame that drop() returns.
## Reassignment is used instead of inplace=True: it has the same visible effect here
## but keeps the operation chainable. errors="ignore" lets the cell be re-run after
## "marks" is already gone instead of raising a KeyError.
df = df.drop(columns="marks", errors="ignore")
print(df)

        name  age   dept
0     satwik   21     IT
1      modak   21   ETCE
2  diptarshi   21   ETCE
3     priyam   21  bekar


In [17]:
## Update a single value: set "age" to 20 in the row labelled 0.
df.at[0, "age"] = 20

In [18]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numeric column(s).
df.describe()

Unnamed: 0,age
count,4.0
mean,20.75
std,0.5
min,20.0
25%,20.75
50%,21.0
75%,21.0
max,21.0
