# Pandas 

In [10]:
import pandas as pd
import numpy as np 

In [32]:
print(pd.__version__) # casual version check

2.1.4


In [10]:
np.arange(0,20).reshape(5,4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

##### with array

In [11]:
## Creation of Dataframes
# A 5x4 grid holding the integers 0..19, with explicit row and column labels.

row_labels = [f"Row{i}" for i in range(1, 6)]
col_labels = [f"col{j}" for j in range(1, 5)]
df = pd.DataFrame(np.arange(20).reshape(5, 4), index=row_labels, columns=col_labels)

In [4]:
df

Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [28]:
df.head() #First 5 Records By default


Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [30]:
df.tail() #Last 5 Records by default

Unnamed: 0,col1,col2,Col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [7]:
# Another way to build a DataFrame: a dict mapping each column name to its values.
mydataset = dict(
    Cars=["BMW", "Volvo", "Ford"],
    Passings=[3, 7, 2],
    City=['Banglore', 'Chandigarg', 'Mohali'],
)
myvar = pd.DataFrame(data=mydataset)

myvar

Unnamed: 0,Cars,Passings,City
0,BMW,3,Banglore
1,Volvo,7,Chandigarg
2,Ford,2,Mohali


In [30]:
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


Creating a DataFrame from a list of dictionaries

In [9]:
# Each dict describes one row; the dict keys become the column names.
data = [
    dict(Name='Suryansh Rana', Age=20, City='Mohali'),
    dict(Name='Rajneesh Rana', Age=20, City='Banglore'),
    dict(Name='Mahak Rana', Age=20, City='Chandigarh'),
]
pd.DataFrame(data=data)

Unnamed: 0,Name,Age,City
0,Suryansh Rana,20,Mohali
1,Rajneesh Rana,20,Banglore
2,Mahak Rana,20,Chandigarh


In [63]:
# Column-oriented construction: one list per column, combined under its column name.
names = [
    "Braund, Mr. Owen Harris",
    "Allen, Mr. William Henry",
    "Bonnell, Miss. Elizabeth",
]
ages = [32, 35, 58]
sexes = ["male", "male", "female"]
df = pd.DataFrame({"Name": names, "Age": ages, "Sex": sexes})

df

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",32,male
1,"Allen, Mr. William Henry",35,male
2,"Bonnell, Miss. Elizabeth",58,female


In [66]:
df.iat[1,2] # .iat = scalar access by integer position: row 1, column 2 (the 'Sex' column)

'male'

In [15]:
# Small labelled frame: one row per snake, one column per attribute.
snake_stats = [[1, 2], [4, 5], [7, 8]]
df = pd.DataFrame(
    data=snake_stats,
    columns=['max_speed', 'shield'],
    index=['cobra', 'viper', 'sidewinder'],
)

In [16]:
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [46]:
df.at['cobra','shield'] # .at = scalar access by row label and column label

2

In [48]:
df.at['viper','shield']

5

In [25]:
df.loc['viper'] # a single row label returns that row as a Series (column names become its index)

max_speed    4
shield       5
Name: viper, dtype: int64

In [28]:
df.loc['cobra', 'max_speed']


1

# SERIES

### pandas.Series
### `class pandas.Series(data=None, index=None, dtype=None, name=None, copy=None, fastpath=_NoDefault.no_default)`

In [151]:
# A dict becomes a Series with one entry per key; each list value is stored whole as a single object element.
pd.Series(mydataset)

cars        [BMW, Volvo, Ford]
passings             [3, 7, 2]
dtype: object

#### Series from a dictionary (the keys become the index)

In [3]:
# Dict keys become the Series index; dict values become the data.
data = dict(a=1, b=2, c=3)
series_dict = pd.Series(data=data)
series_dict

a    1
b    2
c    3
dtype: int64

#### Assign Index with data

In [7]:
# pandas is already imported as `pd` in the notebook's first cell, so it is not re-imported here.
data = [10, 20, 30, 40, 50, 60, 70]
index = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
# Values and labels are paired by position, so both lists must have the same length.
ser = pd.Series(data, index=index)
ser

a    10
b    20
c    30
d    40
e    50
f    60
g    70
dtype: int64

In [6]:
ser.head()

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [36]:
type(df)

pandas.core.frame.DataFrame

### Constructing Series from a dictionary with an Index specified

In [13]:
# The requested index labels equal the dict keys, so every value is kept unchanged.
d = dict(a=1, b=2, c=3)
ser = pd.Series(d, index=list('abc'))
ser


a    1
b    2
c    3
dtype: int64

#### The keys of the dictionary match with the Index values, hence the Index values have no effect.

In [11]:
# None of the requested labels exist among the dict keys, so every entry comes back as NaN.
d = dict(a=1, b=2, c=3)
ser = pd.Series(d, index=list('xyz'))
ser


x   NaN
y   NaN
z   NaN
dtype: float64

## Note
#### The Index is first built from the keys of the dictionary. After this, the Series is reindexed with the given Index values, hence we get all NaN as a result.

### Copy in Series
### Copy = False
### with list

In [21]:
# Source data is a plain Python list; copy=False is requested explicitly.
r = [1, 2]
ser = pd.Series(data=r, index=['A', 'B'], copy=False)
ser.iloc[0] = 999  # overwrite the first element of the Series by position
ser



A    999
B      2
dtype: int64

In [22]:
r

[1, 2]

Due to input data type the Series has a copy of the original data even though copy=False, so the data is unchanged.

#### with array

In [33]:
# Source data is a NumPy array this time; copy=False is requested explicitly.
r = np.array([1, 2])
ser = pd.Series(data=r, index=['A', 'B'], copy=False)
ser.iloc[0] = 999  # overwrite the first element of the Series by position
ser


A    999
B      2
dtype: int32

In [34]:
r

array([999,   2])

## Due to input data type the Series has a view on the original data, so the data is changed as well.

### .index

The `.index` attribute of a Series and of a DataFrame

In [5]:
# pandas is already imported as `pd` in the notebook's first cell, so it is not re-imported here.
cities = ['Kolkata', 'Chicago', 'Toronto', 'Lisbon']
populations = [14.85, 2.71, 2.93, 0.51]
# The same labels are used as the index of a Series and of a one-column DataFrame.
city_series = pd.Series(populations, index=cities)
df = pd.DataFrame(populations, index=cities)
city_series.index


Index(['Kolkata', 'Chicago', 'Toronto', 'Lisbon'], dtype='object')

In [4]:
city_series

Kolkata    14.85
Chicago     2.71
Toronto     2.93
Lisbon      0.51
dtype: float64

In [7]:
df.index

Index(['Kolkata', 'Chicago', 'Toronto', 'Lisbon'], dtype='object')

### .array extension

In [8]:
pd.Series([1, 2, 3]).array


<NumpyExtensionArray>
[1, 2, 3]
Length: 3, dtype: int64

In [10]:
# A Series of three strings; no index is given, so the default 0..2 integer index is used.
animal_names = ['Ant', 'Bear', 'Cow']
s = pd.Series(data=animal_names)
s

0     Ant
1    Bear
2     Cow
dtype: object

### .nbytes (Size)

In [11]:
s.nbytes

24

In [13]:
s.size ## elements in series (size = length)

3

In [12]:
s.ndim

1

In [14]:
s.T

0     Ant
1    Bear
2     Cow
dtype: object

In [38]:
# `df` has been reassigned several times above (most recently to the city-population
# frame), so rebuild the 5x4 demo frame here. Without this, this cell and the indexing
# cells that follow (which select 'col1', 'Row1', ...) fail under Restart Kernel -> Run All.
df = pd.DataFrame(
    data=np.arange(0, 20).reshape(5, 4),
    index=["Row1", "Row2", "Row3", "Row4", "Row5"],
    columns=["col1", "col2", "col3", "col4"],
)
# Summary of the index, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Row1 to Row5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col1    5 non-null      int32
 1   col2    5 non-null      int32
 2   Col3    5 non-null      int32
 3   col4    5 non-null      int32
dtypes: int32(4)
memory usage: 120.0+ bytes


In [40]:
df.describe()  #only int and float type will be selected

Unnamed: 0,col1,col2,Col3,col4
count,5.0,5.0,5.0,5.0
mean,8.0,9.0,10.0,11.0
std,6.324555,6.324555,6.324555,6.324555
min,0.0,1.0,2.0,3.0
25%,4.0,5.0,6.0,7.0
50%,8.0,9.0,10.0,11.0
75%,12.0,13.0,14.0,15.0
max,16.0,17.0,18.0,19.0


# Indexing

In [48]:
df.head()

Unnamed: 0,col1,col2,Col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


# by using Columns Names

In [160]:
df[['col1','col2']] ## a list of column names selects those columns and returns a DataFrame

Unnamed: 0,col1,col2
Row1,0,1
Row2,4,5
Row3,8,9
Row4,12,13
Row5,16,17


In [67]:
df['col1'] # single brackets select one column as a 1-D Series, so it is not displayed as a table

Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Name: col1, dtype: int32

In [69]:
type(df['col1'])# when it is in [ 1D ] it is Series

pandas.core.series.Series

In [13]:
df[['col1']] # this data is in table format 

Unnamed: 0,col1
Row1,0
Row2,4
Row3,8
Row4,12
Row5,16


In [71]:
type(df[['col1']]) # when it is in [[ 2D ]] it is DataFrame

pandas.core.frame.DataFrame

# By using RowIndex[loc]

In [75]:
df.loc['Row1'] # series

col1    0
col2    1
col3    2
col4    3
Name: Row1, dtype: int32

In [77]:
df.loc[['Row1']] # Dataframe; you can check its data type

Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3


In [79]:
type(df.loc[['Row1']]) # see Dataframe

pandas.core.frame.DataFrame

In [85]:
df.loc[['Row1','Row2']] # pass a list of row labels (double brackets) to select several rows;
# the result is a DataFrame rather than a Series

Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3
Row2,4,5,6,7


# RowIndex and ColumnIndex[iloc]

In [88]:
df.head()

Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [94]:
# df.iloc[row_start:row_stop, col_start:col_stop] -- integer positions, stop is excluded
# here: rows at positions 2-3 (Row3, Row4) and columns at positions 0-2 (col1..col3)

df.iloc[2:4,0:3]

Unnamed: 0,col1,col2,col3
Row3,8,9,10
Row4,12,13,14


In [14]:
df.iloc[2:3,2:]

Unnamed: 0,col3,col4
Row3,10,11


In [98]:
df.iloc[3:,2:]

Unnamed: 0,col3,col4
Row4,14,15
Row5,18,19


In [104]:
df.iloc[1:,[0,3]]

Unnamed: 0,col1,col4
Row2,4,7
Row3,8,11
Row4,12,15
Row5,16,19


In [112]:
# Convert the DataFrame into a NumPy array.
# to_numpy() is the accessor pandas recommends over `.values`, and the full
# `.iloc[:, :]` slice is unnecessary because it selects the whole frame anyway.
df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

# Basic Operations

In [117]:
df.isnull()

Unnamed: 0,col1,col2,col3,col4
Row1,False,False,False,False
Row2,False,False,False,False
Row3,False,False,False,False
Row4,False,False,False,False
Row5,False,False,False,False


In [119]:
df.isnull().sum() # number of missing values per column (each True counts as 1 when summed)

col1    0
col2    0
col3    0
col4    0
dtype: int64

In [125]:
## Creation of Dataframes
# Two rows, three columns, with one missing value (np.nan) placed in col2 of Row1.

df = pd.DataFrame(
    [[1, np.nan, 2], [1, 3, 4]],
    columns=["col1", "col2", "col3"],
    index=["Row1", "Row2"],
)

In [127]:
df.head()

Unnamed: 0,col1,col2,col3
Row1,1,,2
Row2,1,3.0,4


In [129]:
df.isnull()

Unnamed: 0,col1,col2,col3
Row1,False,True,False
Row2,False,False,False


In [133]:
df.isnull().sum()==0

col1     True
col2    False
col3     True
dtype: bool

In [141]:
df['col3'].value_counts() # how many times each distinct value occurs in col3

col3
2    1
4    1
Name: count, dtype: int64

In [143]:
df['col3'].unique() # the distinct values themselves, as an array (use .nunique() to count them)

array([2, 4], dtype=int64)

In [145]:
df>2

Unnamed: 0,col1,col2,col3
Row1,False,False,False
Row2,False,True,True


In [147]:
df['col2']>2 # boolean mask; filter rows with df[df['col2']>2] and combine masks with & or | (not `and`/`or`)

Row1    False
Row2     True
Name: col2, dtype: bool