# Pandas Basics

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's legacy global generator so the random draw below is reproducible.
np.random.seed(101)
# 4x3 array of integers in [0, 101): the upper bound is exclusive, so values are 0..100.
mydata = np.random.randint(low=0, high=101, size=(4, 3))

In [3]:
mydata

array([[95, 11, 81],
       [70, 63, 87],
       [75,  9, 77],
       [40,  4, 63]])

In [4]:
# Row labels: one per row of the 4x3 `mydata` array.
myindex = ['CA', 'NY', 'AZ', 'TX']

In [5]:
# Column labels: one per column of the 4x3 `mydata` array.
mycolumns = ['Jan', 'Feb', 'Mar']

In [6]:
# Only the data is supplied, so pandas falls back to default integer
# labels for both the rows and the columns.
df = pd.DataFrame(data=mydata)

In [7]:
df

Unnamed: 0,0,1,2
0,95,11,81
1,70,63,87
2,75,9,77
3,40,4,63


In [8]:
# Same data, now with explicit row labels; the columns keep their
# default integer labels.
df = pd.DataFrame(data=mydata, index=myindex)

In [9]:
df

Unnamed: 0,0,1,2
CA,95,11,81
NY,70,63,87
AZ,75,9,77
TX,40,4,63


In [10]:
# Fully labelled frame: explicit row labels and column labels.
df = pd.DataFrame(data=mydata, index=myindex, columns=mycolumns)

In [11]:
df

Unnamed: 0,Jan,Feb,Mar
CA,95,11,81
NY,70,63,87
AZ,75,9,77
TX,40,4,63


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to TX
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Jan     4 non-null      int32
 1   Feb     4 non-null      int32
 2   Mar     4 non-null      int32
dtypes: int32(3)
memory usage: 80.0+ bytes


In [13]:
# Load the tips dataset from 'tips.csv' (path is relative to the working directory).
# NOTE: this rebinds `df`, replacing the small demo frame built from `mydata` above.
df = pd.read_csv('tips.csv')

In [14]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [15]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [16]:
df.index

RangeIndex(start=0, stop=244, step=1)

In [17]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [19]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [20]:
# Transpose the summary so each numeric column becomes a row and each
# statistic (count, mean, std, ...) becomes a column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


## Columns

In [21]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [22]:
df['total_bill']

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [24]:
# Selecting a single column by its label gives back a pandas Series.
type(df['total_bill'])

pandas.core.series.Series

In [25]:
# Indexing with a list of column names (rather than a single name) selects
# several columns at once and returns them as a DataFrame.
mycols = ['total_bill','tip']
df[mycols]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [27]:
# Creating new columns

# Tip expressed as a percentage of the bill. Assigning to a key that does not
# exist yet adds it to `df` as a new column.
df['tip_percentage'] = df['tip'].mul(100).div(df['total_bill'])

In [28]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.50,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.392697
240,27.18,2.00,Female,Yes,Sat,Dinner,2,7.358352
241,22.67,2.00,Male,Yes,Sat,Dinner,2,8.822232
242,17.82,1.75,Male,No,Sat,Dinner,2,9.820426


In [31]:
# Bill split evenly across the party, rounded to 2 decimal places.
# Bracket access is required for 'size': `df.size` is the DataFrame attribute
# holding the total element count, not this column.
df['price_per_person'] = (df['total_bill'] / df['size']).round(2)

In [32]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.45
2,21.01,3.50,Male,No,Sun,Dinner,3,16.658734,7.00
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765,6.15
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.392697,9.68
240,27.18,2.00,Female,Yes,Sat,Dinner,2,7.358352,13.59
241,22.67,2.00,Male,Yes,Sat,Dinner,2,8.822232,11.34
242,17.82,1.75,Male,No,Sat,Dinner,2,9.820426,8.91


In [34]:
#removing columns/rows

#df.drop('column/row name', axis=0/1 (0 = row, 1 = column), inplace=False/True (True removes it from the original data))
#df = df.drop('column/row name', axis=0/1) [another way of updating the actual dataset]

## Index

In [33]:
df.index

RangeIndex(start=0, stop=244, step=1)

In [37]:
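#to set a column as the index

# df.set_index("column_name") [returns a new DataFrame; df itself is unchanged]

# df = df.set_index("column_name") [permanently changes the index; run only once, because the column is moved into the index and a second run raises KeyError]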

In [38]:
# to reset index

#df.reset_index()

## Grabbing rows

In [40]:
# iloc selects by integer position: position 0 is the first row, which comes
# back as a Series whose labels are the column names.
df.iloc[0]

total_bill             16.99
tip                     1.01
sex                   Female
smoker                    No
day                      Sun
time                  Dinner
size                       2
tip_percentage      5.944673
price_per_person        8.49
Name: 0, dtype: object

In [42]:
#based on index

#df.loc['labelled index']

In [44]:
# Grabbing multiple rows

# Positional slices exclude the stop value, so this returns the rows at
# positions 0 through 3.
df.iloc[:4]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.45
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734,7.0
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041,11.84


In [45]:
# df.loc[['label_name','label_name']]


In [46]:
# df.drop('row_name', axis=0) [not a permanent change]

# df = df.drop('row_name', axis=0) [permanent change]

In [48]:
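#inserting a new row

# one_row = df.iloc[[0]]   [double brackets keep the row as a one-row DataFrame]
# one_row
# df = pd.concat([df, one_row])   [DataFrame.append was removed in pandas 2.0; use pd.concat instead]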