In [1]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m505.5/505.5 kB[0m [31m846.5 kB/s[0m eta [36m0:00:00[0m eta [36m0:00:01[0m[36m0:00:01[0mm
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m829.7 kB/s[0m e

In [2]:
import numpy as np
import pandas as pd

### Creating a DataFrame

In [4]:
# Build a DataFrame from a dict that maps each column label to its list of values.
d = {
    'Name': ['Mary', 'Tom', 'Ann'],
    'Age': [22, 19, 30],
}
df = pd.DataFrame(data=d)

# Print the table first, then the dtype pandas inferred for each column.
print(df)
print(df.dtypes)

   Name  Age
0  Mary   22
1   Tom   19
2   Ann   30
Name    object
Age      int64
dtype: object


In [5]:
# Build a DataFrame from a 2-D NumPy array. The array mixes str and int values,
# and an array stores a single dtype, so the ages end up as strings; the
# resulting 'Age' column is therefore not numeric (see the dtypes printed below).
a = np.array([
    ['Mary', 22],
    ['Tom', 19],
    ['Ann', 30],
])
df2 = pd.DataFrame(data=a, columns=['Name', 'Age'])

# Print the table first, then the per-column dtypes.
print(df2)
print(df2.dtypes)

   Name Age
0  Mary  22
1   Tom  19
2   Ann  30
Name    object
Age     object
dtype: object


In [6]:
# Cast the string-valued 'Age' column to integers. astype returns a new
# DataFrame, so df2 keeps its original dtypes.
df3 = df2.astype(dtype={'Age': 'int'})

# Confirm the conversion by showing the per-column dtypes.
df3.dtypes

Name    object
Age      int64
dtype: object

In [7]:
# Replace the default integer index (0, 1, 2) with custom row labels.
# This assigns to df3 itself rather than creating a new DataFrame.
df3.index = ['A', 'B', 'C']
df3

Unnamed: 0,Name,Age
A,Mary,22
B,Tom,19
C,Ann,30


#### Accessing a subset of a DataFrame

In [19]:
df # Display the whole DataFrame (a bare last expression is rendered as a table)

Unnamed: 0,Name,Age
0,Mary,22
1,Tom,19
2,Ann,30


In [21]:
df['Age'] # Select a single column by its label; the result is a Series

0    22
1    19
2    30
Name: Age, dtype: int64

In [9]:
df[['Name', 'Age']] # Select several columns with a list of labels; the result is a DataFrame

Unnamed: 0,Name,Age
0,Mary,22
1,Tom,19
2,Ann,30


In [10]:
df['Age'] > 20 # Element-wise comparison: returns a Boolean Series, True where Age > 20

0     True
1    False
2     True
Name: Age, dtype: bool

In [11]:
df[ df['Age'] > 20 ] # Use the Boolean Series as a mask: return the rows that have 'Age' > 20

Unnamed: 0,Name,Age
0,Mary,22
2,Ann,30


In [12]:
df.loc[0] # Return the row whose index label is 0 (as a Series)

Name    Mary
Age       22
Name: 0, dtype: object

In [23]:
df.loc[1] # Return the row whose index label is 1 (as a Series)

Name    Tom
Age      19
Name: 1, dtype: object

In [13]:
df.loc[0, 'Name'] # Return the single element at row label 0 and column label 'Name'

'Mary'

In [24]:
df.loc[ df['Age'] > 20 ] # .loc also accepts a Boolean mask: return the rows with Age > 20

Unnamed: 0,Name,Age
0,Mary,22
2,Ann,30


In [25]:
df.loc[ df['Age'] > 20, 'Name'] # Row mask plus column label: return the names of the rows with Age > 20

0    Mary
2     Ann
Name: Name, dtype: object

In [15]:
# Label-based slice: unlike ordinary Python slicing, .loc INCLUDES the end label,
# so this returns rows 0 through 2 (not 0 to 2-1) of column 'Name'.
df.loc[:2, 'Name']

0    Mary
1     Tom
2     Ann
Name: Name, dtype: object

In [16]:
df.iloc[1] # Return the row at integer position 1 (as a Series)

Name    Tom
Age      19
Name: 1, dtype: object

In [17]:
df.iloc[[1, 2]] # Return the rows at integer positions 1 and 2 (as a DataFrame)

Unnamed: 0,Name,Age
1,Tom,19
2,Ann,30


In [18]:
df.iloc[0, 1] # Return the element at row position 0 and column position 1

np.int64(22)

In [26]:
# Positional slices exclude the end position: this returns the rows at
# positions 0 and 1 and the column at position 0, as a DataFrame.
df.iloc[0:2, 0:1]

Unnamed: 0,Name
0,Mary
1,Tom


In [27]:
# Overwrite single cells: .iloc addresses them by position, .loc by label.
df.iloc[1, 1] = 20
df.loc[0, 'Age'] = 23

# Assigning to a row label that does not exist yet appends a new row. With the
# default 0..n-1 index, the current row count is the next unused label.
# NOTE: this cell is not idempotent -- every re-run appends another 'Bob' row.
df.loc[len(df)] = ['Bob', 17]

df

Unnamed: 0,Name,Age
0,Mary,23
1,Tom,20
2,Ann,30
3,Bob,17


### File IO

In [30]:
# Read a DataFrame from titanic.csv. The path is relative, so the file must be
# in the notebook's current working directory.
titanic_df = pd.read_csv("titanic.csv")
titanic_df # Show the DataFrame (long frames are rendered truncated, with '...' in the middle)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Descriptive Statistics

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [32]:
titanic_df['Age'].max() # Return the maximum of the Age column

np.float64(80.0)

In [33]:
titanic_df[ ['Age','Fare'] ].min() # Return the min of each of the Age and Fare columns

Age     0.42
Fare    0.00
dtype: float64

In [34]:
titanic_df['Age'].mean() # Return the mean of the Age column (missing values are skipped by default)

np.float64(29.69911764705882)

In [35]:
titanic_df['Age'].median() # Return the median of the Age column (missing values are skipped by default)

np.float64(28.0)

#### Handling Missing Values

In [36]:
titanic_df['Age'].isna() # Boolean Series: True where Age is missing (NaN), False otherwise

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [37]:
# Return a copy of the Age column with missing values replaced by 0.
# The result is not assigned, so titanic_df itself is left unchanged.
titanic_df['Age'].fillna( 0 )

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     0.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [38]:
# Return the Age column without its missing entries; the original index labels
# are kept, and titanic_df itself is left unchanged because the result is not assigned.
titanic_df['Age'].dropna()

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
885    39.0
886    27.0
887    19.0
889    26.0
890    32.0
Name: Age, Length: 714, dtype: float64