# My toolbox for data exploration using Pandas

In [89]:
# Pandas is a common package for Python data exploration
import pandas as pd
import numpy as np


Load tabular data from a CSV file

I will use the Titanic dataset because of its popularity.


In [90]:
# Load the training and testing datasets into the train_df and test_df pandas dataframes.
# DATA_DIR is the only value to edit: point it at the folder that holds train.csv and test.csv.
# Its default keeps the paths this notebook was originally written with.
DATA_DIR = '/content/drive/My Drive'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

 Explore dataset properties




In [91]:
# print a summary of the train dataframe: the index dtype, each column with its dtype
# and non-null count, and the memory usage.
# the non-null counts reveal which columns have missing values, which will be important
# when deciding on a missing value imputation technique

train_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [92]:
# print the index dtype, columns, non-null counts and memory usage of the test dataframe
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [93]:
# this is another pandas method of determining missing values: count the nulls in each column.
# As we can see, Age, Cabin and Embarked have missing values in the train dataframe.
# There are many methods for dealing with missing values. I will have more on data imputation later.

train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [94]:
# count the missing values in each column of the test dataframe.
# here Age, Fare and Cabin are the columns with missing values.
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [95]:
# return the shape of the train and test dataframes as a pair of (rows, columns) tuples

train_df.shape, test_df.shape

((891, 12), (418, 11))

In [96]:
# return the number of elements (rows x columns) in the train dataframe

train_df.size

10692

In [97]:
# return the number of elements (rows x columns) in the test dataframe
test_df.size

4598

In [98]:
# return the column header for the train/test dataframes. 

train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [99]:
# Notice how the "Survived" column/feature is missing from the test dataframe, since
# this dataset is used to validate the trained model's accuracy in predicting the "Survived" value

test_df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [100]:
# return the datatypes of the train dataframe columns/features

train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [101]:
# return the datatypes of the test dataframe columns/features

test_df.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [102]:
# return descriptive statistics about the test dataframe, including those that summarize the central tendency,
# dispersion and shape of a dataset's distribution, excluding NaN (missing) values.
# by default only the numeric columns are summarized, so Name, Sex, Ticket, Cabin and Embarked do not appear.

test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


Return data values

In [103]:
# return top 4 rows of train dataframe

train_df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [104]:
# return bottom 4 rows of train dataframe

train_df.tail(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [105]:
# return 5 random rows of train dataframe.
# random_state fixes the seed so the same 5 rows are drawn every time the notebook is re-run;
# without it each execution returns a different sample. Change or drop it to draw other rows.

train_df.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
91,92,0,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,347466,7.8542,,S
580,581,1,2,"Christy, Miss. Julie Rachel",female,25.0,1,1,237789,30.0,,S
790,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
237,238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8.0,0,2,C.A. 31921,26.25,,S
806,807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S


In [106]:
# return values of selected columns of the train dataframe.
# passing a list of column names (note the double brackets) returns a dataframe
# holding just those columns, in the order they are listed

train_df[['Fare','Survived','Sex']]

Unnamed: 0,Fare,Survived,Sex
0,7.2500,0,male
1,71.2833,1,female
2,7.9250,1,female
3,53.1000,1,female
4,8.0500,0,male
...,...,...,...
886,13.0000,0,male
887,30.0000,1,female
888,23.4500,0,female
889,30.0000,1,male


In [107]:
# return a range of rows based on start index and (up to but not including) end index

train_df.iloc[100:105]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
100,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S
101,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
102,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
104,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.925,,S


In [108]:
# return row values based on row,column positions.
# rows 110, 220 and 315 are selected; column positions 3 and 1 are Name and Survived
# (positions are zero-based and the columns come back in the order listed)

train_df.iloc[[110,220,315],[3,1]]

Unnamed: 0,Name,Survived
110,"Porter, Mr. Walter Chamberlain",0
220,"Sunderland, Mr. Victor Francis",1
315,"Nilsson, Miss. Helmina Josefina",1


In [109]:
# return row values based on a range of row positions (the end position is not included):
# rows 110 through 114, with column positions 3 (Name) and 1 (Survived)

train_df.iloc[110:115,[3,1]]

Unnamed: 0,Name,Survived
110,"Porter, Mr. Walter Chamberlain",0
111,"Zabour, Miss. Hileni",0
112,"Barton, Mr. David John",0
113,"Jussila, Miss. Katriina",0
114,"Attalah, Miss. Malake",0


In [None]:
# iterate over rows and print selected columns.
# in example below, iterate over dataframe and print all rows but only columns: Name, Sex, Survived.
# The three columns are selected first and walked with itertuples(), which yields plain tuples
# instead of building a full Series for every row as iterrows() does; the printed text is the same.

for name, sex, survived in train_df[['Name', 'Sex', 'Survived']].itertuples(index=False, name=None):
    print(name, sex, survived)

-- Note: only a small sample of the printed rows is shown below.

Braund, Mr. Owen Harris male 0

Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 1

Heikkinen, Miss. Laina female 1

Futrelle, Mrs. Jacques Heath (Lily May Peel) female 1

Allen, Mr. William Henry male 0

Moran, Mr. James male 0

McCarthy, Mr. Timothy J male 0

Palsson, Master. Gosta Leonard male 0

Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 1

Nasser, Mrs. Nicholas (Adele Achem) female 1

Sandstrom, Miss. Marguerite Rut female 1

Bonnell, Miss. Elizabeth female 1

Saundercock, Mr. William Henry male 0

(truncated to fit)
