# Inspecting Data

Astronaut data from Kaggle: https://www.kaggle.com/nasa/astronaut-yearbook

In [1]:
import pandas as pd
df = pd.read_csv("astronauts.csv")

In [2]:
# First two rows of the dataframe
df.head(2)

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
0,Joseph M. Acaba,2004.0,19.0,Active,5/17/1967,"Inglewood, CA",Male,University of California-Santa Barbara; Univer...,Geology,Geology,,,2,3307,2,13.0,"STS-119 (Discovery), ISS-31/32 (Soyuz)",,
1,Loren W. Acton,,,Retired,3/7/1936,"Lewiston, MT",Male,Montana State University; University of Colorado,Engineering Physics,Solar Physics,,,1,190,0,0.0,STS 51-F (Challenger),,


In [3]:
# Last row of the dataframe
df.tail(1)

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
356,George D. Zamka,1998.0,17.0,Retired,6/29/1962,"Jersey City, NJ",Male,US Naval Academy; Florida Institute of Technology,Mathematics,Engineering Management,Colonel,US Marine Corps (Retired),2,692,0,0.0,"STS-120 (Discovery), STS-130 (Endeavor)",,


In [4]:
# Three random but different rows in the dataframe
# (set replace=True to allow them to potentially double up)
df.sample(3)

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
163,Charles O. Hobaugh,1996.0,16.0,Retired,11/5/1961,"Bar Harbor, ME",Male,US Naval Academy,Aerospace Engineering,,Colonel,US Marine Corps (Retired),3,873,0,0.0,"STS-104 (Atlantis), STS-118 (Endeavor), ST-129...",,
10,Michael P. Anderson,1995.0,15.0,Deceased,12/25/1959,"Plattsburgh, NY",Male,University of Washington; Creighton University,Physics & Astronomy,Physics,Lieutenant Colonel,US Air Force,2,594,0,0.0,"STS-89 (Endeavor), STS-107 (Columbia)",2/1/2003,STS-107 (Columbia)
83,Robert L. Crippen,1969.0,7.0,Retired,9/11/1937,"Beaumont, TX",Male,University of Texas,Aerospace Engineering,,Captain,US Navy (Retired),4,565,0,0.0,"STS-1 (Columbia), STS-7 (Challenger), STS 41-C...",,


In [5]:
# The type and number of non-null values for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Name                 357 non-null    object 
 1   Year                 330 non-null    float64
 2   Group                330 non-null    float64
 3   Status               357 non-null    object 
 4   Birth Date           357 non-null    object 
 5   Birth Place          357 non-null    object 
 6   Gender               357 non-null    object 
 7   Alma Mater           356 non-null    object 
 8   Undergraduate Major  335 non-null    object 
 9   Graduate Major       298 non-null    object 
 10  Military Rank        207 non-null    object 
 11  Military Branch      211 non-null    object 
 12  Space Flights        357 non-null    int64  
 13  Space Flight (hr)    357 non-null    int64  
 14  Space Walks          357 non-null    int64  
 15  Space Walks (hr)     357 non-null    flo

In [6]:
# Basic stats on all numeric columns
df.describe()

Unnamed: 0,Year,Group,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr)
count,330.0,330.0,357.0,357.0,357.0,357.0
mean,1985.106061,11.409091,2.364146,1249.266106,1.246499,7.707283
std,13.216147,5.149962,1.4287,1896.759857,2.056989,13.367973
min,1959.0,1.0,0.0,0.0,0.0,0.0
25%,1978.0,8.0,1.0,289.0,0.0,0.0
50%,1987.0,12.0,2.0,590.0,0.0,0.0
75%,1996.0,16.0,3.0,1045.0,2.0,12.0
max,2009.0,20.0,7.0,12818.0,10.0,67.0


In [7]:
# Shape of the dataframe (nrows, ncols)
df.shape

(357, 19)

In [8]:
# Correlation between all numeric columns
df.corr()

Unnamed: 0,Year,Group,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr)
Year,1.0,0.980934,0.03642,0.331386,0.210073,0.253502
Group,0.980934,1.0,-0.011386,0.325683,0.217891,0.261384
Space Flights,0.03642,-0.011386,1.0,0.325233,0.257073,0.258642
Space Flight (hr),0.331386,0.325683,0.325233,1.0,0.472796,0.454408
Space Walks,0.210073,0.217891,0.257073,0.472796,1.0,0.985755
Space Walks (hr),0.253502,0.261384,0.258642,0.454408,0.985755,1.0


In [9]:
# The number of each occurance for a series
df["Year"].value_counts()

1978.0    35
1996.0    35
1998.0    25
1990.0    23
1966.0    19
1995.0    19
1980.0    19
1992.0    19
1984.0    18
2000.0    17
1987.0    15
1963.0    14
1985.0    13
2004.0    11
1967.0    11
2009.0     9
1962.0     8
1969.0     7
1959.0     7
1965.0     6
Name: Year, dtype: int64

In [10]:
# And a whole host of math functions can be invoked on the dataframe as whole, like so
df.max()

  df.max()


Name                 Yvonne D. Cagle
Year                          2009.0
Group                           20.0
Status                       Retired
Birth Date                  9/9/1952
Birth Place              Yonkers, NY
Gender                          Male
Space Flights                      7
Space Flight (hr)              12818
Space Walks                       10
Space Walks (hr)                67.0
dtype: object

### Recap
* head
* tail
* sample
* info
* describe