In [9]:
# load libraries numpy, pandas, matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# Location of the Star Wars characters CSV used throughout this notebook
url = "https://songyao21.github.io/course_data/starwars.csv"

# Read the remote CSV into a DataFrame and preview its first five rows
starwars = pd.read_csv(filepath_or_buffer=url)
starwars.head(n=5)

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships
0,Luke Skywalker,172.0,77.0,blond,fair,blue,19.0,male,masculine,Tatooine,Human,"A New Hope, The Empire Strikes Back, Return of...","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle"
1,C-3PO,167.0,75.0,,gold,yellow,112.0,none,masculine,Tatooine,Droid,"A New Hope, The Empire Strikes Back, Return of...",,
2,R2-D2,96.0,32.0,,"white, blue",red,33.0,none,masculine,Naboo,Droid,"A New Hope, The Empire Strikes Back, Return of...",,
3,Darth Vader,202.0,136.0,none,white,yellow,41.9,male,masculine,Tatooine,Human,"A New Hope, The Empire Strikes Back, Return of...",,TIE Advanced x1
4,Leia Organa,150.0,49.0,brown,light,brown,19.0,female,feminine,Alderaan,Human,"A New Hope, The Empire Strikes Back, Return of...",Imperial Speeder Bike,


In [11]:
# Inspect the dataset: structure, summary statistics, and missing-value counts

# Structure of the dataframe (columns, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own; wrapping it in print() would emit a stray "None" after the report.
print("\nDataframe Info:")
starwars.info()

# Summary statistics (count, mean, std, ...) of the numeric columns
print("\nSummary Statistics:")
print(starwars.describe())

# Number of missing values in each column
print("\nMissing Values:")
print(starwars.isnull().sum())


Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        87 non-null     object 
 1   height      81 non-null     float64
 2   mass        59 non-null     float64
 3   hair_color  82 non-null     object 
 4   skin_color  87 non-null     object 
 5   eye_color   87 non-null     object 
 6   birth_year  43 non-null     float64
 7   sex         83 non-null     object 
 8   gender      83 non-null     object 
 9   homeworld   77 non-null     object 
 10  species     83 non-null     object 
 11  films       87 non-null     object 
 12  vehicles    11 non-null     object 
 13  starships   20 non-null     object 
dtypes: float64(3), object(11)
memory usage: 9.6+ KB
None

Summary Statistics:
           height         mass  birth_year
count   81.000000    59.000000   43.000000
mean   174.604938    97.311864   87.565116
std     34.77415

In [12]:
# Overall mean height and mass across all characters, shown to two decimals
height = starwars.loc[:, 'height'].mean()
mass = starwars.loc[:, 'mass'].mean()
print("Average height of characters is: ", np.round(height, 2))
print("Average mass of characters is: ", np.round(mass, 2))

Average height of characters is:  174.6
Average mass of characters is:  97.31


In [5]:
# Mean height and mass for each species, rounded to two decimals for display
species_height_mass = starwars.groupby('species')[['height', 'mass']].mean()
print(species_height_mass.round(2))

                height     mass
species                        
Aleena           79.00    15.00
Besalisk        198.00   102.00
Cerean          198.00    82.00
Chagrian        196.00      NaN
Clawdite        168.00    55.00
Droid           131.20    69.75
Dug             112.00    40.00
Ewok             88.00    20.00
Geonosian       183.00    80.00
Gungan          208.67    74.00
Human           178.00    81.31
Hutt            175.00  1358.00
Iktotchi        188.00      NaN
Kaleesh         216.00   159.00
Kaminoan        221.00    88.00
Kel Dor         188.00    80.00
Mirialan        168.00    53.10
Mon Calamari    180.00    83.00
Muun            191.00      NaN
Nautolan        196.00    87.00
Neimodian       191.00    90.00
Pau'an          206.00    80.00
Quermian        264.00      NaN
Rodian          173.00    74.00
Skakoan         193.00    48.00
Sullustan       160.00    68.00
Tholothian      184.00    50.00
Togruta         178.00    57.00
Toong           163.00    65.00
Toydaria

In [6]:
# Build an independent working frame holding only the columns needed for the
# BMI analysis; .copy() keeps later edits from touching the raw `starwars` frame
starwars_bmi_data = starwars.loc[:, ['name', 'height', 'mass', 'gender', 'species']].copy()
starwars_bmi_data.head()

Unnamed: 0,name,height,mass,gender,species
0,Luke Skywalker,172.0,77.0,masculine,Human
1,C-3PO,167.0,75.0,masculine,Droid
2,R2-D2,96.0,32.0,masculine,Droid
3,Darth Vader,202.0,136.0,masculine,Human
4,Leia Organa,150.0,49.0,feminine,Human


In [7]:
# Replace height in cm with height in meters (height / 100) and add a BMI
# column, where BMI = mass / (height in meters)^2.
#
# Height is re-derived from the raw `starwars` frame on every run instead of
# dividing the current `starwars_bmi_data['height']` by 100 in place. That keeps
# this cell idempotent: re-running it no longer shrinks height a second time
# (which would inflate BMI by a factor of 10,000).
# The assignment aligns on the row index, which `starwars_bmi_data` shares with
# `starwars` because it was created as a column subset of that frame.
starwars_bmi_data = starwars_bmi_data.assign(
    height=starwars['height'] / 100,
    # Evaluated after `height` above, so df['height'] is already in meters here
    BMI=lambda df: df['mass'] / df['height'] ** 2,
)
starwars_bmi_data.head()

Unnamed: 0,name,height,mass,gender,species,BMI
0,Luke Skywalker,1.72,77.0,masculine,Human,26.027582
1,C-3PO,1.67,75.0,masculine,Droid,26.892323
2,R2-D2,0.96,32.0,masculine,Droid,34.722222
3,Darth Vader,2.02,136.0,masculine,Human,33.330066
4,Leia Organa,1.5,49.0,feminine,Human,21.777778


In [8]:
# Keep only characters whose species is Human, discard rows that have a
# missing value in any column, and order the rest from highest to lowest BMI
human_bmi_data = (
    starwars_bmi_data
    .loc[starwars_bmi_data['species'].eq('Human')]
    .dropna()
    .sort_values(by='BMI', ascending=False)
)
human_bmi_data.head()

Unnamed: 0,name,height,mass,gender,species,BMI
5,Owen Lars,1.78,120.0,masculine,Human,37.874006
3,Darth Vader,2.02,136.0,masculine,Human,33.330066
6,Beru Whitesun Lars,1.65,75.0,feminine,Human,27.548209
16,Wedge Antilles,1.7,77.0,masculine,Human,26.643599
0,Luke Skywalker,1.72,77.0,masculine,Human,26.027582
