In [None]:
import pandas as pd

grades = pd.Series([87, 100, 94])

# You can also do descriptive statistics on a series
grades.count()
grades.mean()
grades.min()
grades.max()
grades.std()

# You can also get to know your data by using the describe method
grades.describe()
# count      3.000000
# mean      93.666667
# std        6.506407
# min       87.000000
# 25%       90.500000 Median of the first half of the data set. 87 + 94 / 2 = 90.5
# 50%       94.000000 Median of the whole data set. 87 + 94 + 100 / 3 = 93.666
# 75%       97.000000 Median of the second half of the data set. 94 + 100 / 2 = 97
# max      100.000000

# Creating a series from a dictionary
grades = pd.Series({'Wally': 87, 'Eva': 100, 'Sam': 94})

#  Creating a series with custom indexes
grades = pd.Series([87, 100, 94], index=['Wally', 'Eva', 'Sam'])

# Get the type of the series
grades.dtype # dtype('int64')

# Get the number of elements in the series
grades.size # 3

hardware = pd.Series(['Hammer', 'Saw', 'Wrench'])

hardware.str.contains('a') # True if the string contains the letter 'a'
# 0     True
# 1     True
# 2    False
# dtype: bool

Self-check pandas series

In [None]:
# Use the Numpy's random number generation to create an array of integer that represent summertime temperatures in the range of 60 to 100
# then perform the following tasks:
# 1. Create an array into the Series names temperatures and display it.
# 2. Determine the lowest,highest, and average temperatures.
# 3. Produce descriptive statistics from the temperatures Series.

import numpy as np

temperatures = pd.Series(np.random.randint(60, 101, 10))
print(temperatures)
# 0    78
# 1    73
# 2    94
# 3    69
# 4    99
# 5    72
# 6    81
# 7    80
# 8    94
# 9    65
# dtype: int64
temperatures.min() # 65
temperatures.max() # 99
temperatures.mean() # 80.5
temperatures.describe()
# count    10.000000
# mean     80.500000
# std      11.616559
# min      65.000000
# 25%      72.250000
# 50%      79.000000
# 75%      90.750000
# max      99.000000
# dtype: float64

Pandas DataFrames

In [20]:
grades_dict = {'Wally': [87, 96, 70], 'Eva': [100, 87, 90], 'Sam': [94, 77, 90], 'Katie': [100, 81, 82], 'Bob': [83, 65, 85]}
grades = pd.DataFrame(grades_dict)
#       Wally	  Eva	  Sam	  Katie	  Bob
# 0	    87	    100	  94	  100	    83
# 1	    96	    87	  77    81	    65
# 2	    70	    90	  90	  82	    85

# Assign index names to the rows
grades.index = ['Test1', 'Test2', 'Test3']
#           Wally	  Eva	  Sam	  Katie	  Bob
# Test1	    87	    100	  94	  100	    83
# Test2     96	    87	  77    81	    65
# Test3	    70	    90	  90	  82	    85

# Get a row by using the loc attribute
grades.loc['Test1']
# Wally    87
# Eva      100
# Sam      94
# Katie    100
# Bob      83
# Name: Test1, dtype: int64

# Get a row by using the iloc attribute
grades.iloc[0]
# Wally    87
# Eva      100
# Sam      94
# Katie    100
# Bob      83
# Name: Test1, dtype: int64


# Slicing a DataFrame
grades.iloc[0:2]
#           Wally	  Eva	  Sam	  Katie	  Bob
# Test1	    87	    100	  94	  100	    83
# Test2     96	    87	  77    81	    65

# Select column one and three
grades.iloc[:, [0, 2]]
#           Wally	  Sam
# Test1	    87	    94
# Test2     96	    77
# Test3	    70	    90

# Select first two rows and first two columns
grades.iloc[0:2, 0:2]
#           Wally	  Eva
# Test1	    87	    100
# Test2     96	    87

# Boolean indexing
grades[grades >= 90]
#           Wally	  Eva	  Sam	  Katie	  Bob
# Test1	    NaN	    100.0	94.0	100.0	  NaN
# Test2	    96.0	   NaN	NaN	  NaN	    NaN
# Test3	    NaN	    90.0	90.0	NaN	    NaN

# Get value at a specific row and column
grades.iloc[0, 1] # 100

# Get value at a specific row and column using the at attribute
grades.at['Test1', 'Eva'] # 100

# Get value at a specific row and column using the iat attribute
grades.iat[0, 1] # 100

# Change the value at a specific row and column
grades.iat[0, 1] = 99


# Getting descriptive statistics
grades.describe()
#             Wally	     Eva	  Sam	      Katie	    Bob
# count	  3.000000	3.000000	3.000000	3.000000	3.000000
# mean	  84.333333	92.333333	87.000000	87.666667	77.666667
# std	    12.247449	5.773503	8.774964	9.273618	11.224972
# min	    70.000000	87.000000	70.000000	81.000000	65.000000
# 25%	    78.500000	88.500000	78.500000	81.500000	69.000000
# 50%	    87.000000	90.000000	87.000000	82.000000	73.000000
# 75%	    91.500000	94.500000	93.500000	91.000000	79.000000
# max	    96.000000	99.000000	100.000000	100.000000	85.000000

# Get the mean of each column
grades.mean()
# Wally    84.333333
# Eva      92.333333
# Sam      87.000000
# Katie    87.666667
# Bob      77.666667
# dtype: float64

# Get the mean of each row
grades.mean(axis=1)
# Test1    92.6
# Test2    83.2
# Test3    84.0
# dtype: float64

# Set precision to 2 decimal places
# pd.options.display.precision = 3
# OR
# pd.set_option('precision', 2)

# Transpose the DataFrame
grades.T.describe()
#       Test1	  Test2	  Test3
# count	5.000	  5.000	  5.000
# mean	92.600	81.200	83.400
# std	  7.436	  11.541	8.234
# min	  83.000	65.000	70.000
# 25%	  87.000	77.000	82.000
# 50%	  94.000	81.000	85.000
# 75%	  99.000	87.000	90.000
# max	  100.000	96.000	90.000

# Sort the DataFrame by a column
grades.sort_values(by='Eva')
#           Wally	  Eva	  Sam	  Katie	  Bob
# Test2     96	    87	  77    81	    65
# Test3	    70	    90	  90	  82	    85
# Test1	    87	    99	  94	  100	    83

# Sort the DataFrame by a row
grades.sort_values(by='Test1', axis=1)
#           Bob	  Eva	  Wally	  Sam	  Katie
# Test1	    83	    99	  87	    94	  100
# Test2     65	    87	  96	    77	  81
# Test3	    85	    90	  70	    90	  82

# Sort the DataFrame by a row in descending order
grades.sort_values(by='Test1', axis=1, ascending=False)
#           Katie	  Sam	  Wally	  Eva	  Bob
# Test1	    100	    94	  87	    99	  83
# Test2     81	    77	  96	    87	  65
# Test3	    82	    90	  70	    90	  85

# Sort the DataFrame by index
grades.sort_index(ascending=False)
#           Wally	  Eva	  Sam	  Katie	  Bob
# Test3	    70	    90	  90	  82	    85
# Test2     96	    87	  77    81	    65
# Test1	    87	    99	  94	  100	    83

# Sort the DataFrame by a column on a transposed DataFrame
grades.T.sort_values(by='Test1', ascending=False)
#       Test1	  Test2	  Test3
# Katie	100	    81	    82
# Sam	  94	    77	    90
# Eva	  99	    87	    90
# Wally	87	    96	    70
# Bob	  83	    65	    85

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,99,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


Self-Check

In [33]:
# Convert the dictionary named temperatures to a DataFrame with 'Low' and 'High' as the indices, then display the DataFrame.
temps = {'Monday': [68, 71], 'Tuesday': [71, 76], 'Wednesday': [75, 80], 'Thursday': [80, 84], 'Friday': [84, 90], 'Saturday': [87, 93], 'Sunday': [91, 96]}
temperatures = pd.DataFrame(temps, index=['High', 'Low'])
# 	    Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday
# High	68	    71	    75	       80	      84	    87	      91
# Low	   71	    76	    80	       84	      90	    93	      96

# Use the column names to select only the columns for 'Mon' through 'Wed'.
temperatures[['Monday', 'Tuesday', 'Wednesday']]
# 	    Monday	Tuesday	Wednesday
# High	68	    71	    75
# Low	   71	    76	    80

# Use the row index 'Low' to select only the low temperatures for each day.
temperatures.loc['Low']
# Monday       71
# Tuesday      76
# Wednesday    80
# Thursday     84
# Friday       90
# Saturday     93
# Sunday       96
# Name: Low, dtype: int64

# Set the floating point precision to 2 decimal places, then calculate the average temperature for each day.
pd.set_option('precision', 2)
temperatures.mean()
# Monday       69.50
# Tuesday      73.50
# Wednesday    77.50
# Thursday     82.00
# Friday       87.00
# Saturday     90.00
# Sunday       93.50
# dtype: float64

# Calculate the average low and high temperatures.
temperatures.mean(axis=1)
# High    78.14
# Low     81.86



High    79.43
Low     84.29
dtype: float64