# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy.stats import zscore
import requests
from io import StringIO
import string
from operator import length_hint
import time
import glob
import os

root_path = 'D:/Documents/Data Analysis/Datasets/pandas-workout-data/data/'

# 1. Series

## Exercise 01 - Test scores

In [7]:
# instantiate random number generator with seed 0
g = np.random.default_rng(0)

# generate reproducible random test scores
month_scores = g.integers(70, 101, 10)

In [8]:
# datetime cannot return full written month name, must use index instead
test_date = datetime(2024, 7, 14)
test_date.month

7

In [9]:
# date_range creates reusability, customizability
months = pd.date_range(start='2023-09', end='2024-07', freq='M').month_name()
months

Index(['September', 'October', 'November', 'December', 'January', 'February',
       'March', 'April', 'May', 'June'],
      dtype='object')

In [10]:
# create "grades" series combining scores with month index
grades = pd.Series(month_scores, index=months)
grades

September    96
October      89
November     85
December     78
January      79
February     71
March        72
April        70
May          75
June         95
dtype: int64

In [11]:
# avg test score whole school year
grades.mean()

81.0

In [12]:
# avg score first 5 months
first_half = grades.iloc[:5].mean()
first_half

85.4

In [13]:
# avg score last 5 months
second_half = grades.iloc[5:].mean()
second_half

76.6

In [14]:
# did student improve? by how much?
print(f'Student improved score by {second_half - first_half}')
if second_half < first_half:
    print('Study up!')

Student improved score by -8.800000000000011
Study up!


### Exercise 01b

In [16]:
# month highest score?
month_high = grades.idxmax()
month_high

'September'

In [17]:
# 5 highest scores
grades.sort_values(ascending=False).head().to_list()

[96, 95, 89, 85, 79]

In [18]:
# round scores to nearest 10
(grades/10).round() * 10

September    100.0
October       90.0
November      80.0
December      80.0
January       80.0
February      70.0
March         70.0
April         70.0
May           80.0
June         100.0
dtype: float64

### Notes
- could have used index method instead
- .split() can make it easier to type out a list, but looks messy/unclear and can cause issues when there are spaces
- iloc and loc both work here (loc takes twice as long to run)
- could even also used head() and tail() to find first/last 5 values

In [20]:
grades.index='Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()
grades

Sep    96
Oct    89
Nov    85
Dec    78
Jan    79
Feb    71
Mar    72
Apr    70
May    75
Jun    95
dtype: int64

## Exercise 02 - Scaling test scores

In [22]:
# instantiate RNG, seed 0
g = np.random.default_rng(0)

# test scores for challenging test
hard_test_scores = g.integers(40, 61, 10)

# create abbreviated month index using split
month_index = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()

# combine into a series
hard_test = pd.Series(hard_test_scores, index=month_index)
hard_test

Sep    57
Oct    53
Nov    50
Dec    45
Jan    46
Feb    40
Mar    41
Apr    40
May    43
Jun    57
dtype: int64

In [23]:
# scale the challenging test scores to a mean of 80
hard_mean = hard_test.mean()
scaler = 80 - hard_mean

scaled_test = hard_test + scaler
scaled_test

Sep    89.8
Oct    85.8
Nov    82.8
Dec    77.8
Jan    78.8
Feb    72.8
Mar    73.8
Apr    72.8
May    75.8
Jun    89.8
dtype: float64

### Exercise 02b

In [25]:
# scale into grade labels using the standard deviation

# standard deviation
stand = hard_test.std()

# define grade cutoffs
A = hard_mean + stand
B = hard_mean
C = hard_mean - stand

# grab values so converts to categorical object
hard_list = hard_test.values

# cut into bins with grade labels
grade_cat = pd.cut(hard_list, bins=[0, C, B, A, 100], labels='D C B A'.split())

# ensure is categorical object
print(type(grade_cat))

# codes
print(grade_cat.codes)

# categories
print(grade_cat.categories)

# create series again (could've also used series originally but I wanted to test the methods)
std_grades = pd.Series(grade_cat, index=month_index)
std_grades

<class 'pandas.core.arrays.categorical.Categorical'>
[3 2 2 1 1 0 1 0 1 3]
Index(['D', 'C', 'B', 'A'], dtype='object')


Sep    A
Oct    B
Nov    B
Dec    C
Jan    C
Feb    D
Mar    C
Apr    D
May    C
Jun    A
dtype: category
Categories (4, object): ['D' < 'C' < 'B' < 'A']

In [26]:
# outliers (scores 2 std above or below mean)
above = hard_mean + (stand * 2)
below = hard_mean - (stand * 2)

outliers = (hard_test > above) | (hard_test < below)
print(outliers)
print(f'\nAny outliers?: {outliers.any()}')

Sep    False
Oct    False
Nov    False
Dec    False
Jan    False
Feb    False
Mar    False
Apr    False
May    False
Jun    False
dtype: bool

Any outliers?: False


In [27]:
# compare difference between mean and median
print(f' Mean: {hard_test.mean()}')
print(f' Mean: {hard_test.median()}')

# are fairly close, indicating relatively normal distribution

 Mean: 47.2
 Mean: 45.5


## Exercise 03 - Counting tens digits 

In [29]:
# instantiate RNG, seed 0
g = np.random.default_rng(0)

# generate random integers
counting = g.integers(0, 100, 10)

# create into series
counting = pd.Series(counting)
counting

0    85
1    63
2    51
3    26
4    30
5     4
6     7
7     1
8    17
9    81
dtype: int64

In [30]:
# display tens digits (data type conversion strategy)
for x in counting:
    if x >=10:
        x = str(x)
        x = x[0]
        x = int(x)
        print(x)
    else:
        print(0)

8
6
5
2
3
0
0
0
1
8


### Exercise 03b
- Range 0 - 10,000:
- just have to make sure using negative indexing
- smallest data type: np.int16 (or np.uint16)

In [32]:
# 10 floating point values between 0 and 1000
g = np.random.default_rng(0)

even = pd.Series(g.random(10) * 1000)
even

0    636.961687
1    269.786714
2     40.973524
3     16.527636
4    813.270239
5    912.755577
6    606.635776
7    729.496561
8    543.624991
9    935.072424
dtype: float64

In [33]:
# find numbers whose integer component is even
mask = ((even//1)/2).astype('str').str.get(-1).astype('int')==0
print(mask, '\n')

print(even.loc[mask])

0     True
1    False
2     True
3     True
4    False
5     True
6     True
7    False
8    False
9    False
dtype: bool 

0    636.961687
2     40.973524
3     16.527636
5    912.755577
6    606.635776
dtype: float64


### Notes
- other options are to manually convert to int to truncate or do floor division
- with strings, can use get and fillna to get same result in one line (split for readability with open parentheses)
- modulo makes the process of finding even numbers much easier

In [35]:
# manual conversion
(counting / 10).astype('int64')

0    8
1    6
2    5
3    2
4    3
5    0
6    0
7    0
8    1
9    8
dtype: int64

In [36]:
# floor division
(counting // 10)

0    8
1    6
2    5
3    2
4    3
5    0
6    0
7    0
8    1
9    8
dtype: int64

In [37]:
# string conversion w/ get and fillna
(
counting
    .astype('string')
    .str.get(-2)
    .fillna('0')
    .astype('int8')
)

0    8
1    6
2    5
3    2
4    3
5    0
6    0
7    0
8    1
9    8
dtype: int8

In [38]:
even.astype(np.int64) % 2

0    0
1    1
2    0
3    0
4    1
5    0
6    0
7    1
8    1
9    1
dtype: int64

## Exercise 04 - Descriptive statistics

In [40]:
# instantiate RNG, seed 0
g = np.random.default_rng(0)

# create a large normal distribution
normal_dist = pd.Series(g.normal(0, 100, 100000))
normal_dist

0         12.573022
1        -13.210486
2         64.042265
3         10.490012
4        -53.566937
            ...    
99995    -91.667135
99996   -231.480500
99997     -0.028179
99998   -109.645051
99999    -49.541294
Length: 100000, dtype: float64

In [41]:
# descriptive stats
stats = normal_dist.describe()
stats

count    100000.000000
mean         -0.090825
std         100.013350
min        -449.411704
25%         -67.292120
50%          -0.414699
75%          67.636542
max         473.195769
dtype: float64

In [42]:
# replace min with 5 * max
new_min = stats.loc['max'] * 5

normal_dist.iloc[normal_dist.idxmin()] = new_min

In [43]:
# new descriptive stats
new_stats =  normal_dist.describe()
new_stats

count    100000.000000
mean         -0.062671
std         100.282770
min        -402.315865
25%         -67.288054
50%          -0.409289
75%          67.640758
max        2365.978844
dtype: float64

### Exercise 04b

In [45]:
# demonstrate 1, 2, 3 SD away from mean
# 68, 95, 99.7 rule
g = np.random.default_rng(0)
normal_dist = pd.Series(g.normal(0, 100, 100000))

mean = normal_dist.mean()
std = normal_dist.std()

print(f'Mean: {mean}')
print(f'Standard Deviation: {std}')

Mean: -0.09082507731206121
Standard Deviation: 100.01335047331727


In [46]:
one_std = (normal_dist >= (mean-std)) & (normal_dist <= (mean+std))
print(f'Within one std: {one_std.mean()}')

two_std = (normal_dist >= (mean-std*2)) & (normal_dist <= (mean+std*2))
print(f'Within two std: {two_std.mean()}')

three_std = (normal_dist >= (mean-std*3)) & (normal_dist <= (mean+std*3))
print(f'Within three std: {three_std.mean()}')

Within one std: 0.68396
Within two std: 0.95461
Within three std: 0.99708


In [47]:
# mean of numbers greater than and less than mean

# greater than mean
greater_mean = normal_dist[normal_dist > mean].mean()
print(f'Mean of greater than: {greater_mean}')

# less than mean
less_mean = normal_dist[normal_dist < mean].mean()
print(f'Mean of less than: {less_mean}')

# they're relatively inverse in a normal dist

# averaged
print(f'Averaged: {(greater_mean + less_mean)/2}\n')

# compare to mean of entire dataset
print(f'Entire dataset mean: {mean}')

Mean of greater than: 79.92646167808225
Mean of less than: -79.66763213378562
Averaged: 0.12941477214831565

Entire dataset mean: -0.09082507731206121


In [48]:
# mean beyond 3 stds
mask = (normal_dist < (mean-std*3)) | (normal_dist > (mean+std*3))
normal_dist[mask].mean()

-11.606040282602287

### Notes
- remember '_' can be used to separate a number for clarification, e.g. 100_000
- can instead find ALL matches to the minimum (rather than just the first) with a mask s==s.min()

## Exercise 05 - Monday temperatures

In [51]:
# instantiate RNG, seed 0
g = np.random.default_rng(0)

# create random temperature values
temp_values = g.normal(20, 5, 28).round().astype('int8')

# define days of week
dow = 'Sun Mon Tue Wed Thu Fri Sat'.split()

# combine into a series
# multiplying index by 4 to extend it fourfold
temps = pd.Series(temp_values, index=dow*4)
temps

Sun    21
Mon    19
Tue    23
Wed    21
Thu    17
Fri    22
Sat    27
Sun    25
Mon    16
Tue    14
Wed    17
Thu    20
Fri     8
Sat    19
Sun    14
Mon    16
Tue    17
Wed    18
Thu    22
Fri    25
Sat    19
Sun    27
Mon    17
Tue    22
Wed    25
Thu    20
Fri    16
Sat    15
dtype: int8

In [52]:
# Mondays mean
temps.loc['Mon'].mean()

17.0

### Exercise 05b

In [54]:
# avg weekend temp
temps.loc[['Sat', 'Sun']].mean()

20.875

In [55]:
# how many instances of 2-degree or greater difference in temp
(abs(temps.diff()) >= 2).sum()

24

In [56]:
# 2 most common temps and how often each appears
temps.value_counts().head(2)

17    4
19    3
Name: count, dtype: int64

## Exercise 06 - Passenger frequency

In [58]:
# load in CSV
passengers = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\taxi-passenger-count.csv', header=None)

# squeeze into a series
passengers = passengers.squeeze()
passengers

0       1
1       1
2       1
3       1
4       1
       ..
9994    1
9995    1
9996    1
9997    6
9998    1
Name: 0, Length: 9999, dtype: int64

In [59]:
# proportion of single riders and "max" riders
prop = passengers.value_counts(normalize=True)

# single riders
solo = prop.loc[1]
print(f'Single riders: {(solo*100).round(2)}%')

# "max" riders, which is declared to be 6
max_peeps = prop.loc[6]
print(f'Max riders: {(max_peeps*100).round(2)}%')

Single riders: 72.08%
Max riders: 3.69%


### Exercise 06b

In [61]:
# 25, 50, 75 quantiles
print(f'Q1: {passengers.quantile(.25)}')
print(f'Q2: {passengers.quantile(.50)}')
print(f'Q3: {passengers.quantile(.75)}')

Q1: 1.0
Q2: 1.0
Q3: 2.0


In [62]:
# 3, 4, 5, or 6 passengers proportion
prop[[3, 4, 5, 6]].sum()

0.1477147714771477

## Exercise 07 - Long, medium, and short taxi rides

In [64]:
# load in CSV
distance = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\taxi-distance.csv', header=None)

# squeeze into a series
distance = distance.squeeze()
distance 

0       1.63
1       0.46
2       0.87
3       2.13
4       1.40
        ... 
9994    2.70
9995    4.50
9996    5.59
9997    1.54
9998    5.80
Name: 0, Length: 9999, dtype: float64

In [65]:
# map to category names

# Short <=2 miles
# Medium >2 & <=10 miles
# Long >10 miles

def categorize(x):
    if x <= 2:
        return 'Short'
    if x > 2 and x <= 10:
        return 'Medium'
    if x > 10:
        return 'Long'

# apply function to distance series
distance_cat = distance.apply(categorize)
distance_cat

0        Short
1        Short
2        Short
3       Medium
4        Short
         ...  
9994    Medium
9995    Medium
9996    Medium
9997     Short
9998    Medium
Name: 0, Length: 9999, dtype: object

In [66]:
# number of rides by category
distance_cat.value_counts()

0
Short     5890
Medium    3402
Long       707
Name: count, dtype: int64

### Exercise 07b

In [68]:
# compare mean and median
distance.describe().loc[['mean', '50%']]

mean    3.158511
50%     1.700000
Name: 0, dtype: float64

In [69]:
# combine passenger count and distance series
df = pd.concat([passengers, distance_cat], axis=1)

# rename columns
df.columns = ['passengers', 'distance']

# find counts of trip-length categories for single-passenger trips only
df.loc[df['passengers']==1,'distance'].value_counts()

distance
Short     4333
Medium    2387
Long       487
Name: count, dtype: int64

### Notes
- could also map using pd.cut, but it risks not properly categorizing some values in this instance

In [71]:
# map with pd.cut
distance_cut = pd.cut(distance, bins=[0, 2, 10, np.inf], labels=['Short', 'Medium', 'Long'], include_lowest=True)
distance_cut

0        Short
1        Short
2        Short
3       Medium
4        Short
         ...  
9994    Medium
9995    Medium
9996    Medium
9997     Short
9998    Medium
Name: 0, Length: 9999, dtype: category
Categories (3, object): ['Short' < 'Medium' < 'Long']