# Data exploration with Pandas

In [1]:
import pandas as pd

In [2]:
# Load the housing CSV from a path relative to the current working directory
# and preview the first five rows.
housing = pd.read_csv('./data/california-housing.csv')
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# Column names, non-null counts, dtypes and the in-memory size of the frame.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   MedVal      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [4]:
!ls -lh data/california-housing.csv

-rw-r--r-- 1 rick446 rick446 1.9M Sep 11  2020 data/california-housing.csv


We can get quick descriptive statistics of numerical data:

In [5]:
# Default summary: count, mean, std, min, quartiles and max per numeric column.
housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


We can also look at the data in a more fine-grained way:

In [6]:
# Quantile cut points 0.0, 0.1, ..., 0.9 expressed as fractions.
deciles = [tenth / 10 for tenth in range(0, 10)]
deciles

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [7]:
# Report the decile cut points instead of the default quartiles.
housing.describe(percentiles=deciles)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
0%,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
10%,1.9038,13.0,3.790884,0.966415,510.0,2.07563,33.63,-122.29,0.823
20%,2.3523,17.0,4.266667,0.995448,710.0,2.338822,33.87,-121.98,1.072
30%,2.7401,20.0,4.611811,1.01495,859.0,2.511743,34.0,-121.37,1.34
40%,3.1406,25.0,4.934005,1.032452,1007.0,2.667583,34.1,-119.91,1.573
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797


What about just 90-100%?

In [8]:
# Quantile cut points 0.90, 0.91, ..., 0.99 expressed as fractions.
percentiles = [pct / 100 for pct in range(90, 100, 1)]
percentiles

[0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]

In [9]:
# The 50% row still appears in the result even though 0.5 is not in `percentiles`.
housing.describe(percentiles=percentiles)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
90%,6.15921,46.0,6.961188,1.172727,2566.0,3.885273,38.48,-117.25,3.766
91%,6.310133,47.0,7.062358,1.184375,2677.0,3.95612,38.55,-117.22,3.93049
92%,6.500576,48.0,7.175229,1.197751,2795.0,4.047905,38.6,-117.1712,4.10288
93%,6.696683,50.0,7.296364,1.2149,2932.27,4.133755,38.66,-117.13,4.31654
94%,6.940296,52.0,7.450781,1.239366,3095.66,4.227965,38.73,-117.1,4.521


`describe()` returns a dataframe, so you can do all the normal dataframe operations:

In [10]:
(
    housing
    .describe(percentiles=deciles)
    .iloc[4:]  # skip the count / mean / std / min rows
    .loc[:, ['AveRooms', 'AveBedrms', 'AveOccup']]
)

Unnamed: 0,AveRooms,AveBedrms,AveOccup
0%,0.846154,0.333333,0.692308
10%,3.790884,0.966415,2.07563
20%,4.266667,0.995448,2.338822
30%,4.611811,1.01495,2.511743
40%,4.934005,1.032452,2.667583
50%,5.229129,1.04878,2.818116
60%,5.520848,1.065969,2.980381
70%,5.853437,1.087137,3.16838
80%,6.268581,1.115385,3.424272
90%,6.961188,1.172727,3.885273


## Filtering

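Something *weird* is happening in the 99th percentile of AveOccup (the maximum is 1243.33, while the 75th percentile is only 3.28). Let's look at just those rows using Pandas filtering, treating anything more than three standard deviations above the mean as an outlier: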

In [11]:
# Cut-off: three standard deviations above the mean of AveOccup.
outlier_threshold = housing.AveOccup.mean() + 3 * housing.AveOccup.std()
outlier_threshold

34.22880384607723

In [12]:
# Boolean mask: True for every row whose AveOccup exceeds the threshold.
outliers = housing.AveOccup > outlier_threshold
outliers.head()

0    False
1    False
2    False
3    False
4    False
Name: AveOccup, dtype: bool

In [13]:
# The mask holds one flag per row of `housing`.
outliers.shape

(20640,)

In [14]:
# True counts as 1, so the sum is the number of flagged rows.
outliers.sum()

8

In [15]:
# The mean of a boolean mask is the fraction of rows that are flagged.
outliers.mean()

0.0003875968992248062

In [16]:
# Keep only the flagged rows, ordered by ascending AveOccup.
housing.loc[outliers].sort_values('AveOccup')

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
8874,9.337,52.0,7.285714,1.214286,1154.0,41.214286,34.06,-118.45,5.00001
16420,5.7485,26.0,5.366667,0.9,1542.0,51.4,37.89,-121.29,1.625
12104,1.625,8.0,7.6,0.95,1275.0,63.75,33.97,-117.33,1.625
9172,4.2391,5.0,5.12381,0.933333,8733.0,83.171429,34.47,-118.59,1.546
13034,6.1359,52.0,8.275862,1.517241,6675.0,230.172414,38.69,-121.15,2.25
16669,4.2639,46.0,9.076923,1.307692,6532.0,502.461538,35.32,-120.7,3.5
3364,5.5179,36.0,5.142857,1.142857,4198.0,599.714286,40.41,-120.51,0.675
19006,10.2264,45.0,3.166667,0.833333,7460.0,1243.333333,38.32,-121.98,1.375


In [17]:
# Same selection, with the mask written inline instead of stored in a variable.
housing.loc[housing.AveOccup > outlier_threshold].sort_values('AveOccup')

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
8874,9.337,52.0,7.285714,1.214286,1154.0,41.214286,34.06,-118.45,5.00001
16420,5.7485,26.0,5.366667,0.9,1542.0,51.4,37.89,-121.29,1.625
12104,1.625,8.0,7.6,0.95,1275.0,63.75,33.97,-117.33,1.625
9172,4.2391,5.0,5.12381,0.933333,8733.0,83.171429,34.47,-118.59,1.546
13034,6.1359,52.0,8.275862,1.517241,6675.0,230.172414,38.69,-121.15,2.25
16669,4.2639,46.0,9.076923,1.307692,6532.0,502.461538,35.32,-120.7,3.5
3364,5.5179,36.0,5.142857,1.142857,4198.0,599.714286,40.41,-120.51,0.675
19006,10.2264,45.0,3.166667,0.833333,7460.0,1243.333333,38.32,-121.98,1.375


What about mis-aligned indexes?

In [18]:
# The mask as a bare array, without its index labels.
outliers.values

array([False, False, False, ..., False, False, False])

In [19]:
# Same boolean values, relabelled so that no index label is shared with
# `housing`. The shift is derived from the mask length rather than hard-coding
# the row count, so the demonstration still works if the dataset changes size.
other_outliers = pd.Series(outliers.values, index=outliers.index + len(outliers))
other_outliers

20640    False
20641    False
20642    False
20643    False
20644    False
         ...  
41275    False
41276    False
41277    False
41278    False
41279    False
Length: 20640, dtype: bool

If you use a mis-aligned index, Pandas will complain

In [20]:
# This lookup fails on purpose. Catch the error and display it so that
# "Restart & Run All" can continue past this cell.
try:
    housing.loc[other_outliers]
except Exception as err:  # pandas raises IndexingError for an unalignable boolean mask
    print(f'{type(err).__name__}: {err}')

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [21]:
# A bare array carries no index labels, so there is nothing to align against.
housing.loc[other_outliers.values] # ... unless you refer to the underlying numpy array

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
3364,5.5179,36.0,5.142857,1.142857,4198.0,599.714286,40.41,-120.51,0.675
8874,9.337,52.0,7.285714,1.214286,1154.0,41.214286,34.06,-118.45,5.00001
9172,4.2391,5.0,5.12381,0.933333,8733.0,83.171429,34.47,-118.59,1.546
12104,1.625,8.0,7.6,0.95,1275.0,63.75,33.97,-117.33,1.625
13034,6.1359,52.0,8.275862,1.517241,6675.0,230.172414,38.69,-121.15,2.25
16420,5.7485,26.0,5.366667,0.9,1542.0,51.4,37.89,-121.29,1.625
16669,4.2639,46.0,9.076923,1.307692,6532.0,502.461538,35.32,-120.7,3.5
19006,10.2264,45.0,3.166667,0.833333,7460.0,1243.333333,38.32,-121.98,1.375


In [22]:
# For comparison: the properly aligned mask selects the same rows.
housing.loc[housing.AveOccup > outlier_threshold]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
3364,5.5179,36.0,5.142857,1.142857,4198.0,599.714286,40.41,-120.51,0.675
8874,9.337,52.0,7.285714,1.214286,1154.0,41.214286,34.06,-118.45,5.00001
9172,4.2391,5.0,5.12381,0.933333,8733.0,83.171429,34.47,-118.59,1.546
12104,1.625,8.0,7.6,0.95,1275.0,63.75,33.97,-117.33,1.625
13034,6.1359,52.0,8.275862,1.517241,6675.0,230.172414,38.69,-121.15,2.25
16420,5.7485,26.0,5.366667,0.9,1542.0,51.4,37.89,-121.29,1.625
16669,4.2639,46.0,9.076923,1.307692,6532.0,502.461538,35.32,-120.7,3.5
19006,10.2264,45.0,3.166667,0.833333,7460.0,1243.333333,38.32,-121.98,1.375


In [23]:
# `~` negates the mask, keeping the rows that are NOT above the threshold.
housing.loc[~(housing.AveOccup > outlier_threshold)].head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


## Pandas logical operators

Python does not allow overloading of `and`, `or`, and `not`, so Pandas uses `&`, `|`, and `~` for boolean operators

In [24]:
# Each comparison is parenthesised because `&` binds more tightly than `>`.
housing.loc[
    (housing.AveOccup > outlier_threshold)
    & (housing.Population > 3000)
]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
3364,5.5179,36.0,5.142857,1.142857,4198.0,599.714286,40.41,-120.51,0.675
9172,4.2391,5.0,5.12381,0.933333,8733.0,83.171429,34.47,-118.59,1.546
13034,6.1359,52.0,8.275862,1.517241,6675.0,230.172414,38.69,-121.15,2.25
16669,4.2639,46.0,9.076923,1.307692,6532.0,502.461538,35.32,-120.7,3.5
19006,10.2264,45.0,3.166667,0.833333,7460.0,1243.333333,38.32,-121.98,1.375


## Sorting

We can also sort values to see if maybe the 'weird' occupancy numbers line up with the 'weird' # of bedrooms

In [25]:
# Ten rows with the largest AveOccup, largest first.
housing.sort_values('AveOccup', ascending=False).head(10)  # or .iloc[:10]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
19006,10.2264,45.0,3.166667,0.833333,7460.0,1243.333333,38.32,-121.98,1.375
3364,5.5179,36.0,5.142857,1.142857,4198.0,599.714286,40.41,-120.51,0.675
16669,4.2639,46.0,9.076923,1.307692,6532.0,502.461538,35.32,-120.7,3.5
13034,6.1359,52.0,8.275862,1.517241,6675.0,230.172414,38.69,-121.15,2.25
9172,4.2391,5.0,5.12381,0.933333,8733.0,83.171429,34.47,-118.59,1.546
12104,1.625,8.0,7.6,0.95,1275.0,63.75,33.97,-117.33,1.625
16420,5.7485,26.0,5.366667,0.9,1542.0,51.4,37.89,-121.29,1.625
8874,9.337,52.0,7.285714,1.214286,1154.0,41.214286,34.06,-118.45,5.00001
13366,4.2578,36.0,5.258824,1.117647,2886.0,33.952941,33.94,-117.63,1.833
5986,1.875,52.0,4.5,1.206349,2688.0,21.333333,34.1,-117.71,2.125


What about sorting by the average number of bedrooms?

In [26]:
# Ten rows with the largest AveBedrms, largest first.
housing.sort_values('AveBedrms', ascending=False).head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
1979,4.625,34.0,132.533333,34.066667,36.0,2.4,38.8,-120.08,1.625
1914,1.875,33.0,141.909091,25.636364,30.0,2.727273,38.91,-120.1,5.00001
11862,2.625,25.0,59.875,15.3125,28.0,1.75,40.27,-121.25,0.675
12447,1.6154,17.0,62.422222,14.111111,83.0,1.844444,33.97,-114.49,0.875
9676,3.2431,14.0,52.848214,11.410714,265.0,2.366071,37.64,-119.02,2.214
1240,3.125,11.0,47.515152,11.181818,82.0,2.484848,38.42,-120.19,0.775
1913,4.0714,19.0,61.8125,11.0,112.0,2.333333,39.01,-120.06,4.375
2395,3.875,23.0,50.837838,10.27027,64.0,1.72973,37.12,-119.34,1.25
1912,4.975,16.0,56.269231,10.153846,54.0,2.076923,39.01,-120.16,2.063
1102,2.4028,17.0,31.777778,9.703704,47.0,1.740741,40.06,-121.54,0.675


If we are aiming to fit a model that needs things to be normally distributed, this is going to be a problem.

Let's generate thresholds for:

- AveRooms
- AveBedrms
- AveOccup

And remove the rows that exceed them from our dataset

In [27]:
def outlier(series, n_std=2):
    """Flag values lying more than ``n_std`` standard deviations above the mean.

    Only the upper tail is tested; unusually small values are never flagged.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to test.
    n_std : float, default 2
        How many standard deviations above the mean the cut-off sits.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``series``; True where the value
        is strictly greater than the cut-off.
    """
    threshold = series.mean() + n_std * series.std()
    return series > threshold

In [28]:
# A row is flagged when any of the three per-household averages is an outlier.
outliers = (
    housing[['AveRooms', 'AveBedrms', 'AveOccup']]
    .apply(outlier)
    .any(axis=1)
)
outliers.mean()

0.012403100775193798

In [29]:
# The rows flagged by the combined mask.
housing_outliers = housing.loc[outliers]
housing_outliers.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
710,2.4196,26.0,8.518248,2.70073,253.0,1.846715,37.68,-122.08,2.75
1023,2.2417,15.0,10.515306,2.372449,573.0,2.923469,38.72,-119.93,0.979
1024,3.15,16.0,29.852941,5.323529,202.0,1.980392,38.52,-120.0,1.406
1030,3.0125,15.0,10.168591,2.057737,1103.0,2.547344,38.55,-120.25,1.117
1102,2.4028,17.0,31.777778,9.703704,47.0,1.740741,40.06,-121.54,0.675


In [30]:
# Everything that was NOT flagged by the combined mask.
clean_housing = housing.loc[~outliers]
clean_housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [31]:
# Re-check the summary statistics now that the flagged rows are gone.
clean_housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal
count,20384.0,20384.0,20384.0,20384.0,20384.0,20384.0,20384.0,20384.0,20384.0
mean,3.870837,28.760106,5.273221,1.064027,1434.244456,2.937328,35.617946,-119.575838,2.073395
std,1.893866,12.568312,1.227493,0.114614,1130.547167,0.868192,2.127513,2.00072,1.154186
min,0.4999,1.0,0.846154,0.333333,3.0,0.75,32.54,-124.35,0.14999
25%,2.5634,18.0,4.430393,1.005501,796.0,2.434783,33.93,-121.8,1.2
50%,3.53635,29.0,5.211935,1.047736,1173.0,2.82454,34.25,-118.49,1.8055
75%,4.74685,37.0,6.013234,1.096963,1731.0,3.286385,37.71,-118.01,2.655
max,15.0001,52.0,10.375479,2.04,35682.0,21.333333,41.95,-114.55,5.00001


## Counting things

Sometimes our data isn't all numerical

In [32]:
# Load the crime CSV, asking pandas to parse the 'Dates' column as datetimes.
crime = pd.read_csv('./data/sf_crime_truncated.csv', parse_dates=['Dates'])
crime.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2009-06-11 13:45:00,OTHER OFFENSES,CONSPIRACY,Thursday,TARAVAL,JUVENILE BOOKED,19TH AV / OCEAN AV,-122.474954,37.732456
1,2005-10-17 12:00:00,ASSAULT,THREATENING PHONE CALL(S),Monday,TARAVAL,NONE,1500 Block of SLOAT BL,-122.489714,37.73395
2,2012-09-20 20:30:00,NON-CRIMINAL,LOST PROPERTY,Thursday,MISSION,NONE,1800 Block of FOLSOM ST,-122.415605,37.767718
3,2006-03-25 15:28:00,SECONDARY CODES,DOMESTIC VIOLENCE,Saturday,RICHMOND,"ARREST, BOOKED",800 Block of 28TH AV,-122.487534,37.773336
4,2013-10-01 00:33:00,WARRANTS,ENROUTE TO PAROLE OFFICER,Tuesday,MISSION,"ARREST, BOOKED",1200 Block of CHURCH ST,-122.427465,37.751296


In [33]:
# Only the numeric columns (X and Y) show up in the default summary.
crime.describe()

Unnamed: 0,X,Y
count,20000.0,20000.0
mean,-122.422499,37.772153
std,0.031782,0.522885
min,-122.513642,37.708154
25%,-122.43322,37.752239
50%,-122.416349,37.775421
75%,-122.406841,37.784401
max,-120.5,90.0


How do our 'object' columns distribute?

In [34]:
# Ten most frequent categories. Uses the Series method because the top-level
# `pd.value_counts` function is deprecated.
crime.Category.value_counts().head(10)

LARCENY/THEFT     3983
OTHER OFFENSES    2794
NON-CRIMINAL      2102
ASSAULT           1793
VEHICLE THEFT     1270
DRUG/NARCOTIC     1264
VANDALISM          970
WARRANTS           922
BURGLARY           860
SUSPICIOUS OCC     727
Name: Category, dtype: int64

In [35]:
# Incidents per weekday, most frequent first. Uses the Series method because
# the top-level `pd.value_counts` function is deprecated.
crime.DayOfWeek.value_counts()

Wednesday    3029
Friday       3025
Saturday     2889
Thursday     2856
Tuesday      2798
Monday       2734
Sunday       2669
Name: DayOfWeek, dtype: int64

In [36]:
# Weekday distribution for ASSAULT incidents only. Uses the Series method
# because the top-level `pd.value_counts` function is deprecated.
crime.loc[crime.Category == 'ASSAULT', 'DayOfWeek'].value_counts()

Sunday       292
Saturday     287
Friday       278
Wednesday    249
Monday       246
Thursday     231
Tuesday      210
Name: DayOfWeek, dtype: int64

In [37]:
# Weekday distribution for WARRANTS incidents only. Uses the Series method
# because the top-level `pd.value_counts` function is deprecated.
crime.loc[crime.Category == 'WARRANTS', 'DayOfWeek'].value_counts()

Wednesday    158
Thursday     139
Monday       133
Tuesday      127
Friday       127
Sunday       122
Saturday     116
Name: DayOfWeek, dtype: int64

In [38]:
# Incidents per police district, most frequent first. Uses the Series method
# because the top-level `pd.value_counts` function is deprecated.
crime.PdDistrict.value_counts()

SOUTHERN      3529
MISSION       2625
NORTHERN      2409
BAYVIEW       2102
CENTRAL       1977
TENDERLOIN    1874
INGLESIDE     1852
TARAVAL       1487
PARK          1129
RICHMOND      1016
Name: PdDistrict, dtype: int64

Open [Pandas exploration lab][pandas-exploration-lab]

[pandas-exploration-lab]: ./pandas-exploration-lab.ipynb