In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# ri stands for Rhode Island
ri = pd.read_csv('police.csv')
ri.head()

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


## 1. Remove the column that only contains missing values

In [4]:
ri.isnull().sum()/len(ri)

stop_date             0.000000
stop_time             0.000000
county_name           1.000000
driver_gender         0.058153
driver_age_raw        0.058066
driver_age            0.061270
driver_race           0.058131
violation_raw         0.058131
violation             0.058131
search_conducted      0.000000
search_type           0.965163
stop_outcome          0.058131
is_arrested           0.058131
stop_duration         0.058131
drugs_related_stop    0.000000
dtype: float64

In [7]:
ri.drop('county_name',axis=1,inplace=True)

In [8]:
ri.columns

Index(['stop_date', 'stop_time', 'driver_gender', 'driver_age_raw',
       'driver_age', 'driver_race', 'violation_raw', 'violation',
       'search_conducted', 'search_type', 'stop_outcome', 'is_arrested',
       'stop_duration', 'drugs_related_stop'],
      dtype='object')

## ## 2. Do men or women speed more often?

In [9]:
ri.nunique() # use violation than violation_raw

stop_date             3768
stop_time             1436
driver_gender            2
driver_age_raw          97
driver_age              78
driver_race              5
violation_raw           12
violation                6
search_conducted         2
search_type             24
stop_outcome             6
is_arrested              2
stop_duration            5
drugs_related_stop       2
dtype: int64

In [11]:
ri.groupby('driver_gender')['violation'].value_counts()

driver_gender  violation          
F              Speeding               15482
               Moving violation        3204
               Equipment               2487
               Registration/plates     1013
               Other                    690
               Seat belt                635
M              Speeding               32979
               Moving violation       13020
               Equipment               8533
               Other                   3627
               Registration/plates     2419
               Seat belt               2317
Name: violation, dtype: int64

In [None]:
## 3. Does gender affect who gets searched during a stop? ([video](https://www.youtube.com/watch?v=WzpGq1X5U1M&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=4))1

In [12]:
ri.head()

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [15]:
ri.groupby('driver_gender')['search_conducted'].value_counts(normalize=True).unstack()

search_conducted,False,True
driver_gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,0.979967,0.020033
M,0.956674,0.043326


In [20]:

from scipy.stats import chi2_contingency
table = ri.groupby('driver_gender')['search_conducted'].value_counts().unstack()
stat, p, dof, expected = chi2_contingency(table)

In [22]:
print(stat,p,dof,expected)

260.02171993380176 1.6973902964935056e-58 1 [[22641.37108534   869.62891466]
 [60568.62891466  2326.37108534]]


In [19]:
obs = np.array([[10, 10, 20], [20, 20, 20]])
obs

array([[10, 10, 20],
       [20, 20, 20]])

In [None]:
from scipy import stats
female_viq = ri[ri['driver_gender'] == 'F']['search_conducted']
male_viq = ri[ri['driver_gender'] == 'M']['search_conducted']
stats.ttest_ind(female_viq, male_viq) 

Why is search_type missing so often

In [24]:
ri.head()

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [29]:
ri.stop_date=R

stop_date              object
stop_time              object
driver_gender          object
driver_age_raw        float64
driver_age            float64
driver_race            object
violation_raw          object
violation              object
search_conducted         bool
search_type            object
stop_outcome           object
is_arrested            object
stop_duration          object
drugs_related_stop       bool
dtype: object

In [30]:
idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank'])
idx

Index([' jack', 'jill ', ' jesse ', 'frank'], dtype='object')

In [32]:
idx.str.strip()

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [27]:
import
ri['year']=ri.stop_date.dt.year()
#ri.stop_datetime.dt.year.value_counts()

AttributeError: Can only use .dt accessor with datetimelike values

In [25]:
ri.groupby('search_type')['search_conducted'].value_counts()

search_type                                               search_conducted
Incident to Arrest                                        True                1219
Incident to Arrest,Inventory                              True                 129
Incident to Arrest,Inventory,Probable Cause               True                  34
Incident to Arrest,Inventory,Protective Frisk             True                  11
Incident to Arrest,Inventory,Reasonable Suspicion         True                   4
Incident to Arrest,Probable Cause                         True                 106
Incident to Arrest,Probable Cause,Protective Frisk        True                  10
Incident to Arrest,Probable Cause,Reasonable Suspicion    True                   6
Incident to Arrest,Protective Frisk                       True                  33
Incident to Arrest,Protective Frisk,Reasonable Suspicion  True                   1
Incident to Arrest,Reasonable Suspicion                   True                  13
Inventory   

In [33]:
ri.stop_duration.value_counts()

0-15 Min     69543
16-30 Min    13635
30+ Min       3228
2                1
1                1
Name: stop_duration, dtype: int64