# ANALYSING POLICE DATASET

In [2]:
import pandas as pd

In [1]:
"a" == "A"

False

In [3]:
police = pd.read_csv("Police.csv")

In [6]:
police.dtypes

stop_date              object
stop_time              object
country_name          float64
driver_gender          object
driver_age_raw        float64
driver_age            float64
driver_race            object
violation_raw          object
violation              object
search_conducted         bool
search_type            object
stop_outcome           object
is_arrested            object
stop_duration          object
drugs_related_stop       bool
dtype: object

#### Instruction (For Data Cleaning)
### Remove the column that only contains missing values

In [7]:
police.isnull().sum()

stop_date                 0
stop_time                 0
country_name          65535
driver_gender          4061
driver_age_raw         4054
driver_age             4307
driver_race            4060
violation_raw          4060
violation              4060
search_conducted          0
search_type           63056
stop_outcome           4060
is_arrested            4060
stop_duration          4060
drugs_related_stop        0
dtype: int64

In [9]:
police.drop(columns="country_name", inplace=True)

In [11]:
police.isnull().sum()

stop_date                 0
stop_time                 0
driver_gender          4061
driver_age_raw         4054
driver_age             4307
driver_race            4060
violation_raw          4060
violation              4060
search_conducted          0
search_type           63056
stop_outcome           4060
is_arrested            4060
stop_duration          4060
drugs_related_stop        0
dtype: int64

#### Question ( Based on Filtering + Value Counts)
### 2. For Speeding, were Men or Women stopped more often ?

In [4]:
police.head()

Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [9]:
police.loc[1,]

KeyError: 3

In [24]:
police[police.violation == "Speeding"].driver_gender.value_counts()

M    25517
F    11686
Name: driver_gender, dtype: int64

#### Question (Groupby
### 3. Does gender affect who gets searched during a stop ?

In [25]:
police.head()

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [32]:
police.groupby("driver_gender").search_conducted.sum()

driver_gender
F     366
M    2113
Name: search_conducted, dtype: int64

#### Question (mapping - data-type casting)
### 4. What is the mean stop-duration ?

In [33]:
police.head()

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [36]:
#since the stop-duration column is of object datatype, we cant calculate the mean, so we need to convert it to a numeric datatype

#using map() function

police["stop_duration"] = police["stop_duration"].map({"0-15 Min":7.5, "16-30 Min":22.5,"30+ Min":41.5})

In [35]:
police["stop_duration"].value_counts()

0-15 Min     47379
16-30 Min    11448
30+ Min       2647
2                1
Name: stop_duration, dtype: int64

In [38]:
police["stop_duration"].mean() # Answer -- 11.757377102514884

11.757377102514884

#### Question ( Groupby , Describe)
### 5. Compare the age distributions for each violation

In [40]:
police.head(2)

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,7.5,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,7.5,False


In [51]:
police.groupby("violation").driver_age.describe()

police.groupby("driver_age").violation.describe()


Unnamed: 0_level_0,count,unique,top,freq
driver_age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15.0,5,2,Moving violation,4
16.0,34,5,Speeding,18
17.0,449,5,Speeding,338
18.0,1344,5,Speeding,980
19.0,2388,5,Speeding,1655
...,...,...,...,...
83.0,2,2,Speeding,1
84.0,3,1,Speeding,3
85.0,1,1,Moving violation,1
86.0,6,3,Speeding,3
