In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" axes style, then matplotlib's "fivethirtyeight" stylesheet.
# NOTE(review): the stylesheet is applied last and may override parts of the seaborn style — confirm the intended look.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# 1. Dataset: Stanford Open Policing Project

[Stanford Open Policing Project](https://openpolicing.stanford.edu/)

[NC State Trooper Dataset](https://www.kaggle.com/celodev/open-policing-nc-statewide-2020-04-01)

In [None]:
# Location of the NC statewide stops CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/open-policing-nc-statewide-2020-04-01/nc_statewide_2020_04_01.csv"
# Maximum number of rows to read; set to None to read every row in the file.
N_ROWS = 5_000_000

# Load the (possibly truncated) dataset and preview the first rows.
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)
df.head()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (pandas describes numeric columns by default).
df.describe()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count of missing values in every column.
missing_per_column = df.isna().sum()
missing_per_column

# 2. Do men or women speed more often?

In [None]:
# Number of stops per driver sex.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises a TypeError there.
sns.catplot(x='subject_sex', data=df, kind="count", height=7)

In [None]:
# Raw number of stops for each value of subject_sex.
stops_by_sex = df["subject_sex"].value_counts()
stops_by_sex

To answer this question we must account for the unequal numbers of men and women in the data, so we compare proportions rather than raw counts.

In [None]:
# Sex of the driver for every stop whose reason is a speed limit violation.
speeding_sex = df.loc[df["reason_for_stop"] == 'Speed Limit Violation', "subject_sex"]

# Share of speeding stops per sex (text), then the raw counts as a bar chart.
print(speeding_sex.value_counts(normalize=True))
plt.figure(figsize=(12, 8))
speeding_sex.value_counts().plot(kind="bar")

In [None]:
# Proportion of speeding stops attributed to each sex.
is_speeding = df["reason_for_stop"] == "Speed Limit Violation"
df.loc[is_speeding, "subject_sex"].value_counts(normalize=True)

## 2.1. When a man is pulled over, how often is it for speeding?

In [None]:
# Distribution of stop reasons among male drivers, as proportions.
male_reasons = df.loc[df["subject_sex"] == "male", "reason_for_stop"]
male_reasons.value_counts(normalize=True)

## 2.2. When a woman is pulled over, how often is it for speeding?

In [None]:
# Distribution of stop reasons among female drivers, as proportions.
female_reasons = df.loc[df["subject_sex"] == "female", "reason_for_stop"]
female_reasons.value_counts(normalize=True)

In [None]:
# Stop-reason proportions for women (left) and men (right), drawn in the
# top row of a 2x2 grid.
fig = plt.figure(figsize=(12, 12))

women_ax = fig.add_subplot(2, 2, 1)
women_reasons = df.loc[df["subject_sex"] == "female", "reason_for_stop"]
women_reasons.value_counts(normalize=True).plot(kind="bar", ax=women_ax)
women_ax.set_title("Women")

men_ax = fig.add_subplot(2, 2, 2)
men_reasons = df.loc[df["subject_sex"] == "male", "reason_for_stop"]
men_reasons.value_counts(normalize=True).plot(kind="bar", ax=men_ax)
men_ax.set_title("Men")

In [None]:
# Number of stops per reason, split by driver sex.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises a TypeError there.
sns.catplot(x='reason_for_stop', data=df, hue='subject_sex', kind='count', height=8)

# 3. Does gender affect who gets searched during a stop?

In [None]:
# How many stops did / did not involve a search.
search_counts = df["search_conducted"].value_counts()
search_counts

Of all `88545` stops in the data, only `3196` involved a search. (Check these figures against the output above; they may not match the rows currently loaded.)

In [None]:
# Among stops where a search was conducted, count drivers of each sex.
was_searched = df["search_conducted"]
df.loc[was_searched, "subject_sex"].value_counts()

Of the searched cases, `2725` are men and only `471` are women.

In [None]:
# Mean of search_conducted (i.e. the search rate) for every
# (reason_for_stop, subject_sex) combination.
by_reason_and_sex = df.groupby(["reason_for_stop", "subject_sex"])
by_reason_and_sex["search_conducted"].mean()

In [None]:
# Three panels in a 2x2 grid: overall search counts, searched drivers by sex,
# and the mean search rate per (reason_for_stop, subject_sex) group.
fig = plt.figure(figsize=(12, 12))

searched_ax = fig.add_subplot(2, 2, 1)
df["search_conducted"].value_counts().plot(kind="bar", ax=searched_ax)
searched_ax.set_title("Searched Cases")

by_sex_ax = fig.add_subplot(2, 2, 2)
df.loc[df["search_conducted"], "subject_sex"].value_counts().plot(kind="bar", ax=by_sex_ax)
by_sex_ax.set_title("Searched Men and Women")

rate_ax = fig.add_subplot(2, 2, 3)
search_rate = df.groupby(["reason_for_stop", "subject_sex"])["search_conducted"].mean()
search_rate.plot(kind="bar", ax=rate_ax)

# 4. Does race affect who gets searched during a stop?

In [None]:
# Number of stops per driver race.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises a TypeError there.
sns.catplot(x='subject_race', data=df, kind="count", height=7)

In [None]:
# Mean of search_conducted (i.e. the search rate) for every
# (reason_for_stop, subject_race) combination.
by_reason_and_race = df.groupby(["reason_for_stop", "subject_race"])
by_reason_and_race["search_conducted"].mean()

In [None]:
# Stops per race (left) next to searched stops per race (right).
plt.figure(figsize=(12, 12))

plt.subplot(2, 2, 1)
df.subject_race.value_counts().plot(kind="bar")
plt.title("Stopped By Race")

plt.subplot(2, 2, 2)
df.loc[df.search_conducted, 'subject_race'].value_counts().plot(kind="bar")
# Title the right-hand panel only; no new figure is opened here, so both
# panels stay in the same figure.
plt.title("Searched By Race")

In [None]:
# Person searches (left) and frisks (right), counted per driver race.
plt.figure(figsize=(12, 12))

# `.eq(True)` gives False for missing values, so the mask is a clean boolean
# even if a flag column contains NaN; a raw column with NaN cannot be used as
# a mask. For an all-boolean column the result is unchanged.
person_searched = df.search_person.eq(True)
frisked = df.frisk_performed.eq(True)

plt.subplot(2, 2, 1)
df.loc[person_searched, 'subject_race'].value_counts().plot(kind="bar")
plt.title("Person Searched By Race")

plt.subplot(2, 2, 2)
df.loc[frisked, 'subject_race'].value_counts().plot(kind="bar")
plt.title("Frisked By Race")

In [None]:
# Vehicle searches counted per driver race.
plt.figure(figsize=(12, 12))

# `.eq(True)` gives False for missing values, so the mask is a clean boolean
# even if the flag column contains NaN; a raw column with NaN cannot be used
# as a mask. For an all-boolean column the result is unchanged.
vehicle_searched = df.search_vehicle.eq(True)

plt.subplot(2, 2, 1)
df.loc[vehicle_searched, 'subject_race'].value_counts().plot(kind="bar")
plt.title("Vehicle Searched By Race")


In [None]:
# Number of stops per reason, split by driver race.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises a TypeError there.
sns.catplot(x='reason_for_stop', data=df, hue='subject_race', kind='count', height=8)

# 7. Which year had the least number of stops?

In [None]:
# Preview the first rows before working with the date/time columns.
df.head()

In [None]:
# Show the dtypes of the raw date and time columns, one per line.
for column in ("date", "time"):
    print(df[column].dtype)

In [None]:
# Display the raw date column to inspect its format.
df.date

In [None]:
# Parse the date column as year-month-day. "%m" is the month directive;
# "%M" means minutes and would place every parsed date in January.
df['stop_date'] = pd.to_datetime(df.date, format="%Y-%m-%d")
# Calendar year of each stop, used for the per-year counts.
df["year"] = df.stop_date.dt.year

In [None]:
# List every column's dtype, including the derived stop_date and year columns.
df.dtypes

In [None]:
# Number of stops recorded in each year.
stops_per_year = df["year"].value_counts()
stops_per_year

In [None]:
# Bar chart of the number of stops per year (bars in value_counts order).
yearly_counts = df["year"].value_counts()
plt.figure(figsize=(12, 8))
yearly_counts.plot(kind="bar")

# 8. How does drug activity change by time of day?

In [None]:
# List the available column names.
df.columns

In [None]:
# Frequency of each non-missing value in contraband_drugs.
drug_flag_counts = df["contraband_drugs"].value_counts()
drug_flag_counts

In [None]:
# Parse the time-of-day strings and keep only the hour component.
# Note that `stop_time` therefore holds the hour of the stop, not a full time.
parsed_time = pd.to_datetime(df["time"], format="%H:%M:%S")
df["stop_time"] = parsed_time.dt.hour
df.head()

In [None]:
# Number of stops per hour in which drugs were found.
# The mask keeps only rows where contraband_drugs is True: `notnull()` would
# also count rows recorded as False. Sorting inside the mask is not needed
# because `.loc` aligns a boolean mask on the index.
drugs_found = df["contraband_drugs"].eq(True)
df.loc[drugs_found, "stop_time"].value_counts()

In [None]:
# Stops per hour in which drugs were found, computed once and ordered by hour.
# The mask keeps only rows where contraband_drugs is True: `notnull()` would
# also count rows recorded as False. Sorting inside the mask is not needed
# because `.loc` aligns a boolean mask on the index.
drugs_found = df["contraband_drugs"].eq(True)
drug_stops_by_hour = df.loc[drugs_found, "stop_time"].value_counts().sort_index()

plt.figure(figsize=(12, 12))

# Left: bar chart per hour.
plt.subplot(2, 2, 1)
drug_stops_by_hour.plot(kind="bar")

# Right: the same counts as a line.
plt.subplot(2, 2, 2)
drug_stops_by_hour.plot()

# 9. Do most stops occur at night?

In [None]:
# Number of stops in each hour, ordered by hour.
# value_counts() does not depend on row order, so the rows are not sorted
# first; only the resulting counts are sorted by hour.
df.stop_time.value_counts().sort_index()

In [None]:
# Stops per hour, computed once and ordered by hour. value_counts() does not
# depend on row order, so the rows are not sorted before counting.
stops_by_hour = df.stop_time.value_counts().sort_index()

plt.figure(figsize=(12, 12))

# Left: line plot of stops per hour.
plt.subplot(2, 2, 1)
stops_by_hour.plot()

# Right: the same counts as a bar chart.
plt.subplot(2, 2, 2)
stops_by_hour.plot(kind="bar")

# 12. Compare the age distributions for each violation

In [None]:
# Descriptive statistics of subject_age within each stop reason.
age_by_reason = df.groupby("reason_for_stop")["subject_age"]
age_by_reason.describe()

In [None]:
# Overall age distribution: 10-bin histogram (left) and per-age stop counts
# ordered by age as a line (right).
fig = plt.figure(figsize=(12, 12))

hist_ax = fig.add_subplot(2, 2, 1)
df["subject_age"].hist(bins=10, ax=hist_ax)

line_ax = fig.add_subplot(2, 2, 2)
stops_per_age = df["subject_age"].value_counts().sort_index()
stops_per_age.plot(ax=line_ax)

In [None]:
# One subject_age histogram per stop reason; the trailing semicolon hides the
# returned axes repr.
df.hist(column="subject_age", by="reason_for_stop", figsize=(12, 12));

# 13. Can I duplicate open policing's "Veil of Darkness" findings?

In [None]:
# Number of stops per hour of day, split by driver race.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises a TypeError there.
sns.catplot(x="stop_time", data=df, hue="subject_race", kind="count", height=7)

In [None]:
# Hours treated as night and as day; every other hour has no label.
night_hours = (0, 1, 2, 3, 23)
day_hours = (11, 12, 13, 14, 15)

# Hour -> "night" / "day" lookup built from the two groups above.
mapping = {hour: "night" for hour in night_hours}
mapping.update({hour: "day" for hour in day_hours})

# Despite its name, `is_day` holds the strings "day" / "night"; hours missing
# from the mapping become NaN.
df["is_day"] = df["stop_time"].map(mapping)

In [None]:
# Preview the first ten rows, including the derived is_day column.
df.head(10)

In [None]:
# Number of stops labelled "day" vs "night", split by driver race.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises a TypeError there.
sns.catplot(x="is_day", data=df, hue="subject_race", kind="count", height=7)