# Hypothesis 1 - Impact of Defendant - Data Cleaning

### Import Dependencies

In [380]:
# Import dependencies
import numpy as np
import pandas as pd
from matplotlib import pyplot
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2
import plotly
import seaborn as sns

### Set Notebook Configurations

In [381]:
# Set configurations
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

%matplotlib inline

###  Import Original Data Set

In [382]:
# Import original dataset

df = pd.read_csv("mock-jury-stalking-data-original.csv")

df.head()

Unnamed: 0,ATTN-CIV1-1,ATTN-CIV1-2,ATTN-CIV1-3,ATTN-CIV1-4,ATTN-CIV1-5,ATTN-CIV1-6,ATTN-CIV1-7,DEC-RATE-CIV1,DEC-CIV1-1,WHY-DEC-CIV1,DEC-CIV1-2,COMPENSATORY-CIV1\n,COMP-MAX-CIV1,ATTN-CIV3-1,ATTN-CIV3-2,ATTN-CIV3-3,ATTN-CIV3-3.1,ATTN-CIV3-2.1,ATTN-CIV3-4,ATTN-CIV3-4.1,DEC-RATE-CIV3,DEC-CIV3-1,WHY-DEC-CIV3,DEC-CIV3-2,COMPENSATORY-CIV3\n,COMP-MAX-CIV3,ATTN-CRIM-1,ATTN-CRIM-2,ATTN-CRIM-3,ATTN-CRIM-4,ATTN-CRIM-5,ATTN-CRIM-6,ATTN-CRIM-7,DEC-RATE-CRIM,DEC-CRIM-1,WHY-DEC-CRIM,DEC-CRIM-2,COMPENSATORY-CRIM\n,COMP-MAX-CRIM,VICCRED,VICBELIEVE,VICHONEST,VICBLAME,VICRESP,VICDISTRESS,VICFEAR,VICANNOY,VICFLATTER,VICSYMP,VICANGER,VICGREED,VICLIKE,VICSELFISH,PERPCRED,PERPBELIEVE,PERPHONEST,PERPBLAME,PERPRESP,PERPDANGER,PERPDISTRESS,PERPFEAR,PERPSYMP,PERPANGER,GENDER,AGE,CITIZEN,RACE,JURYSERVE,TIMESSERVE,JURYCRIME,JURYOUTCOME,JURYUNANIMOUS,Unnamed: 72,Unnamed: 73,DEC-ALL,DUM-CIV3,ANY-DAMAGE-MAX,DUM-CIV-ALL
0,1.0,2.0,5.0,1.0,1.0,2.0,1.0,7.0,1.0,"Defendant admits to being highly emotional, ye...",1.0,5000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,9.0,10.0,10.0,1.0,1.0,8.0,6.0,10.0,2.0,10.0,1.0,1.0,6.0,1.0,3.0,3.0,2.0,10.0,10.0,5.0,5.0,4.0,2.0,6.0,1.0,48.0,1.0,3,2.0,,,,,A2VE5IV9OD2SK1,civ1,1.0,0,0.0,
1,1.0,2.0,5.0,1.0,1.0,3.0,1.0,15.0,1.0,I felt that a reasonable person would be very ...,1.0,10000.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,10.0,10.0,10.0,2.0,2.0,8.0,9.0,10.0,1.0,9.0,1.0,1.0,9.0,1.0,2.0,2.0,2.0,10.0,10.0,8.0,8.0,7.0,1.0,9.0,2.0,64.0,1.0,3,2.0,,,,,A25FJAJGTWFMP,civ1,1.0,0,1.0,
2,1.0,2.0,5.0,1.0,1.0,2.0,1.0,8.0,1.0,Her stories are very elaborate in how the emai...,1.0,5000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,7.0,8.0,9.0,1.0,7.0,9.0,8.0,10.0,1.0,1.0,9.0,1.0,9.0,1.0,1.0,4.0,5.0,10.0,10.0,9.0,9.0,9.0,1.0,9.0,2.0,24.0,1.0,1,2.0,,,,,A39KJNWAFOD7N1,civ1,1.0,0,0.0,
3,1.0,2.0,5.0,1.0,1.0,3.0,1.0,6.0,2.0,"If he had been asked to stop in writing, like ...",2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,4.0,4.0,1.0,1.0,7.0,8.0,6.0,3.0,6.0,1.0,1.0,5.0,5.0,6.0,7.0,7.0,8.0,7.0,2.0,1.0,1.0,6.0,2.0,1.0,33.0,1.0,3,2.0,,,,,A1U46YK7C5HEY1,civ1,0.0,0,,
4,1.0,2.0,5.0,1.0,1.0,3.0,1.0,1.0,2.0,I believe there is a lot of circumstantial evi...,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,5.0,6.0,1.0,1.0,6.0,1.0,4.0,1.0,10.0,1.0,1.0,6.0,1.0,10.0,10.0,10.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,4.0,40.0,1.0,8,2.0,,,,,A3NMU6AVMQ0QDB,civ1,0.0,0,,


### Coding Legend

| | | | | | | | | | | |
|:-------------:|---------------------|---------------------|-------------------|----------------------------|-----------------------|----------------|-------------|---|---|---|
|   __GENDER__  | `1` = Male          | `2` = Female        | `3` = Transgender | `4` = Prefer not to answer |                       |                |             |
| __CITIZEN__   | `1` = Yes           | `2` = No            |                   |                            |                       |                |             |
| __RACE__      | `1` = Asian         | `2` = Black         | `3` = White       | `4` = Middle Eastern       | `5` = Native American | `6` = Hispanic | `8` = Other |
| __JURYSERVE__ | `1` = Yes           | `2` = No            |                   |                            |                       |                |             |
| __DEC-ALL__   | `0` = Pro-defendant | `1` = Pro-plaintiff |                   |                            |                       |                |             |

### Select Relevant Variables Only

In [383]:
hyp1_df = df[["DEC-ALL", "DUM-CIV3", "GENDER", "AGE"]].copy()
hyp1_df.head()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE
0,1.0,0,1.0,48.0
1,1.0,0,2.0,64.0
2,1.0,0,2.0,24.0
3,0.0,0,1.0,33.0
4,0.0,0,4.0,40.0


In [384]:
hyp1_df.shape

(216, 4)

In [385]:
hyp1_df.describe()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE
count,188.0,216.0,182.0,182.0
mean,0.691489,0.324074,1.593407,45.269231
std,0.463112,0.469115,0.555794,12.823021
min,0.0,0.0,1.0,23.0
25%,0.0,0.0,1.0,35.0
50%,1.0,0.0,2.0,43.0
75%,1.0,1.0,2.0,54.0
max,1.0,1.0,4.0,74.0


<div style="padding: 20px; border-radius: 20px; background-color: #f9f9f9;">
    <div style="display: flex; flex-direction: row; align-items: flex-start;">
        <div style="flex-shrink: 0; margin-right: 20px; font-size: 2.5em;">⚠️</div>
        <div style="flex-grow: 1;">
            <p>We can see just by the <code>count</code> for each variable in the description matrix above that there are some missing values.<br />
            Let's take a closer look and find all the rows that have <i>any</i> missing values 👇</p>
    </div>
</div>

In [386]:
hyp1_df["any_missing_values"] = hyp1_df.isnull().any(axis=1)
hyp1_df[hyp1_df["any_missing_values"] == True][["DEC-ALL", "DUM-CIV3", "GENDER", "AGE"]]

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE
60,,0,,
119,,1,,
121,,1,,
122,,1,,
123,,1,,
126,,1,,
127,,1,,
128,,1,,
129,,1,,
131,,1,,


In [387]:
hyp1_df[hyp1_df["any_missing_values"] == True].shape

(34, 5)

Looks like we have `34` records that have at least one missing value.

Let's remove these rows, re-index the dataframe, and remove the `any_missing_values` column.

In [388]:
missing_values_index = hyp1_df[hyp1_df["any_missing_values"] == True].index

In [389]:
hyp1_df.drop(index=missing_values_index, axis=1, inplace=True)
hyp1_df.describe()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE
count,182.0,182.0,182.0,182.0
mean,0.681319,0.296703,1.593407,45.269231
std,0.467251,0.458065,0.555794,12.823021
min,0.0,0.0,1.0,23.0
25%,0.0,0.0,1.0,35.0
50%,1.0,0.0,2.0,43.0
75%,1.0,1.0,2.0,54.0
max,1.0,1.0,4.0,74.0


In [390]:
hyp1_df = hyp1_df.reset_index(drop=True)
hyp1_df.drop(columns=["any_missing_values"], inplace=True)
hyp1_df.shape

(182, 4)

Ok from here we should have a full data set with no missing values. Let's shuffle the data set and take a look at a slice of it just for review.

In [391]:
hyp1_df.sample(frac=1).head(10)

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE
167,1.0,0,1.0,37.0
33,1.0,0,1.0,40.0
5,1.0,0,2.0,63.0
103,0.0,1,1.0,33.0
88,1.0,1,1.0,43.0
10,1.0,0,1.0,44.0
121,1.0,1,2.0,48.0
84,1.0,1,1.0,70.0
114,0.0,1,2.0,39.0
75,1.0,1,2.0,47.0


✅ Data looks good to me! Let's take this opportunity to convert the `float` values to `int` values.

In [392]:
hyp1_df = hyp1_df.astype({"DEC-ALL": int, "GENDER": int, "AGE": int})
hyp1_df.sample(frac=1).head(10)

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE
2,1,0,2,24
120,1,1,2,32
83,0,1,2,47
165,1,0,2,46
72,0,1,1,44
0,1,0,1,48
160,1,0,2,52
58,1,0,1,41
170,0,0,2,46
159,0,0,2,32


Let's take a look at each individual variable to see if there are any odd values we don't expect.

In [393]:
"GENDER", hyp1_df["GENDER"].unique(), "AGE", hyp1_df["AGE"].unique(), "DUM-CIV3", hyp1_df["DUM-CIV3"].unique(), hyp1_df["DEC-ALL"].unique()

('GENDER',
 array([1, 2, 4]),
 'AGE',
 array([48, 64, 24, 33, 40, 63, 66, 61, 57, 39, 44, 32, 38, 31, 42, 34, 30,
        55, 49, 51, 37, 46, 58, 62, 54, 53, 36, 65, 70, 73, 45, 41, 74, 29,
        71, 43, 47, 69, 25, 28, 60, 35, 23, 56, 50, 52]),
 'DUM-CIV3',
 array([0, 1]),
 array([1, 0]))

✅ Data looks good to me! Let's check for outliers.

---

### Check for Outliers

In [394]:
numerical_cols = hyp1_df.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    hyp1_df[f'{col}_zscore'] = hyp1_df[col].apply(lambda x: (x - hyp1_df[col].mean()) / hyp1_df[col].std())

In [395]:
hyp1_df.head()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE,DEC-ALL_zscore,DUM-CIV3_zscore,GENDER_zscore,AGE_zscore
0,1,0,1,48,0.682035,-0.647732,-1.067674,0.212958
1,1,0,2,64,0.682035,-0.647732,0.731555,1.460714
2,1,0,2,24,0.682035,-0.647732,0.731555,-1.658675
3,0,0,1,33,-1.458144,-0.647732,-1.067674,-0.956813
4,0,0,4,40,-1.458144,-0.647732,4.330013,-0.41092


In [396]:
# Check if absolute value is >= 3
zscore_columns = ["DEC-ALL_zscore", "DUM-CIV3_zscore", "GENDER_zscore", "AGE_zscore"]
hyp1_df["z_score_3_outlier"] = (hyp1_df[zscore_columns].abs() >= 3).any(axis=1)
hyp1_df[hyp1_df["z_score_3_outlier"] == True]

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE,DEC-ALL_zscore,DUM-CIV3_zscore,GENDER_zscore,AGE_zscore,z_score_3_outlier
4,0,0,4,40,-1.458144,-0.647732,4.330013,-0.41092,True
43,1,0,4,34,0.682035,-0.647732,4.330013,-0.878828,True


Looks like we found 2 outliers ☝️ Let's remove them

In [397]:
hyp1_df.describe()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE,DEC-ALL_zscore,DUM-CIV3_zscore,GENDER_zscore,AGE_zscore
count,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0
mean,0.681319,0.296703,1.593407,45.269231,-3.9040810000000004e-17,-3.9040810000000004e-17,2.244847e-16,2.147245e-16
std,0.467251,0.458065,0.555794,12.823021,1.0,1.0,1.0,1.0
min,0.0,0.0,1.0,23.0,-1.458144,-0.6477322,-1.067674,-1.73666
25%,0.0,0.0,1.0,35.0,-1.458144,-0.6477322,-1.067674,-0.8008433
50%,1.0,0.0,2.0,43.0,0.6820351,-0.6477322,0.7315546,-0.1769654
75%,1.0,1.0,2.0,54.0,0.6820351,1.535365,0.7315546,0.6808668
max,1.0,1.0,4.0,74.0,0.6820351,1.535365,4.330013,2.240562


In [398]:
outliers_index = hyp1_df[~hyp1_df["z_score_3_outlier"]].index
hyp1_df.drop(index=outliers_index, inplace=True)
hyp1_df.describe()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE,DEC-ALL_zscore,DUM-CIV3_zscore,GENDER_zscore,AGE_zscore
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.5,0.0,4.0,37.0,-0.388054,-0.647732,4.330013,-0.644874
std,0.707107,0.0,0.0,4.242641,1.513335,0.0,0.0,0.330861
min,0.0,0.0,4.0,34.0,-1.458144,-0.647732,4.330013,-0.878828
25%,0.25,0.0,4.0,35.5,-0.923099,-0.647732,4.330013,-0.761851
50%,0.5,0.0,4.0,37.0,-0.388054,-0.647732,4.330013,-0.644874
75%,0.75,0.0,4.0,38.5,0.14699,-0.647732,4.330013,-0.527897
max,1.0,0.0,4.0,40.0,0.682035,-0.647732,4.330013,-0.41092


In [399]:
hyp1_df = hyp1_df.reset_index(drop=True)
hyp1_df.drop(columns=["z_score_3_outlier"], inplace=True)
hyp1_df.shape

(2, 8)

In [400]:
hyp1_df.head()

Unnamed: 0,DEC-ALL,DUM-CIV3,GENDER,AGE,DEC-ALL_zscore,DUM-CIV3_zscore,GENDER_zscore,AGE_zscore
0,0,0,4,40,-1.458144,-0.647732,4.330013,-0.41092
1,1,0,4,34,0.682035,-0.647732,4.330013,-0.878828


Let's save the data set!

In [401]:
hyp1_df.to_csv("hypothesis-1-dataset.csv")