# Data Prep

In [None]:
# any Pip installs we need for this project
!pip install sklearn
!pip install pandas
!pip install earthpy


In [None]:
# Libraries we need for this project
import numpy as np
import pandas as pd
import os
import earthpy as et
import math
import statistics
from statistics import stdev
import requests
import os.path 
import csv 
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

# from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [6]:
# data prep functions

def get_arrest_data():
    """Fetch the California statewide stops archive and load its CSV.

    The archive URL is handed to ``et.data.get_data`` and the value it
    returns is used as the directory containing the CSV file. Lines that
    pandas cannot parse are skipped.

    Returns:
        pandas.DataFrame with the contents of ``ca_statewide_2020_04_01.csv``.
    """
    # The original author picked the biggest dataset available, so the
    # download takes a few minutes; per their note it is only needed once.
    archive_url = "https://stacks.stanford.edu/file/druid:yg821jf8611/yg821jf8611_ca_statewide_2020_04_01.csv.zip"
    extracted_dir = et.data.get_data(url=archive_url)
    csv_path = os.path.join(extracted_dir, "ca_statewide_2020_04_01.csv")
    arrests = pd.read_csv(csv_path, on_bad_lines='skip')
    return arrests

def shuffle_data(data, seed):
    """Shuffle the rows of ``data`` reproducibly.

    NumPy's global random generator is seeded with ``seed`` and the data is
    then permuted along its first axis.

    Args:
        data: a pandas DataFrame/Series, a NumPy array, or any other mutable
            sequence accepted by ``np.random.shuffle``.
        seed: seed passed to ``np.random.seed``.

    Returns:
        For a DataFrame/Series: a new object whose rows are shuffled while
        the index labels stay in their original order; the input object is
        not modified. For anything else: the same object, shuffled in place.
    """
    np.random.seed(seed)

    if isinstance(data, (pd.DataFrame, pd.Series)):
        # Shuffling ``data.values`` in place is unreliable: for frames with
        # mixed dtypes ``.values`` is a copy, so the frame itself was never
        # shuffled. Reordering rows by position works for every dtype layout.
        order = np.random.permutation(len(data))
        shuffled = data.iloc[order]
        # Keep the original labels in place so only the row contents move.
        shuffled.index = data.index
        return shuffled

    # Arrays and plain sequences are shuffled in place, without a bare
    # ``except`` that could hide unrelated errors.
    np.random.shuffle(data)
    return data

def split_data(data_frame):
    """Split ``data_frame`` into training (first 2/3) and validation (rest).

    The cut position is ``round(2/3 * len(data_frame))``; everything before
    it goes to training and everything from it onward goes to validation.

    Returns:
        A ``(training, validation)`` tuple of slices of ``data_frame``.
    """
    n_rows = len(data_frame)
    cut = round(2 / 3 * n_rows)
    return data_frame[:cut], data_frame[cut:]

def split_data_yhat(df):
    """Separate the feature columns from the yhat column.

    Assumes the yhat column is the LAST column of ``df``.
    Example: ``arrest_feat_df, arrest_yhat_df = split_data_yhat(df)``

    Note from the original author: this will not work on the arrest dataset
    as-is, because its yhat is the 'arrest_made' column and that is not the
    last column; the function would have to be rewritten for that case.

    Returns:
        ``(features, yhat)`` - every column except the last, and the last
        column on its own.
    """
    features = df.iloc[:, :-1]
    yhat = df.iloc[:, -1]
    return features, yhat
def moveYcolumnToEnd(data, indexOfY):
    """Move the column at ``indexOfY`` of a 2-D NumPy array to the end.

    Columns to the right of ``indexOfY`` each shift one position to the
    left. The array is modified in place and also returned. When
    ``indexOfY`` is not smaller than the column count, a message is printed
    and the array is returned untouched.
    """
    _, column_total = data.shape
    in_range = indexOfY < column_total
    if not in_range:
        print("Your Y index is out of range, cannot move column")
        return data

    # Rotating the trailing columns left by one sends the first of them
    # (the Y column) to the last position.
    rotated = np.roll(data[:, indexOfY:], shift=-1, axis=1)
    data[:, indexOfY:] = rotated
    return data

def get_arrest_data2():
    """Load the Nashville stops CSV with every column read as a string.

    If ``tn_nashville_2020_04_01.csv`` exists in the current working
    directory it is read from there. Otherwise the archive URL is passed to
    ``et.data.get_data`` and the CSV is read from the directory it returns.
    Lines pandas cannot parse are skipped in both cases.

    Returns:
        pandas.DataFrame with ``dtype=str`` columns.
    """
    if not os.path.exists('tn_nashville_2020_04_01.csv'):
        archive_url = 'https://stacks.stanford.edu/file/druid:yg821jf8611/yg821jf8611_tn_nashville_2020_04_01.csv.zip'
        extracted_dir = et.data.get_data(url=archive_url)
        downloaded_csv = os.path.join(extracted_dir, 'tn_nashville_2020_04_01.csv')
        return pd.read_csv(downloaded_csv, on_bad_lines='skip', dtype=str)

    print("File already exists, getting data locally.....")
    local_frame = pd.read_csv('tn_nashville_2020_04_01.csv', on_bad_lines='skip', dtype=str)
    return local_frame

def encodeFeatures(dataframe, skipColumnsList=None):
    """Label-encode the columns of ``dataframe`` in place.

    Each column whose name is not listed in ``skipColumnsList`` is replaced
    by the output of ``LabelEncoder.fit_transform`` on that column. The
    encoder is refit for every column, so codes are per-column.

    Args:
        dataframe: pandas DataFrame to encode; it is modified in place.
        skipColumnsList: optional collection of column names to leave
            untouched. ``None`` (the default) skips nothing.

    Returns:
        The same ``dataframe`` object, after encoding.
    """
    # ``None`` replaces the former mutable ``[]`` default, which would be
    # shared between calls.
    skipped = () if skipColumnsList is None else skipColumnsList

    encoder = preprocessing.LabelEncoder()
    for column in dataframe.columns:
        if column in skipped:
            continue
        # Converting string labels into numbers.
        dataframe[column] = encoder.fit_transform(dataframe[column])

    return dataframe

def replaceNulls(data):
    """Replace missing values with each column's most frequent value.

    Args:
        data: array-like accepted by ``SimpleImputer`` (the original note
            describes it as a NumPy array); entries equal to ``np.nan`` are
            treated as missing.

    Returns:
        The result of ``SimpleImputer.transform`` on ``data`` after fitting
        on that same data.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    # The stray bare ``SimpleImputer()`` call was removed: it built an
    # object that was immediately discarded and had no effect.
    return imputer.fit(data).transform(data)

def truncateTime(time):
    """Return the first two characters of ``str(time)``.

    Used to cut a time value down to just its hour part.
    """
    as_text = str(time)
    return as_text[:2]

def convertAgeFeature(stringAge):
    """Bucket an age into 0 (young), 1 (middle) or 2 (old).

    ``stringAge`` is converted with ``float``. Ages of 50 and above map to
    2, ages from 25 up to (not including) 50 map to 1, everything else
    (including NaN) maps to 0.
    """
    age = float(stringAge)
    if age >= 50:
        return 2  # old
    if age >= 25:
        return 1  # middle
    return 0  # young

def _preprocessArrestColumns(columns):
    """Load the Nashville data and return ``columns`` cleaned and encoded.

    Shared pipeline behind the two public ``getPreprocessedArrestData*``
    functions, which previously duplicated these steps line for line.

    Args:
        columns: names of the columns to keep; must include 'time' and
            'subject_age', which get dedicated conversions.

    Returns:
        A new DataFrame holding only ``columns``, with 'time' truncated by
        ``truncateTime``, 'subject_age' bucketed by ``convertAgeFeature``,
        and every column then passed through ``encodeFeatures``.
    """
    raw_frame = get_arrest_data2()
    # NOTE(review): dropna() runs BEFORE the column selection, so a row is
    # discarded when any column is null, including columns that are not
    # kept. Behavior is preserved here; confirm this is intended.
    complete_rows = raw_frame.dropna()
    arrest_data = complete_rows[list(columns)].copy()
    arrest_data['time'] = arrest_data['time'].apply(truncateTime)
    arrest_data['subject_age'] = arrest_data['subject_age'].apply(convertAgeFeature)
    # No skip list: every column, including 'time' and 'subject_age', is
    # label-encoded.
    return encodeFeatures(arrest_data)


def getPreprocessedArrestData():
    """Return the preprocessed Nashville data including 'subject_race'."""
    return _preprocessArrestColumns([
        'subject_race', 'subject_sex', 'subject_age', 'time', 'violation',
        'frisk_performed', 'search_vehicle', 'warning_issued',
    ])


def getPreprocessedArrestDataWithoutRace():
    """Return the preprocessed Nashville data without 'subject_race'."""
    return _preprocessArrestColumns([
        'subject_sex', 'subject_age', 'time', 'violation',
        'frisk_performed', 'search_vehicle', 'warning_issued',
    ])

    

In [None]:
# Use this for testing
# get_arrest_data is defined at the top level of this notebook, and no
# `Data_prep` name is imported or defined here, so call it directly.
arrest_data_df = get_arrest_data()
print(arrest_data_df)