In [62]:
import pandas as pd
import numpy as np

In [63]:
# load data

df = pd.read_csv('Crimes_Dataset.csv')
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Index_Crimes           10000 non-null  int64  
 1   Date                   10000 non-null  object 
 2   Monster involved       10000 non-null  object 
 3   Days of Investigation  10000 non-null  float64
 4   Region                 9998 non-null   object 
 5   Crime Type             10000 non-null  object 
 6   Crime Weapon           7403 non-null   object 
 7   Time of Day            10000 non-null  object 
 8   Evidence Found         9995 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 703.3+ KB
Index_Crimes                0
Date                        0
Monster involved            0
Days of Investigation       0
Region                      2
Crime Type                  0
Crime Weapon             2597
Time of Day                 0
Evidence Found   

In [64]:
# make everything lower case

object_cols = df.select_dtypes(include='object').columns 
for attribute in object_cols:
    df[attribute] = df[attribute].str.lower()

In [65]:
# change NaN to 'unknown'

df['Region'] = df['Region'].fillna('unknown')
df['Crime Weapon'] = df['Crime Weapon'].fillna('unknown')
df['Crime Weapon'] = df['Crime Weapon'].replace('n/a', 'unknown')
df['Evidence Found'] = df['Evidence Found'].fillna('not found')

In [66]:
# check again if there is no NaN anymore

print(df.isnull().sum())

Index_Crimes             0
Date                     0
Monster involved         0
Days of Investigation    0
Region                   0
Crime Type               0
Crime Weapon             0
Time of Day              0
Evidence Found           0
dtype: int64


In [67]:
# convert datatypes

df['Date'] = pd.to_datetime(df['Date'])
df['Days of Investigation'] = df['Days of Investigation'].astype(int)

In [68]:
# check for duplicated rows

print(df.duplicated().sum())
print(df['Index_Crimes'].nunique() == len(df))  # Index Crimes is not duplicated

0
True


In [69]:
# separate the year, month, date of the commited crime

df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.strftime('%B')
df['Day'] = df['Date'].dt.strftime('%A')

In [70]:
# check for each unique entries

columns = df.columns.values

for i in range(len(columns)):
    print(f'Range of {columns[i]} = ', df[columns[i]].unique())

Range of Index_Crimes =  [6316 4731 1750 ... 5446  869 7341]
Range of Date =  <DatetimeArray>
['2020-02-18 00:00:00', '2022-09-01 00:00:00', '2022-08-03 00:00:00',
 '2023-10-18 00:00:00', '2021-03-25 00:00:00', '2023-12-04 00:00:00',
 '2022-02-19 00:00:00', '2021-11-15 00:00:00', '2024-05-04 00:00:00',
 '2021-04-25 00:00:00',
 ...
 '2021-09-13 00:00:00', '2023-05-25 00:00:00', '2023-12-12 00:00:00',
 '2021-06-29 00:00:00', '2024-10-27 00:00:00', '2019-11-19 00:00:00',
 '2023-07-25 00:00:00', '2021-06-28 00:00:00', '2022-05-03 00:00:00',
 '2023-02-01 00:00:00']
Length: 1820, dtype: datetime64[ns]
Range of Monster involved =  ['skeleton' 'werewolf' 'ghost' 'witch' 'zombie' 'vampire']
Range of Days of Investigation =  [77 48 31 29 59 56  3 83 11  7 91 32  2 58 49 65 61 90 62 17 13 97 33 68
 92 54 15 66 52 99 88 80 67 86 28 64  6 69 85 20 23 21 36 16 53 38 44 87
 18 14 55 26 73 84 75 60 95  4 96 35 39 57  5 50 43 12 72 19 71 22 42 25
 63  1 76 47 34 79 46 82 27 94 30 45  8 24 98 78 70 37 1

In [71]:
# create new dataframe to encode

df_encoded = df.copy()

In [72]:
# label encoding categorical data

# categorical columns = 'Date', 'Monster involved', 'Region', 'Crime Type', 'Crime Weapon', 'Time of Day', 'Evidence Found', 'Year', 'Month', 'Day'
categ_columns = np.delete(columns, [0, 3])

for i in range(len(categ_columns)):
    codes, uniques = pd.factorize(df_encoded[categ_columns[i]])
    mapping = dict(enumerate(uniques))
    print(mapping)
    df_encoded[categ_columns[i]] = codes

# we don't need this code now unless we want a machine learning model

{0: Timestamp('2020-02-18 00:00:00'), 1: Timestamp('2022-09-01 00:00:00'), 2: Timestamp('2022-08-03 00:00:00'), 3: Timestamp('2023-10-18 00:00:00'), 4: Timestamp('2021-03-25 00:00:00'), 5: Timestamp('2023-12-04 00:00:00'), 6: Timestamp('2022-02-19 00:00:00'), 7: Timestamp('2021-11-15 00:00:00'), 8: Timestamp('2024-05-04 00:00:00'), 9: Timestamp('2021-04-25 00:00:00'), 10: Timestamp('2021-11-08 00:00:00'), 11: Timestamp('2022-06-21 00:00:00'), 12: Timestamp('2023-08-02 00:00:00'), 13: Timestamp('2021-06-04 00:00:00'), 14: Timestamp('2021-12-16 00:00:00'), 15: Timestamp('2021-02-21 00:00:00'), 16: Timestamp('2021-03-21 00:00:00'), 17: Timestamp('2024-02-11 00:00:00'), 18: Timestamp('2024-09-09 00:00:00'), 19: Timestamp('2024-02-14 00:00:00'), 20: Timestamp('2023-04-03 00:00:00'), 21: Timestamp('2021-03-20 00:00:00'), 22: Timestamp('2022-01-09 00:00:00'), 23: Timestamp('2024-04-20 00:00:00'), 24: Timestamp('2020-08-03 00:00:00'), 25: Timestamp('2021-08-24 00:00:00'), 26: Timestamp('2022-0

In [73]:
# # one-hot encoding categorical data

# df_encoded = pd.get_dummies(df_encoded, columns=categ_columns)

# eventhough this method is recommended in the challenge, but we don't need this code at all.
# the alanysing method isn't our level.

In [74]:
# normalize numerical data
# numerical columns = 'Days of Investigation'

df_encoded['Days of Investigation'] = (df_encoded['Days of Investigation'] - df_encoded['Days of Investigation'].min()) / (df_encoded['Days of Investigation'].max() - df_encoded['Days of Investigation'].min())
# we don't need this code now unless we want a machine learning model

In [75]:
# sort the data from date

df = df.sort_values(by = 'Date')

# change Index_Crimes

df.reset_index(drop=True, inplace=True)
df['Index_Crimes'] = range(1, len(df)+1)

In [76]:
# export to csv file

df.to_csv('cleanup_Crime_Dataset.csv', index = False)
df_encoded.to_csv('encoded_cleanup_Crime_Dataset.csv', index = False)