In [85]:
import pandas as pd
import numpy as np

In [86]:
# Load the raw crime records, then summarise the schema and the number of
# missing values in each column.
df = pd.read_csv('Crimes_Dataset.csv')
df.info()
missing_counts = df.isnull().sum()
print(missing_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Index_Crimes           10000 non-null  int64  
 1   Date                   10000 non-null  object 
 2   Monster involved       10000 non-null  object 
 3   Days of Investigation  10000 non-null  float64
 4   Region                 9998 non-null   object 
 5   Crime Type             10000 non-null  object 
 6   Crime Weapon           7403 non-null   object 
 7   Time of Day            10000 non-null  object 
 8   Evidence Found         9995 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 703.3+ KB
Index_Crimes                0
Date                        0
Monster involved            0
Days of Investigation       0
Region                      2
Crime Type                  0
Crime Weapon             2597
Time of Day                 0
Evidence Found   

In [87]:
# Lower-case every object-dtype (text) column so that values differing only
# by capitalisation compare equal.

object_cols = df.select_dtypes(include='object').columns
df[object_cols] = df[object_cols].apply(lambda text_col: text_col.str.lower())

In [88]:
# Replace NaN with an explicit placeholder label, one label per column.

placeholders = {
    'Region': 'unknown',
    'Crime Weapon': 'unknown',
    'Evidence Found': 'not found',
}
for column, label in placeholders.items():
    df[column] = df[column].fillna(label)

# Literal 'n/a' weapon entries are folded into the same 'unknown' label.
df['Crime Weapon'] = df['Crime Weapon'].replace('n/a', 'unknown')

In [89]:
# Re-count the missing values per column; every count should now be zero.

print(df.isna().sum())

Index_Crimes             0
Date                     0
Monster involved         0
Days of Investigation    0
Region                   0
Crime Type               0
Crime Weapon             0
Time of Day              0
Evidence Found           0
dtype: int64


In [90]:
# Count fully duplicated rows.

duplicate_rows = df.duplicated().sum()
print(duplicate_rows)

# Every row should carry its own Index_Crimes value: the number of distinct
# ids must equal the number of rows.
ids_are_unique = df['Index_Crimes'].nunique() == df.shape[0]
print(ids_are_unique)

0
True


In [91]:
# The missing values have been handled; the remaining work is on the
# structure of the columns themselves.

# Parse the date strings, then split each committed-crime date into its
# year, month and day components.

df['Date'] = pd.to_datetime(df['Date'])

crime_dates = df['Date'].dt
df['Year'] = crime_dates.year
df['Month'] = crime_dates.month
df['Day'] = crime_dates.day

# Store the investigation length as integers instead of floats
# (astype(int) drops any fractional part).

df['Days of Investigation'] = df['Days of Investigation'].astype(int)

In [92]:
# Print the distinct values found in each column.

columns = df.columns.values

for column_name in columns:
    print(f'Range of {column_name} = ', df[column_name].unique())

Range of Index_Crimes =  [6316 4731 1750 ... 5446  869 7341]
Range of Date =  <DatetimeArray>
['2020-02-18 00:00:00', '2022-09-01 00:00:00', '2022-08-03 00:00:00',
 '2023-10-18 00:00:00', '2021-03-25 00:00:00', '2023-12-04 00:00:00',
 '2022-02-19 00:00:00', '2021-11-15 00:00:00', '2024-05-04 00:00:00',
 '2021-04-25 00:00:00',
 ...
 '2021-09-13 00:00:00', '2023-05-25 00:00:00', '2023-12-12 00:00:00',
 '2021-06-29 00:00:00', '2024-10-27 00:00:00', '2019-11-19 00:00:00',
 '2023-07-25 00:00:00', '2021-06-28 00:00:00', '2022-05-03 00:00:00',
 '2023-02-01 00:00:00']
Length: 1820, dtype: datetime64[ns]
Range of Monster involved =  ['skeleton' 'werewolf' 'ghost' 'witch' 'zombie' 'vampire']
Range of Days of Investigation =  [77 48 31 29 59 56  3 83 11  7 91 32  2 58 49 65 61 90 62 17 13 97 33 68
 92 54 15 66 52 99 88 80 67 86 28 64  6 69 85 20 23 21 36 16 53 38 44 87
 18 14 55 26 73 84 75 60 95  4 96 35 39 57  5 50 43 12 72 19 71 22 42 25
 63  1 76 47 34 79 46 82 27 94 30 45  8 24 98 78 70 37 1

In [None]:
# Order the crimes chronologically and renumber them 1..N in that order.
#
# A stable sort keeps rows that share the same date in their existing relative
# order. The default algorithm ('quicksort') gives no such guarantee, so the
# numbering of same-day crimes would be arbitrary and could change when this
# cell is re-run. `ignore_index=True` relabels the rows 0..N-1 in the same
# step, replacing the separate in-place `reset_index` call.

df = df.sort_values(by='Date', kind='stable', ignore_index=True)

# change Index_Crimes so that it follows the chronological order

df['Index_Crimes'] = range(1, len(df) + 1)

Unnamed: 0,Index_Crimes,Date,Monster involved,Days of Investigation,Region,Crime Type,Crime Weapon,Time of Day,Evidence Found,Year,Month,Day
0,1,2019-10-30,werewolf,18,forest,assault,pistol,night,fur,2019,10,30
1,2,2019-10-30,vampire,86,castle,assault,axe,night,blood,2019,10,30
2,3,2019-10-30,skeleton,25,castle,nightly disturbance,knife,night,cloak,2019,10,30
3,4,2019-10-30,zombie,6,swamp,vandalism,axe,night,teeth,2019,10,30
4,5,2019-10-30,witch,9,swamp,arson,unknown,night,bones,2019,10,30
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,2024-10-30,zombie,84,forest,murder,unknown,dawn,bones,2024,10,30
9996,9997,2024-10-30,witch,70,swamp,kidnapping,axe,day,bones,2024,10,30
9997,9998,2024-10-30,vampire,54,forest,nightly disturbance,knife,night,cloak,2024,10,30
9998,9999,2024-10-30,witch,84,village,arson,knife,dusk,potions,2024,10,30


In [94]:
# export to csv file (disabled: uncomment the line below to write the cleaned data)

# df.to_csv('cleanup_Crime_Dataset.csv', index = False)