In [None]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the compiled crime CSV and discard the "Title" and "Text" columns
crime_data = pd.read_csv(
    "../input/up-crime-data-year-2019/MR data - Compiled Data Set.csv"
).drop(columns=["Title", "Text"])

# Remaining column labels; used below to split the columns by dtype
cols = crime_data.columns

In [None]:
# Percentage of all cells in the table that are missing
nrow, ncol = crime_data.shape
100 * crime_data.isnull().sum().sum() / (nrow * ncol)

Approximately 76% of the data is missing. **HUGE!**
This requires extensive preprocessing.

In [None]:
# Split the column labels by dtype: "object" (string) columns versus all the
# others (numeric). The two groups are processed separately.
# Both lists follow the DataFrame's column order. A set difference would give
# num_cols an arbitrary order that can change from one kernel session to the
# next, making the printed output non-reproducible.
# Later cells pick columns from str_cols by position (str_cols[1], str_cols[2]).
str_cols = [col for col in cols if crime_data[col].dtype == "object"]
num_cols = [col for col in cols if col not in str_cols]
print(num_cols,"\n\n",str_cols)

In [None]:
# Numeric columns: a missing value is treated as "not involved" and becomes 0
crime_data[num_cols] = crime_data[num_cols].fillna(value=0)

In [None]:
# str_cols[1]: give missing entries an explicit label instead of leaving NaN
crime_data[str_cols[1]] = crime_data[str_cols[1]].fillna(value="Missing or No Murder Involved")

In [None]:
# str_cols[2]: use 0 as a placeholder for missing entries; the code "0" is
# turned into a descriptive label later on
crime_data[str_cols[2]] = crime_data[str_cols[2]].fillna(value=0)

In [None]:
# A few rows hold the value "11" in the str_cols[2] column. Judging by the
# column name that value is not possible, so those rows are removed.
ind = crime_data.index[crime_data[str_cols[2]].isin(["11"])]
crime_data = crime_data.drop(index=ind)

# Keep the tracked row count in step with the rows just removed
nrow -= len(ind)

In [None]:
# Renumber the rows 0..n-1 so the labels are contiguous again after the removal
crime_data.index = pd.RangeIndex(len(crime_data.index))

In [None]:
# Temporary frame of indicator columns, one per code '0' .. '10', into which the
# values of column str_cols[2] are separated. It has nrow rows, all False to start.
df2 = pd.DataFrame({str(code): [False] * nrow for code in range(11)})

In [None]:
# Crucial step: split the comma-separated codes of column str_cols[2] and switch
# on the matching indicator in df2 for every row.
# Row labels run 0..n-1 after the index reset, so the position is the label.
for i, raw_codes in enumerate(crime_data[str_cols[2]]):
    for code in str(raw_codes).split(","):
        df2.loc[i, code.strip()] = True

In [None]:
# Code -> descriptive name, used to relabel the columns of df2.
# The position of each label in the tuple is its code ('0' .. '10').
# NOTE: the name `dict` hides Python's built-in dict type from here on; it is
# kept because the column-renaming cell looks the mapping up under this name.
dict = {
    str(code): label
    for code, label in enumerate((
        "Missing or not applicable",
        "Murder with Rape",
        "Dowry Deaths",
        "Suicide",
        "Kidnapping",
        "Acid Attack",
        "Cruelty by Husband/in-laws",
        "Rape only",
        "Assault on Women with Intent to Outrage her Modesty",
        "Cyber Crimes against Women",
        "Protection of Children from Sexual Offences Act",
    ))
}

In [None]:
# Replace df2's code labels with their descriptive names
new_cols = [dict[code] for code in df2.columns]
df2.columns = new_cols

In [None]:
# Column str_cols[2] has been separated into df2, so remove it from the main frame
crime_data = crime_data.drop(columns=str_cols[2])

In [None]:
# Place the indicator columns of df2 next to the remaining columns to form the final frame
crime_data = pd.concat([crime_data, df2], axis="columns")

In [None]:
# Display the final cleaned DataFrame (the remaining original columns plus the
# indicator columns concatenated from df2)
crime_data

In [None]:
# Count of recorded crimes per city, drawn from the "City " column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=crime_data, x="City ", kde=False, linewidth=1.5, ax=ax)

In [None]:
# Count of each recorded reason of murder (if any); the data is the frame's second column
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(x=crime_data.iloc[:, 1], kde=False, linewidth=1.5, ax=ax)
ax.set(xlabel="Reasons of Murder if any", ylabel="Count")