## Importing modules and dataset

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Gather the full path of every file below the Kaggle input directory.
# Building each path while its own directory is in hand keeps directory and
# file name paired; joining the loop variables after the walk could combine
# the last directory visited with a file that lives in a different one.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
if not input_paths:
    # Fail with a clear message instead of a NameError on an unset loop variable.
    raise FileNotFoundError("No input files found under '/kaggle/input'")

# As before, the last file encountered by the walk is the one that is loaded.
df = pd.read_csv(input_paths[-1])
df.head()

## Preprocessing

#### Drop unnecessary columns

In [None]:
# The 'Title' and 'Text' columns are not used by the analysis; df itself is left intact.
df1 = df.drop(columns=['Title', 'Text'])
df1.head()

#### Extract data from column names and rename them

In [None]:
# Keep the original column headers of df1 at hand; they are displayed here
# so the header text can be inspected before the columns are renamed.
columns = df1.columns
columns

In [None]:
# Headers at positions 1 and 8 are multi-line strings that are parsed further
# down into lookup dictionaries, so print them in full first.
M_reason, caw = columns[1], columns[8]
print(M_reason, caw, sep='\n\n')

In [None]:
# Short names, grouped by prefix: M_* , K_* and CAW_* columns, preceded by the city.
m_columns = ['M_reason', 'M_child', 'M_male', 'M_female']
k_columns = ['K_child', 'K_male', 'K_female']
caw_columns = ['CAW_crime', 'CAW_adult', 'CAW_child']
df1.columns = ['city'] + m_columns + k_columns + caw_columns

In [None]:
# Map each short column name to its original header so plots can show the
# full text. The two multi-line headers are removed from the mapping.
original_headers = df.drop(columns=['Title', 'Text']).columns
columns = dict(zip(df1.columns, original_headers))
for multi_line_key in ('M_reason', 'CAW_crime'):
    del columns[multi_line_key]
columns

In [None]:
# The K_child header spans several lines; only its second line is kept as the label.
# Guarding on the line count makes the cell safe to re-run: once the label has
# been reduced to a single line there is no second line left to pick, and an
# unconditional [1] would raise IndexError.
k_child_lines = columns['K_child'].splitlines()
if len(k_child_lines) > 1:
    columns['K_child'] = k_child_lines[1]
columns

In [None]:
# List the distinct values of every column to spot odd or malformed entries.
for column_name, column_values in df1.items():
    print(column_name, ':', column_values.unique())

In [None]:
# Every line after the first of the M_reason header is parsed as one option:
# leading digits/colons/spaces and trailing spaces/commas/periods are stripped.
reason_lines = M_reason.splitlines()[1:]
reason_labels = [line.lstrip('0123456789: ').rstrip(' ,.') for line in reason_lines]

# Label -> 1-based position in the header.
murder = {label: code for code, label in enumerate(reason_labels, start=1)}

# Replace two header spellings with alternative keys (the code is kept).
renamed_labels = (
    ('Property/Land Disputes', 'Property Disputes'),
    ('Unknown/other', 'Unknown reasons'),
)
for header_label, new_label in renamed_labels:
    murder[new_label] = murder.pop(header_label)
murder

In [None]:
# Every line after the first of the CAW_crime header is parsed as one category:
# leading digits/periods/spaces and trailing spaces/commas/periods are stripped.
crime_lines = caw.splitlines()[1:]
crime_labels = [line.lstrip('0123456789. ').rstrip(' ,.') for line in crime_lines]

# 1-based position in the header -> category label.
CAW = dict(enumerate(crime_labels, start=1))
CAW

#### Fill NaN values and set correct values and data types in columns

In [None]:
# Missing entries become the string '0'; later cells test M_reason against '0'
# and cast the exploded CAW_crime codes to integers.
df1 = df1.fillna('0')

In [None]:
# Victim-count columns are stored as uint8
# (assumes every count fits in 0-255 — TODO confirm against the data).
count_columns = ['M_child', 'M_male', 'M_female',
                 'K_child', 'K_male', 'K_female',
                 'CAW_adult', 'CAW_child']

# Label columns become categoricals; CAW_crime stays text because it holds
# comma-separated codes that are split later.
dtype_by_column = {'city': 'category', 'M_reason': 'category', 'CAW_crime': 'string'}
dtype_by_column.update(dict.fromkeys(count_columns, 'uint8'))

df1 = df1.astype(dtype_by_column)
# Alternative: let pandas infer the dtypes automatically with
# df1 = df1.convert_dtypes()

In [None]:
# Print the per-column dtypes and non-null counts to check the conversion.
df1.info()

In [None]:
# One indicator column per distinct comma-separated token in CAW_crime.
# Showing the column labels reveals tokens that differ only by whitespace.
temp = df1['CAW_crime'].str.get_dummies(sep=',')
temp.columns

In [None]:
# Rebuild the indicator frame from df1 so this cell gives the same result
# every time it is run (mutating the previous cell's frame in place would
# fail on a second run because the whitespace variants are already gone).
raw_dummies = df1.CAW_crime.str.get_dummies(',')

# Tokens such as ' 10' and '10' denote the same code. Strip the whitespace and
# convert to integers so that *any* padded variant is merged, not only the
# two that happen to occur in the current data.
codes = raw_dummies.columns.str.strip().astype('uint8').to_numpy()

# Sum indicator columns that share a code. Grouping the transposed frame
# avoids the deprecated groupby(axis=1); group keys come out sorted numerically.
temp = raw_dummies.T.groupby(codes).sum().T

temp.columns = ['CACAW_'+str(code) for code in temp.columns]
temp

In [None]:
# Swap the raw CAW_crime text column for the per-code indicator columns.
df1_without_codes = df1.drop(columns='CAW_crime')
df2 = pd.concat([df1_without_codes, temp], axis=1)
df2

## Visualising the dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Rows whose M_reason is '0' carry no murder reason (the NaN fill value).
reason_counts = df2['M_reason'].value_counts()
murder_share = 1 - reason_counts['0'] / reason_counts.sum()
print("Percentage of murder cases = ", murder_share*100)

# Leave the '0' bucket out of the chart so only actual reasons are shown.
reason_counts = reason_counts.drop('0')

fig, reason_ax = plt.subplots()
reason_ax.bar(reason_counts.index, reason_counts.values)
plt.xticks(rotation=45, ha='right')
reason_ax.set_title('Number of murders for different reasons')
reason_ax.set_ylabel('Count')
plt.show()

We see that 'Love Affairs' is the most common known reason for murders.

In [None]:
# Total every numeric column per city.
# numeric_only=True is required on pandas >= 2.0, where sum() no longer drops
# non-numeric columns silently and would raise on the categorical M_reason.
# It also guarantees the column layout (M_*, K_*, CAW_*, CACAW_*) that the
# following cells rely on. observed=False pins the current default for the
# categorical 'city' key, so every category keeps a row.
temp_df = df2.groupby('city', observed=False).sum(numeric_only=True)
temp_df

In [None]:
# One panel per crime type: (panel title, victim-count columns to plot).
# Columns are selected by name rather than by position so the figure does not
# silently change if the column layout of temp_df ever shifts.
panels = [
    ("Murder Cases Victims", ['M_child', 'M_male', 'M_female']),
    ("Kidnapping Cases Victims", ['K_child', 'K_male', 'K_female']),
    ("Crime Against Women Cases Victims", ['CAW_adult', 'CAW_child']),
]

fig, ax = plt.subplots(figsize=(20,8), ncols=len(panels))

for panel_ax, (panel_title, panel_columns) in zip(ax, panels):
    temp_df[panel_columns].plot(kind='bar', rot=0, ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Show the original column headers instead of the short names.
    panel_ax.legend([columns[col] for col in panel_columns])

# Adjust spacing only after everything is drawn so that titles, tick labels
# and legends are taken into account.
fig.tight_layout()
plt.show()

In [None]:
# Keep everything from the tenth column onwards
# (presumably the CACAW_* indicator totals without CACAW_0 — verify against temp_df.columns).
trailing_columns = temp_df.columns[9:]
temp = temp_df[trailing_columns]
temp

In [None]:
# Transpose so each category is a group on the x-axis with one bar per city.
temp.T.plot.bar(
    figsize=(20, 8),
    title='Number of Crime Against Women in each category',
    rot=45,
)

We see that a lot of cases fall into the categories CACAW_7, CACAW_8 and CACAW_10, i.e. 'Rape', 'Assault on Women with Intent to Outrage her Modesty' and 'Protection of Children from Sexual Offences Act'.

In [None]:
# Work on a copy so the reshaping in the next cells leaves df1 untouched.
df3 = df1.copy()

In [None]:
# Remove spaces, split the comma-separated codes into lists, then give every
# code its own row (the other columns are repeated for each code).
crime_code_lists = df3['CAW_crime'].str.replace(" ", "").str.split(',')
df3 = df3.assign(CAW_crime=crime_code_lists).explode('CAW_crime')
df3.head()

In [None]:
# After explode the index repeats once per code; move it into a regular
# column named 'case_number' and store the crime codes as integers.
with_case_number = df3.reset_index()
with_case_number = with_case_number.rename(columns={'index': 'case_number'})
df3 = with_case_number.astype({'CAW_crime': 'uint8'})
df3.head()

In [None]:
# Average victim counts per crime-against-women code.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on the
# categorical 'city' and 'M_reason' columns instead of silently skipping them.
mean_by_crime = df3.groupby('CAW_crime').mean(numeric_only=True)

# case_number is only an identifier, so its mean is meaningless; codes 0 and 11
# are left out of the comparison (0 is the fill value for "no crime recorded").
temp_df = mean_by_crime.drop(columns=['case_number'], index=[0, 11])
temp_df

In [None]:
# Grouped bars: one group per crime code, one bar per victim-count column.
bar_ax = temp_df.plot(kind='bar', figsize=(15,8))

# Replace the numeric codes on the x-axis with the category labels.
bar_ax.set_xticks(np.arange(temp_df.shape[0]))
bar_ax.set_xticklabels(list(CAW.values()), rotation=45, ha='right')

# Legend entries use the original column headers instead of the short names.
bar_ax.legend([columns[col] for col in temp_df.columns])
plt.show()

#### We see acid attacks leaving a lot of victims per case

In [None]:
# Keep only rows that record a murder reason or a crime-against-women code
# (both '0' / 0 are the fill values for "nothing recorded").
has_murder_reason = df3['M_reason'] != '0'
has_caw_code = df3['CAW_crime'] != 0
df4 = df3.loc[has_murder_reason | has_caw_code, ['M_reason', 'CAW_crime']]
df4.head()

In [None]:
# Each point marks a (murder reason, crime-against-women code) pair that
# occurs in at least one row of df4.
fig, scatter_ax = plt.subplots(figsize=(15,8))
scatter_ax.scatter(df4.M_reason, df4.CAW_crime)

# Label the y-axis with the category names instead of their numeric codes.
scatter_ax.set_yticks(np.arange(1, len(CAW) + 1))
scatter_ax.set_yticklabels(list(CAW.values()))
scatter_ax.grid('on')
plt.show()

#### We see that some murders due to 'Family Dispute' occur in connection with other crimes against women in the category 'Cruelty by Husband/in-laws((Sec.498 A IPC)'

We notice that a lot of the data in the dataset is incorrect, such as the city name and the number of victims per case.<br>
The first issue can be resolved by extracting the first word from Text to get the correct city name.<br>
The second issue might require manual extraction of data from the text and title.<br>
It could also be done by using deep learning models to classify the text of the title.<br>