<img src= "https://www.aljazeera.com/wp-content/uploads/2020/08/853cae263ce746368d5ccae0c5027a39_18.jpeg?resize=770%2C513">

# Topics

1. Importing Libraries
2. Data Reading, Formatting and Cleaning
3. Analysis and Visualization
4. Conclusion

# Importing Libraries

In [None]:
# Libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import re

import warnings
warnings.filterwarnings('ignore')

# Data Reading, Formatting and Cleaning

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the raw CSV into a DataFrame and preview its first rows

csv_path = '/kaggle/input/violence-against-women-turkey-20082021/Violence Against Women_Turkey.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Share of cases per province, limited to the ten most frequent provinces

province_share = df['Province'].value_counts(normalize=True)
province_share.head(10)

In [None]:
# Drop the 'Unnamed: 0' column (noted by the author as the empty first column)

df.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# Show the rows in which every single column is missing

fully_missing_rows = df.isna().all(axis=1)
df[fully_missing_rows]

In [None]:
# Remove, in place, the rows whose values are all missing

df.dropna(axis='index', how='all', inplace=True)

In [None]:
# Print a summary of the frame (column dtypes and non-null counts)

df.info()

In [None]:
# Percentage of missing values per column, rounded to whole percent

missing_counts = df.isna().sum()
missing_pct = 100 * missing_counts / len(df)
missing_pct.round()

In [None]:
# 'Date' is dropped because about half of its values are missing, so a timeline
# built on it would be misleading; 'Name' is dropped because nothing useful can
# be extracted from it.

df.drop(columns=['Date', 'Name'], inplace=True)

In [None]:
# Remove every column in which more than 60% of the values are missing.
# The missing share is computed for all columns up front and the affected
# columns are dropped in one call, so the frame is not modified while its
# columns are being iterated.

missing_share = df.isna().sum() / len(df)
sparse_columns = missing_share[missing_share > 0.60].index
df.drop(columns=sparse_columns, inplace=True)

In [None]:
# Rechecking Missing value percentage in each column

round(100*df.isnull().sum()/len(df))

We will impute the remaining missing values of `News Source 1` as `Unknown Source` instead of removing that column.

#### Murder Motive

In [None]:
# Strip leading and trailing whitespace from the 'Murder Motive' values

df['Murder Motive'] = df['Murder Motive'].str.strip()

In [None]:
# Map unclear or inconsistently written motives onto readable labels

motive_renames = {
    'Not Determined': 'Unknown',
    'For': 'Unknown',
    "Men's something he wanted to happen because": 'Unknown',
    'Separation Request': 'Divorce Request',
    'Up rejected': 'Rejection',
    'house': 'House',
}
df['Murder Motive'] = df['Murder Motive'].replace(motive_renames)

In [None]:
# Keep the ten most frequent motives and relabel every other value as 'Other Motives'

top_motives = df['Murder Motive'].value_counts().index[:10]
distinct_motives = pd.Series(df['Murder Motive'].unique())
insignificant_motives = list(distinct_motives[~distinct_motives.isin(top_motives)])


def _bucket_motive(motive):
    """Return 'Other Motives' for a value outside the top ten, else the value itself."""
    if motive in insignificant_motives:
        return 'Other Motives'
    return motive


df['Murder Motive'] = df['Murder Motive'].apply(_bucket_motive)

In [None]:
# Relative frequency of each 'Murder Motive' value after the clean-up

df['Murder Motive'].value_counts(normalize = True)

#### Murderer Name

In [None]:
# Strip leading and trailing whitespace from the 'Murderer Name' values

df['Murderer Name'] = df['Murderer Name'].str.strip()

In [None]:
# Treat 'Not Determined' and 'someone unfamiliar' as the single label 'Unknown'

murderer_renames = {
    'Not Determined': 'Unknown',
    'someone unfamiliar': 'Unknown',
}
df['Murderer Name'] = df['Murderer Name'].replace(murderer_renames)

In [None]:
# Keep the ten most frequent murderer labels and relabel every other value as 'Other People'

top_people = df['Murderer Name'].value_counts().index[:10]
distinct_people = pd.Series(df['Murderer Name'].unique())
insignificant_people = list(distinct_people[~distinct_people.isin(top_people)])


def _bucket_person(person):
    """Return 'Other People' for a value outside the top ten, else the value itself."""
    if person in insignificant_people:
        return 'Other People'
    return person


df['Murderer Name'] = df['Murderer Name'].apply(_bucket_person)

In [None]:
# Relative frequency of each 'Murderer Name' value after the clean-up

df['Murderer Name'].value_counts(normalize = True)

#### Protection Request

In [None]:
# Strip leading and trailing whitespace from the 'Protection Request' values

df['Protection Request'] = df['Protection Request'].str.strip()

In [None]:
# Fold 'Unknown', 'Unrealized' and 'Where' into 'Not Determined' and capitalise 'no'

request_renames = {
    'Unknown': 'Not Determined',
    'Unrealized': 'Not Determined',
    'Where': 'Not Determined',
    'no': 'No',
}
df['Protection Request'] = df['Protection Request'].replace(request_renames)

In [None]:
# Keep the three most frequent request values and relabel every other value as 'Other Request'

top_requests = df['Protection Request'].value_counts().index[:3]
distinct_requests = pd.Series(df['Protection Request'].unique())
insignificant_request = list(distinct_requests[~distinct_requests.isin(top_requests)])


def _bucket_request(request):
    """Return 'Other Request' for a value outside the top three, else the value itself."""
    if request in insignificant_request:
        return 'Other Request'
    return request


df['Protection Request'] = df['Protection Request'].apply(_bucket_request)

In [None]:
# Relative frequency of each 'Protection Request' value after the clean-up

df['Protection Request'].value_counts(normalize = True)

#### Way Of Killing

In [None]:
# Strip leading and trailing whitespace from the 'Way Of Killing' values

df['Way Of Killing'] = df['Way Of Killing'].str.strip()

In [None]:
# Rename 'Not Determined' to 'Unknown' and normalise the casing of 'DARPA' to 'Darpa'

way_renames = {
    'Not Determined': 'Unknown',
    'DARPA': 'Darpa',
}
df['Way Of Killing'] = df['Way Of Killing'].replace(way_renames)

In [None]:
# Keep the nine most frequent ways of killing and relabel every other value as 'Other Ways'

top_ways = df['Way Of Killing'].value_counts().index[:9]
distinct_ways = pd.Series(df['Way Of Killing'].unique())
insignificant_ow = list(distinct_ways[~distinct_ways.isin(top_ways)])


def _bucket_way(way):
    """Return 'Other Ways' for a value outside the top nine, else the value itself."""
    if way in insignificant_ow:
        return 'Other Ways'
    return way


df['Way Of Killing'] = df['Way Of Killing'].apply(_bucket_way)

In [None]:
# Relative frequency of each 'Way Of Killing' value after the clean-up

df['Way Of Killing'].value_counts(normalize = True)

#### News Source 1

In [None]:
# Strip leading and trailing whitespace from the 'News Source 1' values

df['News Source 1'] = df['News Source 1'].str.strip()

In [None]:
# Fill missing news sources with the placeholder 'Unknown Source'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and, with pandas Copy-on-Write, no longer updates `df`, which
# would leave NaN values in the column.

df['News Source 1'] = df['News Source 1'].fillna('Unknown Source')

In [None]:
# Extracting the domain name

def domain_name(url):
    """Return the first host label of a news-source URL.

    The scheme (everything up to the first ``//``), anything after the host
    (path and later) and a leading ``www.`` are removed; the text up to the
    first remaining dot is returned. Text that contains none of these
    separators, such as the ``'Unknown Source'`` placeholder, is returned
    unchanged.

    Args:
        url: Source URL or placeholder text as a ``str``.

    Returns:
        The label before the first dot of the host, e.g. ``'milliyet'`` for
        ``'https://www.milliyet.com.tr/x'``.
    """
    # Only the first '//' terminates the scheme; later ones belong to the path.
    host = url.split("//", 1)[-1]
    # Cut the path off so a 'www.' or '//' inside it cannot replace the host.
    host = host.split("/", 1)[0]
    if host.startswith("www."):
        host = host[len("www."):]
    return host.split(".")[0]

# Replace every source value with the result of domain_name
df['News Source 1'] = df['News Source 1'].apply(domain_name)

In [None]:
# Keep the ten most frequent sources and relabel every other value as 'Other Source'

top_sources = df['News Source 1'].value_counts().index[:10]
distinct_sources = pd.Series(df['News Source 1'].unique())
insignificant_source = list(distinct_sources[~distinct_sources.isin(top_sources)])


def _bucket_source(source):
    """Return 'Other Source' for a value outside the top ten, else the value itself."""
    if source in insignificant_source:
        return 'Other Source'
    return source


df['News Source 1'] = df['News Source 1'].apply(_bucket_source)

In [None]:
# Relative frequency of the news sources, limited to the ten most frequent

source_share = df['News Source 1'].value_counts(normalize=True)
source_share.head(10)

In [None]:
# Preview the first rows of the cleaned data frame

df.head()

# Analysis and Visualization

In [None]:
# Draw a 2x2 grid of horizontal count plots, one for each of the first four
# columns of df, with the bars ordered from most to least frequent value.

figure = plt.figure(figsize=(15, 10))
plt.suptitle('Top Reasons for', fontsize=25)

for idx, col in enumerate(df.columns[:4]):
    plt.subplot(2, 2, idx + 1)
    plt.title(f"{col}", fontsize=20)

    sns.countplot(y=col, data=df, palette='Set1', order=df[col].value_counts().index)

    # Text properties must be spelled in lowercase: Matplotlib >= 3.5 rejects
    # the capitalised 'Fontsize' keyword.
    plt.ylabel(col, fontsize=20)
    plt.xlabel('Frequency', fontsize=20)

    plt.xticks(rotation='horizontal', fontsize=15)
    plt.yticks(fontsize=15)

# Lay the figure out once, after all four subplots exist.
figure.tight_layout(pad=3)

In [None]:
# Horizontal count plot of 'News Source 1', bars ordered from most to least
# frequent source.

sns.countplot(
    y='News Source 1',
    data=df,
    palette='Set1',
    order=df['News Source 1'].value_counts().index,
)

plt.title('Reports published by Source', fontsize=20)
# Text properties must be spelled in lowercase: Matplotlib >= 3.5 rejects the
# capitalised 'Fontsize' keyword.
plt.ylabel('News Source 1', fontsize=20)
plt.xlabel('Frequency', fontsize=20)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.show()

# Conclusion
- Based on the available data, the `Istanbul` province has the highest number of cases
- Among the known motives, `Discussion` accounts for **22%** of murders; the many small motives collectively grouped as `Other Motives` also appear to be a primary cause
- `Husband` is the primary murderer, accounting for **32%** of murder cases
- In **87%** of cases there is `No` protection request
- Most women were killed with a `Firearm` (**44.6%** of cases) or with `Cutting Tools` (**22.6%** of cases)
- `Milliyet` published **11%** of the reports, and the other small, non-primary sources collectively published **21.7%**

## My Other Works

**Internet Usage: EDA and Cluster Analysis**: https://www.kaggle.com/vishalraibagi/internet-usage-eda-and-cluster-analysis
<br>

**Price Class Classification: SweetViz & 5 Models**: https://www.kaggle.com/vishalraibagi/price-class-classification-sweetviz-5-models
<br>
<br>
<br>

## Upvote if you like my work❤️
## If you have any queries, doubts or suggestions, feel free to drop them in the comment section