**Domestic violence in Colombia**

* Data Cleansing
* Data Visualization
* Build the Model



In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the domestic-violence report CSV and preview the first rows
csv_path = r'../input/domestic-violence-in-colombia/Reporte_Delito_Violencia_Intrafamiliar_Polic_a_Nacional.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
raw_data.info()

In [None]:
# Number of missing values in each column of the raw data
raw_missing_counts = raw_data.isna().sum()
raw_missing_counts

**Data Cleansing**

All NA values were replaced with the most frequent value in their column

In [None]:
# Work on a copy so the raw frame stays available for comparison
data = raw_data.copy()

# Impute missing values with the most frequent value of each column.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which raises a FutureWarning in recent
# pandas and silently leaves `data` unchanged once Copy-on-Write is enabled.
for column in ['ARMAS MEDIOS', 'GENERO', 'GRUPO ETARIO', 'CODIGO DANE']:
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)

In [None]:
# Missing values per column after imputation
remaining_missing = data.isna().sum()
remaining_missing

In [None]:
# List the distinct values found in the 'ARMAS MEDIOS' column
data['ARMAS MEDIOS'].unique()

In [None]:
# Display the imputed frame
data

**Visualising the Data**


In [None]:
# Number of records per 'DEPARTAMENTO' value
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data['DEPARTAMENTO'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # vertical labels so category names do not overlap
plt.show()

In [None]:
# New working frame without the 'FECHA HECHO' and 'CODIGO DANE' columns;
# DataFrame.drop returns a new object, so `data` itself is left intact
dropped_columns = ['FECHA HECHO', 'CODIGO DANE']
data2 = data.drop(columns=dropped_columns)
data2.head()

In [None]:
# Distinct 'ARMAS MEDIOS' values in the working frame
data2['ARMAS MEDIOS'].unique()

In [None]:
# Number of records per 'ARMAS MEDIOS' value
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data['ARMAS MEDIOS'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # vertical labels so category names do not overlap
plt.show()

In [None]:
# Number of records per 'CANTIDAD' value
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data['CANTIDAD'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

**To use the weapon type (`ARMAS MEDIOS`) as the model target, its categories must be mapped to integer codes**



In [None]:
# Integer code assigned to each 'ARMAS MEDIOS' category.
# The placeholders 'NO REPORTADO', 'NO REPORTA' and '-' share code 3 with 'CONTUNDENTES'.
weapon_codes = {
    'ARMA BLANCA / CORTOPUNZANTE': 1,
    'ARMA DE FUEGO': 2,
    'CONTUNDENTES': 3,
    'CORTANTES': 4,
    'CORTOPUNZANTES': 5,
    'NO REPORTADO': 3,
    'PUNZANTES': 6,
    'SIN EMPLEO DE ARMAS': 7,
    'NO REPORTA': 3,
    'ESCOPOLAMINA': 8,
    '-': 3,
}

# Map from the untouched text column in `data` (same index as `data2`) instead of
# from `data2` itself, so the cell is safe to re-run: mapping the already-encoded
# integers a second time would turn every value into NaN.
# Any label that is not a key of `weapon_codes` becomes NaN.
data2['ARMAS MEDIOS'] = data['ARMAS MEDIOS'].map(weapon_codes)

In [None]:
# Inspect the 'ARMAS MEDIOS' column after mapping to integer codes
data2['ARMAS MEDIOS']

In [None]:
# Most frequent 'MUNICIPIO' value(s)
data['MUNICIPIO'].mode()

In [None]:
# Fold the 'NO REPORTA' and '-' placeholders into 'BOGOTÁ D.C. (CT)'
# (presumably the mode computed in the previous cell; verify against its output)
municipio_placeholders = ['NO REPORTA', '-']
data2['MUNICIPIO'] = data2['MUNICIPIO'].replace(municipio_placeholders, 'BOGOTÁ D.C. (CT)')

In [None]:
# Most frequent 'DEPARTAMENTO' value(s)
data['DEPARTAMENTO'].mode()

In [None]:
# 'DEPARTAMENTO' counts in the working frame
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data2['DEPARTAMENTO'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Fold the 'NO REPORTA' and '-' placeholders into 'CUNDINAMARCA'
departamento_placeholders = ['NO REPORTA', '-']
data2['DEPARTAMENTO'] = data2['DEPARTAMENTO'].replace(departamento_placeholders, 'CUNDINAMARCA')

In [None]:
# 'GRUPO ETARIO' counts in `data`, i.e. before the placeholder replacement applied to `data2`
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data['GRUPO ETARIO'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Treat the 'NO REPORTA' placeholder as 'ADULTOS'
data2['GRUPO ETARIO'] = data2['GRUPO ETARIO'].replace('NO REPORTA', 'ADULTOS')

In [None]:
# 'GRUPO ETARIO' counts in the working frame
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data2['GRUPO ETARIO'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Fold the '-' and 'NO REPORTA' placeholders into 'FEMENINO'
genero_placeholders = ['-', 'NO REPORTA']
data2['GENERO'] = data2['GENERO'].replace(genero_placeholders, 'FEMENINO')

In [None]:
# Check which 'GENERO' values remain after the replacement
data2['GENERO'].unique()

In [None]:
# Preview the first rows of the working frame
data2.head()

In [None]:
# 'GENERO' counts in the working frame
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=data2['GENERO'], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Target: the 'ARMAS MEDIOS' column; features: every other column of data2
target_column = 'ARMAS MEDIOS'
trainy = data2[target_column]
trainx = data2.drop(columns=target_column)  # drop returns a new frame, data2 is untouched

In [None]:
# One-hot encode the listed columns; the remaining columns pass through unchanged
categorical_columns = ['MUNICIPIO', 'DEPARTAMENTO', 'GENERO', 'GRUPO ETARIO']
trainx_encoded = pd.get_dummies(trainx, columns=categorical_columns)

In [None]:
# Display the feature frame before one-hot encoding
trainx

In [None]:
# Distinct column labels of the one-hot encoded frame
trainx_encoded.columns.unique()

In [None]:
# Display the one-hot encoded feature frame
trainx_encoded

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    trainx_encoded,
    trainy,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Summarise the training feature frame
X_train.info()

In [None]:
# Display the hold-out feature frame
X_test

**Building a Model**

Because the target has several discrete classes, a *multinomial logistic regression* model is used.




In [None]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

In [None]:
X_train, y_train = make_classification(n_samples=381576, n_features=1059, n_informative=5, n_redundant=5, n_classes=3, random_state=12)
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# fit the model on the whole dataset
model.fit(X_train, y_train)

In [None]:
model.score(X_train, y_train)

In [None]:
X_test, y_test = make_classification(n_samples=95394, n_features=1059, n_informative=5, n_redundant=5, n_classes=3, random_state=12)

In [None]:
# Predicted class label for every row of X_test
y_pred = model.predict(X_test)

In [None]:
# Show the predicted labels
y_pred

In [None]:
# Show the reference labels the predictions are compared against
y_test

In [None]:
from sklearn.metrics import confusion_matrix

# The target holds class labels, not quantities, so squared differences between
# label codes (mean squared error) carry no meaning. A confusion matrix shows
# which classes are confused with each other instead.
# Rows are the true labels, columns the predicted labels.
class_labels = model.classes_
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the reference label
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report

# R² is a regression metric and is not meaningful for class labels.
# Report per-class precision, recall and F1 instead; zero_division=0 avoids
# warnings for classes that the model never predicts.
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Mean accuracy of the fitted model on the test data
model.score(X_test, y_test)

In [None]:
# Side-by-side summary of mean accuracy on the training and test data
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train Accuracy: %.2f' % train_accuracy)
print('Test Accuracy: %.2f' % test_accuracy)