In [13]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
# NOTE(review): this binds the top-level matplotlib package to `plt`; the
# plotting API is conventionally `import matplotlib.pyplot as plt` - confirm intent.
import matplotlib as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
# NOTE(review): duplicate of the import on the previous line.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [14]:
# Directory holding train.csv and test.csv. The default is the original
# author's local folder; set the MLP_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('MLP_DATA_DIR', '/Users/sarthaksinghgaur/iitm study/MLPp')

# Labelled training data and the separate test set.
data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [15]:
# Separate the target column from the predictors. `drop` returns a new frame,
# so `data` itself keeps the Crime_Category column.
data_labels = data['Crime_Category'].copy()
data_features = data.drop(columns=["Crime_Category"])

In [16]:
# Negative ages are treated as missing: blank them out (NaN) and leave every
# other value, including existing NaNs, unchanged.
victim_age = data_features['Victim_Age']
data_features['Victim_Age'] = victim_age.mask(victim_age < 0)

In [17]:
# Keep only the codes "M", "F" and "X"; any other value (including values that
# are already missing) becomes NaN.
victim_sex = data_features['Victim_Sex']
data_features['Victim_Sex'] = victim_sex.where(victim_sex.isin(["M", "F", "X"]))

In [18]:
# Both date columns use the same 12-hour timestamp layout; parse them with an
# explicit format rather than letting pandas infer one.
datetime_format = '%m/%d/%Y %I:%M:%S %p'
for date_column in ('Date_Reported', 'Date_Occurred'):
    data_features[date_column] = pd.to_datetime(data_features[date_column], format=datetime_format)

In [19]:
# Expand each parsed date into year / month / day-of-month / day-of-week
# columns. The mapping is ordered so the new columns are appended in the same
# sequence as before (all Reported_* columns first, then all Occurred_*).
date_part_columns = {
    'Date_Reported': ('Reported_Year', 'Reported_Month', 'Reported_Day', 'Reported_DayOfWeek'),
    'Date_Occurred': ('Occurred_Year', 'Occurred_Month', 'Occurred_Day', 'Occurred_DayOfWeek'),
}
for source_column, (year_col, month_col, day_col, weekday_col) in date_part_columns.items():
    date_accessor = data_features[source_column].dt
    data_features[year_col] = date_accessor.year
    data_features[month_col] = date_accessor.month
    data_features[day_col] = date_accessor.day
    data_features[weekday_col] = date_accessor.dayofweek

In [20]:
# Render Time_Occurred as a zero-padded 4-character string, then read the
# first two characters as the hour and the last two as the minute.
time_as_text = data_features['Time_Occurred'].astype(int).astype(str).str.zfill(4)
data_features['Time_Occurred'] = time_as_text
data_features['Occurred_Hour'] = time_as_text.str[:2].astype(int)
data_features['Occurred_Minute'] = time_as_text.str[2:4].astype(int)

In [29]:
# Remove the raw date/time columns (already expanded into separate features
# above) together with the free-text / descriptive columns that are not
# passed on to the model.
columns_to_drop = [
    'Cross_Street',
    'Date_Reported',
    'Date_Occurred',
    'Time_Occurred',
    'Area_Name',
    'Status_Description',
    'Premise_Description',
    'Weapon_Description',
]
data_features = data_features.drop(columns=columns_to_drop)

In [31]:
# Treat the weapon code as a categorical label: cast it to text, then turn the
# literal 'nan' strings produced by casting missing values back into real NaN
# so the imputer further down can see them.
#
# The result is assigned back to the column explicitly. The previous
# `data_features.Weapon_Used_Code.replace(..., inplace=True)` ran on a
# temporary Series; pandas warns about that chained in-place call and, with
# Copy-on-Write enabled, it no longer modifies `data_features` at all.
data_features['Weapon_Used_Code'] = (
    data_features['Weapon_Used_Code']
    .astype(str)
    .replace('nan', np.nan)
)

In [33]:
# Two imputation strategies: one fills missing values with the fixed label
# 'Unknown', the other with the column's most frequent value.
imputer_mode = SimpleImputer(strategy='most_frequent')
imputer_constant = SimpleImputer(fill_value='Unknown', strategy='constant')

In [35]:
# Columns whose missing values are replaced with the constant 'Unknown' label.
imputer_constant_columns = [
    'Modus_Operandi',
    'Weapon_Used_Code',
    'Victim_Sex',
    'Victim_Descent',
]
# Columns whose missing values are replaced with the most frequent value.
imputer_mode_columns = ['Victim_Age']

In [37]:
# Impute one column at a time, refitting the imputer on that column alone so
# each column's fill value is computed independently. Constant-filled columns
# are processed first, then the most-frequent ones.
imputation_plan = (
    (imputer_constant, imputer_constant_columns),
    (imputer_mode, imputer_mode_columns),
)
for imputer, target_columns in imputation_plan:
    for column_name in target_columns:
        single_column = data_features[[column_name]]
        data_features[[column_name]] = imputer.fit_transform(single_column)

In [39]:
# Modus_Operandi holds whitespace-separated codes in a single string; turn
# each entry into a list of individual codes.
data_features['Modus_Operandi'] = data_features['Modus_Operandi'].str.split()

In [41]:
# Multi-hot encode the Modus_Operandi code lists: one 0/1 column per distinct
# code, named 'Modus_Operandi:<code>'.
multilabelbinarizer = MultiLabelBinarizer()
binarized_modus_operandi = multilabelbinarizer.fit_transform(data_features['Modus_Operandi'])
binarized_modus_operandi_cols = ['Modus_Operandi:' + name for name in multilabelbinarizer.classes_]
# Reuse data_features' index so the index-based join that follows lines rows
# up correctly even if data_features is not indexed 0..n-1 (for example after
# rows have been filtered out). A default RangeIndex would silently misalign.
binarized_modus_operandi_df = pd.DataFrame(
    binarized_modus_operandi,
    index=data_features.index,
    columns=binarized_modus_operandi_cols,
)

In [43]:
# Swap the list-valued Modus_Operandi column for its multi-hot indicator
# columns; the join matches rows by index.
data_features = (
    data_features
    .drop(columns='Modus_Operandi')
    .join(binarized_modus_operandi_df)
)

In [45]:
# Categorical columns to one-hot encode.
# (TF-IDF over Premise_Description / Weapon_Description was tried at some
# point; it is disabled and those columns are dropped earlier in the notebook.)
onehotencoded_cols = [
    'Location',
    'Area_ID',
    'Reporting_District_no',
    'Victim_Sex',
    'Victim_Descent',
    'Premise_Code',
    'Weapon_Used_Code',
    'Status',
]

In [47]:
# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# (the old name was deprecated in 1.2 and removed in 1.4, where it raises
# TypeError). Try the new name first and fall back to the old one so this cell
# runs on either side of the rename. Either way the encoder returns a dense array.
try:
    onehotencoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehotencoder = OneHotEncoder(sparse=False)

# One-hot encode the categorical columns; every other column is passed
# through unchanged and placed after the encoded ones.
column_transformer = ColumnTransformer(
    transformers=[('onehotencoder', onehotencoder, onehotencoded_cols)],
    remainder='passthrough',
)
data_features_transformed = column_transformer.fit_transform(data_features)
# Wrap the resulting array in a DataFrame (default integer column labels).
data_features_transformed = pd.DataFrame(data_features_transformed)



In [49]:
# sns.set()
# data.Crime_Category.hist()

In [51]:
# split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 69)
# for train_index, test_index in split.split(data_features_transformed, data_labels):
#   train_data = data_features_transformed.loc[train_index]
#   test_data = data_features_transformed.loc[test_index]
# print(train_data.shape, test_data.shape)

In [53]:
# strat_dist = test_data["Crime_Category"].value_counts() / len(test_data)
# overall_dist = data["Crime_Category"].value_counts() / len(data)
# dist_comparison = pd.DataFrame({'overall': overall_dist, 'stratified': strat_dist})
# dist_comparison['diff(s-o)'] = dist_comparison['stratified'] - dist_comparison['overall']
# dist_comparison['diff(s-o)_pct'] = 100*(dist_comparison['diff(s-o)']/dist_comparison['overall'])
# dist_comparison

In [55]:
# Hold out 20% of the rows for validation. The seed is fixed so the split is
# repeatable; note that no `stratify` argument is passed, so class proportions
# are not forced to match between the two parts.
X_train, X_val, y_train, y_val = train_test_split(
    data_features_transformed,
    data_labels,
    random_state=69,
    test_size=0.20,
)

In [58]:
# Random forest with library-default hyperparameters; only the seed is fixed
# so repeated runs build the same trees.
randomforestclassifier = RandomForestClassifier(random_state=69)
randomforestclassifier.fit(X=X_train, y=y_train)

In [59]:
# Predicted crime category for every validation row.
y_pred = randomforestclassifier.predict(X=X_val)

In [60]:
# Share of validation rows whose predicted category equals the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.93
