In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [9]:
data = pd.read_csv('/Users/sarthaksinghgaur/iitm study/MLPp/train.csv')
test_data = pd.read_csv('/Users/sarthaksinghgaur/iitm study/MLPp/test.csv')

In [10]:
data_features = data.drop("Crime_Category", axis=1)
data_labels = data['Crime_Category'].copy()

In [11]:
def data_features_preprocessor(data_features, multilabel_binarizer=None, onehot_encoder=None):
    
    data_features.Victim_Age = data_features.Victim_Age.apply(lambda x: np.nan if x < 0 else x)
    data_features.Victim_Sex = data_features.Victim_Sex.apply(lambda x: np.nan if x not in ["M", "F", "X"] else x)
    data_features.Date_Reported = pd.to_datetime(data_features.Date_Reported, format='%m/%d/%Y %I:%M:%S %p')
    data_features.Date_Occurred = pd.to_datetime(data_features.Date_Occurred, format='%m/%d/%Y %I:%M:%S %p')
    
    data_features['Reported_Year'] = data_features.Date_Reported.dt.year
    data_features['Reported_Month'] = data_features.Date_Reported.dt.month
    data_features['Reported_Day'] = data_features.Date_Reported.dt.day
    data_features['Reported_DayOfWeek'] = data_features.Date_Reported.dt.dayofweek
    data_features['Occurred_Year'] = data_features.Date_Occurred.dt.year
    data_features['Occurred_Month'] = data_features.Date_Occurred.dt.month
    data_features['Occurred_Day'] = data_features.Date_Occurred.dt.day
    data_features['Occurred_DayOfWeek'] = data_features.Date_Occurred.dt.dayofweek
    data_features.Time_Occurred = data_features.Time_Occurred.astype(int).astype(str).str.zfill(4)
    data_features['Occurred_Hour'] = data_features.Time_Occurred.str.slice(0, 2).astype(int)
    data_features['Occurred_Minute'] = data_features.Time_Occurred.str.slice(2, 4).astype(int)
    
    data_features.drop(columns=['Cross_Street', 'Date_Reported', 'Date_Occurred', 'Time_Occurred', 'Area_Name', 'Status_Description', 'Premise_Description', 'Weapon_Description'], inplace=True)
    
    data_features.Weapon_Used_Code = data_features.Weapon_Used_Code.astype(str)
    data_features.Weapon_Used_Code.replace('nan', np.nan, inplace=True)
    imputer_constant = SimpleImputer(strategy='constant', fill_value='Unknown')
    imputer_mode = SimpleImputer(strategy='most_frequent')
    imputer_constant_columns = ['Modus_Operandi', 'Weapon_Used_Code', 'Victim_Sex', 'Victim_Descent']
    imputer_mode_columns = ['Victim_Age']
    for current_column in imputer_constant_columns:
        data_features[[current_column]] = imputer_constant.fit_transform(data_features[[current_column]])
    for current_column in imputer_mode_columns:
        data_features[[current_column]] = imputer_mode.fit_transform(data_features[[current_column]])
    
    if multilabel_binarizer:
        data_features['Modus_Operandi'] = data_features['Modus_Operandi'].apply(lambda x: x.split())
        binarized_modus_operandi = multilabel_binarizer.transform(data_features['Modus_Operandi'])
        binarized_modus_operandi_cols = ['Modus_Operandi:' + name for name in multilabel_binarizer.classes_]
        binarized_modus_operandi_df = pd.DataFrame(binarized_modus_operandi, columns=binarized_modus_operandi_cols)
        data_features = data_features.drop(columns='Modus_Operandi').join(binarized_modus_operandi_df)

    if onehot_encoder:
        onehotencoded_data = onehot_encoder.transform(data_features[['Location', 'Area_ID', 'Reporting_District_no', 'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Weapon_Used_Code', 'Status']])
        onehotencoded_cols = onehot_encoder.get_feature_names_out()
        onehotencoded_df = pd.DataFrame(onehotencoded_data, columns=onehotencoded_cols)
        data_features = data_features.drop(columns=['Location', 'Area_ID', 'Reporting_District_no', 'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Weapon_Used_Code', 'Status']).reset_index(drop=True)
        data_features = pd.concat([data_features, onehotencoded_df], axis=1)
    
    return data_features


multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(data_features['Modus_Operandi'].fillna('Unknown').apply(lambda x: x.split()))

onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot_encoder.fit(data_features[['Location', 'Area_ID', 'Reporting_District_no', 'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Weapon_Used_Code', 'Status']])





In [12]:
data_features_transformed = data_features_preprocessor(data_features, multilabel_binarizer, onehot_encoder)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(data_features_transformed, data_labels , test_size=0.20, random_state=69)

In [14]:
# current_model = RandomForestClassifier(random_state=69) 
# current_model = GradientBoostingClassifier()
current_model = SVC()



In [15]:
current_model.fit(X_train, y_train)

In [16]:
y_pred = current_model.predict(X_val)

In [17]:
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.3f}')

Accuracy: 0.589


In [18]:

test_data_preprocessed = data_features_preprocessor(test_data, multilabel_binarizer, onehot_encoder)



In [19]:
kaggle_y_predict = current_model.predict(test_data_preprocessed)

Random forest Classifier :  
evaluation score = 91.9%   
test score = 91.6%


Gradient Boosting Classifier :  
evaluation score = 93.8%  
test score = 93.3%  


SVC Classifier :  
evaluation score =  58.9%    


