In [43]:
import numpy as np 
import pandas as pd
train = pd.read_csv("Data/train.csv")

In [44]:
train.columns

Index(['Location', 'Cross_Street', 'Latitude', 'Longitude', 'Date_Reported',
       'Date_Occurred', 'Time_Occurred', 'Area_ID', 'Area_Name',
       'Reporting_District_no', 'Part 1-2', 'Modus_Operandi', 'Victim_Age',
       'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Premise_Description',
       'Weapon_Used_Code', 'Weapon_Description', 'Status',
       'Status_Description', 'Crime_Category'],
      dtype='object')

In [45]:
target = train[['Crime_Category']]
train = train.drop(['Crime_Category'], axis=1)

In [46]:
replacement_dict = {'Property Crimes': 0,
                    'Violent Crimes': 1,
                    'Crimes against Public Order': 2,
                    'Fraud and White-Collar Crimes': 3,
                    'Crimes against Persons': 4,  # Add placeholders for missing values (if needed)
                    'Other Crimes': 5}  # Add placeholders for missing values (if needed)

def replace_values(data, replacement_dict):
  """
  Replaces values in a Series or DataFrame based on a replacement dictionary.
  """
  return np.where(data.isin(replacement_dict.keys()), data.replace(replacement_dict), data)

target = replace_values(target.copy(), replacement_dict)  # Copy to avoid modifying original data
target = pd.DataFrame(target, columns=['Crime_Category'])

In [47]:
# Convert the date columns to datetime format
train['Date_Reported'] = pd.to_datetime(train['Date_Reported'])
train['Date_Occurred'] = pd.to_datetime(train['Date_Occurred'])

# Calculate the difference in minutes
train['time_between_date_occured_and_reported'] = (train['Date_Reported'] - train['Date_Occurred']) / pd.Timedelta(minutes=1)

# Get the absolute difference in minutes
train['time_between_date_occured_and_reported'] = train['time_between_date_occured_and_reported'].abs().astype(int)

# Print the dataframe
print(train)

                                       Location  \
0       4500    CARPENTER                    AV   
1               45TH                         ST   
2        600 E  MARTIN LUTHER KING JR        BL   
3      14900    ORO GRANDE                   ST   
4       7100 S  VERMONT                      AV   
...                                         ...   
19995   5100 W  ADAMS                        BL   
19996  16900    ROSCOE                       BL   
19997   1000 S  SHENANDOAH                   ST   
19998    300 W  SEPULVEDA                    ST   
19999           DALTON                       AV   

                          Cross_Street  Latitude  Longitude Date_Reported  \
0                                  NaN   34.1522  -118.3910    2020-03-09   
1      ALAMEDA                      ST   34.0028  -118.2391    2020-02-27   
2                                  NaN   34.0111  -118.2653    2020-08-21   
3                                  NaN   34.2953  -118.4590    2020-11-08   
4 

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.3, random_state=42, stratify=target)

In [49]:
print("Train set size:", X_train.shape)
print("Train labels set size:", Y_train.shape)
print("Test set size:", X_val.shape)
print("Test labels set size:", Y_val.shape)

Train set size: (14000, 22)
Train labels set size: (14000, 1)
Test set size: (6000, 22)
Test labels set size: (6000, 1)


In [50]:
X_train = X_train.drop(['Location', 'Date_Reported',
       'Date_Occurred', 'Area_Name',
       'Modus_Operandi',
        'Premise_Description',
       'Weapon_Description', 'Status_Description','Cross_Street'], axis=1)
X_val = X_val.drop(['Location', 'Date_Reported',
       'Date_Occurred', 'Area_Name',
       'Modus_Operandi',
        'Premise_Description',
       'Weapon_Description', 'Status_Description','Cross_Street'], axis=1)


In [51]:
print("Train set size:", X_train.shape)
print("Train labels set size:", Y_train.shape)
print("Test set size:", X_val.shape)
print("Test labels set size:", Y_val.shape)

Train set size: (14000, 13)
Train labels set size: (14000, 1)
Test set size: (6000, 13)
Test labels set size: (6000, 1)


In [52]:
cols = X_train.columns

In [53]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

'''label_cross_street  = LabelEncoder()
X_train['Cross_Street'].fillna("missing")
X_val['Cross_Street'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Cross_Street'] = label_cross_street.fit_transform(X_train['Cross_Street'])
X_val['Cross_Street'] = label_cross_street.transform(X_val['Cross_Street'])'''

label_status  = LabelEncoder()
X_train['Status'].fillna("missing")
X_val['Status'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Status'] = label_status.fit_transform(X_train['Status'])
X_val['Status'] = label_status.transform(X_val['Status'])

label_victim_descent  = LabelEncoder()
X_train['Victim_Descent'].fillna("missing")
X_val['Victim_Descent'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Victim_Descent'] = label_victim_descent.fit_transform(X_train['Victim_Descent'])
X_val['Victim_Descent'] = label_victim_descent.transform(X_val['Victim_Descent'])

label_victim_sex  = LabelEncoder()
X_train['Victim_Sex'].fillna("missing")
X_val['Victim_Sex'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Victim_Sex'] = label_status.fit_transform(X_train['Victim_Sex'])
X_val['Victim_Sex'] = label_status.transform(X_val['Victim_Sex'])

In [54]:
X_train

Unnamed: 0,Latitude,Longitude,Time_Occurred,Area_ID,Reporting_District_no,Part 1-2,Victim_Age,Victim_Sex,Victim_Descent,Premise_Code,Weapon_Used_Code,Status,time_between_date_occured_and_reported
18760,33.9737,-118.2842,1545.0,12.0,1256.0,2.0,41.0,0,6,104.0,400.0,2,0
17978,34.0527,-118.3066,1915.0,20.0,2053.0,1.0,54.0,0,6,501.0,106.0,2,0
15431,34.2409,-118.5361,15.0,17.0,1767.0,1.0,36.0,2,6,707.0,,2,0
712,34.2661,-118.5132,1000.0,17.0,1735.0,1.0,28.0,0,14,501.0,,2,21600
9866,34.0216,-118.2569,915.0,13.0,1323.0,2.0,29.0,0,6,102.0,400.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,33.7333,-118.2906,1240.0,5.0,563.0,2.0,75.0,2,6,510.0,302.0,1,0
19110,33.9940,-118.4799,1800.0,14.0,1412.0,2.0,50.0,0,14,102.0,500.0,2,4320
10425,34.0754,-118.2595,2000.0,2.0,218.0,1.0,27.0,0,6,108.0,,2,2880
5369,34.1576,-118.4389,1649.0,9.0,964.0,1.0,34.0,2,10,405.0,,2,44640


In [55]:
#printing the column names with type object
object_columns = X_train.select_dtypes(include='object').columns
print("Columns with type 'object':", object_columns)

Columns with type 'object': Index([], dtype='object')


In [56]:
'''from sklearn.impute import KNNImputer
imputer = KNNImputer()
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)'''

'from sklearn.impute import KNNImputer\nimputer = KNNImputer()\nX_train = imputer.fit_transform(X_train)\nX_val = imputer.transform(X_val)'

In [57]:
#X_train = pd.DataFrame(X_train, columns=cols)
#X_val = pd.DataFrame(X_val, columns=cols)

In [58]:
from sklearn.impute import SimpleImputer
value = 99999
# Specify the strategy as 'constant' and the fill_value as your desired static value
imputer = SimpleImputer(strategy='constant', fill_value=value)


In [59]:
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

In [60]:
import xgboost as xg
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import Binarizer
#### Calculating Sample Weights
n_samples, n_labels = Y_train.shape
from collections import Counter
# Calculate sample weights for each label column
sample_weights = []

for label_idx,name in zip(range(n_labels), Y_train.columns):
    
    label_col = Y_train.values[:, label_idx]
    print(name, end = ' ')
    
    
    class_counts = Counter(label_col)
    print(class_counts, end = '  ')
    
    total_instances = sum(class_counts.values())
    
    class_weights = {cls: total_instances / (count * len(class_counts)) for cls, count in class_counts.items()}
    print(class_weights)
    
    label_weights = [class_weights[cls] for cls in label_col]
    sample_weights.append(label_weights)
    
sum(label_weights)/len(label_weights)
np.array(sample_weights).T.shape
# Combine the sample weights for all label columns
sample_weights = np.array(sample_weights).T
sample_weights = np.mean(sample_weights, axis=1)
sample_weights.shape

Crime_Category Counter({0: 8166, 1: 3337, 2: 1266, 3: 949, 4: 157, 5: 125})  {1: 0.6992308460693237, 2: 1.8430753027909426, 0: 0.2857376112335701, 3: 2.4587284861257466, 4: 14.8619957537155, 5: 18.666666666666668}


(14000,)

In [61]:
Y_train.value_counts()

Crime_Category
0                 8166
1                 3337
2                 1266
3                  949
4                  157
5                  125
dtype: int64

In [62]:
X_train

array([[ 3.397370e+01, -1.182842e+02,  1.545000e+03, ...,  4.000000e+02,
         2.000000e+00,  0.000000e+00],
       [ 3.405270e+01, -1.183066e+02,  1.915000e+03, ...,  1.060000e+02,
         2.000000e+00,  0.000000e+00],
       [ 3.424090e+01, -1.185361e+02,  1.500000e+01, ...,  9.999900e+04,
         2.000000e+00,  0.000000e+00],
       ...,
       [ 3.407540e+01, -1.182595e+02,  2.000000e+03, ...,  9.999900e+04,
         2.000000e+00,  2.880000e+03],
       [ 3.415760e+01, -1.184389e+02,  1.649000e+03, ...,  9.999900e+04,
         2.000000e+00,  4.464000e+04],
       [ 3.403970e+01, -1.182265e+02,  4.250000e+02, ...,  9.999900e+04,
         2.000000e+00,  0.000000e+00]])

In [63]:
#Y_train = Y_train.reset_index()

In [64]:
Y_train

Unnamed: 0,Crime_Category
18760,1
17978,2
15431,0
712,0
9866,1
...,...
8670,1
19110,1
10425,0
5369,0


In [65]:
X_train.shape, Y_train.shape

((14000, 13), (14000, 1))

In [66]:
Y_train.value_counts()

Crime_Category
0                 8166
1                 3337
2                 1266
3                  949
4                  157
5                  125
dtype: int64

In [67]:
# Convert Crime_Category to category type
Y_train['Crime_Category'] = Y_train['Crime_Category'].astype('category')
# Encode the categorical column to numeric values
Y_train['Crime_Category'] = Y_train['Crime_Category'].cat.codes
print(Y_train.dtypes)

Crime_Category    int8
dtype: object


In [68]:
# Convert Crime_Category to category type
Y_val['Crime_Category'] = Y_val['Crime_Category'].astype('category')
# Encode the categorical column to numeric values
Y_val['Crime_Category'] = Y_val['Crime_Category'].cat.codes
print(Y_train.dtypes)

Crime_Category    int8
dtype: object


In [69]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [70]:
import xgboost as xgb 
from sklearn.multiclass import OneVsOneClassifier, OneVsOneClassifier

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, Y_train, sample_weight=sample_weights)

In [71]:
from sklearn.metrics import classification_report
preds = xgb_model.predict(X_val)
print(classification_report(preds, Y_val))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      3107
           1       0.92      0.80      0.86      1635
           2       0.72      0.61      0.66       641
           3       0.87      0.69      0.77       507
           4       0.54      0.64      0.59        58
           5       0.17      0.17      0.17        52

    accuracy                           0.85      6000
   macro avg       0.68      0.65      0.66      6000
weighted avg       0.85      0.85      0.85      6000



In [72]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
from sklearn.metrics import classification_report
preds = lr.predict(X_val)
print(classification_report(preds, Y_val))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      3598
           1       0.89      0.69      0.78      1847
           2       0.13      0.68      0.21       102
           3       0.65      0.58      0.61       452
           4       0.01      1.00      0.03         1
           5       0.00      0.00      0.00         0

    accuracy                           0.79      6000
   macro avg       0.43      0.64      0.42      6000
weighted avg       0.86      0.79      0.82      6000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

base_classifier = xgb.XGBClassifier(max_depth=7, n_estimators=443, min_child_weight=0.0001208809225995048, gamma=4.220736409743197e-06, subsample=0.9345462189096938, colsample_bytree=0.8488892338575499, reg_alpha= 1.0149725817550121e-08, reg_lambda=0.00739996148194444)

# Wrap the base classifier with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(base_classifier)

# Train the OvR classifier
ovr_classifier.fit(X_train, Y_train)

# Make predictions on the test set
y_pred = ovr_classifier.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(Y_val, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:\n", classification_report(Y_val, y_pred))

Accuracy: 0.857
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.91      0.91      3500
           1       0.83      0.90      0.86      1430
           2       0.66      0.60      0.63       542
           3       0.75      0.77      0.76       406
           4       0.77      0.44      0.56        68
           5       0.18      0.04      0.06        54

    accuracy                           0.86      6000
   macro avg       0.68      0.61      0.63      6000
weighted avg       0.85      0.86      0.85      6000



In [75]:
'''from sklearn.model_selection import GridSearchCV
import xgboost as xgb
xgb = xgb.XGBClassifier()
# Define parameters
parameters_xg = {
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'max_depth': [3, 5, 7],
    'gamma': [0, 0.1, 0.5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, 2, 3]
}
from sklearn.metrics import make_scorer, f1_score
scorer = make_scorer(f1_score, average='weighted')

grid_search = GridSearchCV(xgb, parameters_xg, cv=10, scoring=scorer, verbose=True, n_jobs=-1)
grid_search.fit(X_train,Y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)'''



"from sklearn.model_selection import GridSearchCV\nimport xgboost as xgb\nxgb = xgb.XGBClassifier()\n# Define parameters\nparameters_xg = {\n    'learning_rate': [0.1, 0.01, 0.05],\n    'n_estimators': [100, 200, 300],\n    'min_child_weight': [1, 3, 5],\n    'max_depth': [3, 5, 7],\n    'gamma': [0, 0.1, 0.5],\n    'subsample': [0.8, 1.0],\n    'colsample_bytree': [0.8, 1.0],\n    'scale_pos_weight': [1, 2, 3]\n}\nfrom sklearn.metrics import make_scorer, f1_score\nscorer = make_scorer(f1_score, average='weighted')\n\ngrid_search = GridSearchCV(xgb, parameters_xg, cv=10, scoring=scorer, verbose=True, n_jobs=-1)\ngrid_search.fit(X_train,Y_train)\nprint(grid_search.best_params_)\nprint(grid_search.best_score_)\nprint(grid_search.best_estimator_)"

{'colsample_bytree': 1.0, 'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 1.0}


In [76]:
import xgboost as xgb 
from sklearn.multiclass import OneVsOneClassifier, OneVsOneClassifier

xgb_model = xgb.XGBClassifier(max_depth=7, n_estimators=200, min_child_weight=3, gamma=0.5, subsample=1.0, colsample_bytree=1.0, scale_pos_weight=1, learning_rate=0.1,objective='multi:softprob')
xgb_model.fit(X_train, Y_train)

Parameters: { "scale_pos_weight" } are not used.



In [77]:
from sklearn.metrics import classification_report
preds = xgb_model.predict(X_val)
print(classification_report(preds, Y_val))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      3430
           1       0.92      0.82      0.87      1606
           2       0.64      0.70      0.67       490
           3       0.82      0.78      0.80       423
           4       0.46      0.72      0.56        43
           5       0.06      0.38      0.10         8

    accuracy                           0.87      6000
   macro avg       0.63      0.72      0.65      6000
weighted avg       0.88      0.87      0.87      6000



In [78]:
import pickle
with open('/home/jampanasasank/Documents/crime-cast-forecasting-crime-categories/crime-cast-forecasting-crime-categories/xgboost_trial1.pkl', 'rb') as file:
  # Load the model from the file
  model = pickle.load(file)


In [79]:
from sklearn.metrics import classification_report
preds = model.predict(X_val)
print(classification_report(preds, Y_val))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      3430
           1       0.92      0.82      0.87      1606
           2       0.64      0.70      0.67       490
           3       0.82      0.78      0.80       423
           4       0.46      0.72      0.56        43
           5       0.06      0.38      0.10         8

    accuracy                           0.87      6000
   macro avg       0.63      0.72      0.65      6000
weighted avg       0.88      0.87      0.87      6000



We need to maximize the macro-f1 avg  

In [80]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
xgb = xgb.XGBClassifier()
# Define parameters
parameters_xg = {
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'max_depth': [3, 5, 7],
    'gamma': [0, 0.1, 0.5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, 2, 3]
}
from sklearn.metrics import make_scorer, f1_score
scorer = make_scorer(f1_score, average='macro')

grid_search = GridSearchCV(xgb, parameters_xg, cv=10, scoring=scorer, verbose=True, n_jobs=-1)
grid_search.fit(X_train,Y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)



Fitting 10 folds for each of 2916 candidates, totalling 29160 fits


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

