In [1]:
#Reading the data
import numpy as np 
import pandas as pd
train = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")

In [2]:
train.shape, test.shape

((20000, 22), (5000, 21))

In [3]:
train.columns

Index(['Location', 'Cross_Street', 'Latitude', 'Longitude', 'Date_Reported',
       'Date_Occurred', 'Time_Occurred', 'Area_ID', 'Area_Name',
       'Reporting_District_no', 'Part 1-2', 'Modus_Operandi', 'Victim_Age',
       'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Premise_Description',
       'Weapon_Used_Code', 'Weapon_Description', 'Status',
       'Status_Description', 'Crime_Category'],
      dtype='object')

In [4]:
train['Crime_Category'].value_counts()

Property Crimes                  11666
Violent Crimes                    4767
Crimes against Public Order       1808
Fraud and White-Collar Crimes     1355
Crimes against Persons             225
Other Crimes                       179
Name: Crime_Category, dtype: int64

In [5]:
replacement_dict = {'Property Crimes': 0,
                    'Violent Crimes': 1,
                    'Crimes against Public Order': 2,
                    'Fraud and White-Collar Crimes': 3,
                    'Crimes against Persons': 4,  # Add placeholders for missing values (if needed)
                    'Other Crimes': 5} 
train['Crime_Category'] = train['Crime_Category'].replace(replacement_dict)


In [6]:
# Convert the date columns to datetime format
train['Date_Reported'] = pd.to_datetime(train['Date_Reported'])
train['Date_Occurred'] = pd.to_datetime(train['Date_Occurred'])

# Calculate the difference in minutes
train['time_between_date_occured_and_reported'] = (train['Date_Reported'] - train['Date_Occurred']) / pd.Timedelta(minutes=1)

# Get the absolute difference in minutes
train['time_between_date_occured_and_reported'] = train['time_between_date_occured_and_reported'].abs().astype(int)

# Print the dataframe
print(train)

                                       Location  \
0       4500    CARPENTER                    AV   
1               45TH                         ST   
2        600 E  MARTIN LUTHER KING JR        BL   
3      14900    ORO GRANDE                   ST   
4       7100 S  VERMONT                      AV   
...                                         ...   
19995   5100 W  ADAMS                        BL   
19996  16900    ROSCOE                       BL   
19997   1000 S  SHENANDOAH                   ST   
19998    300 W  SEPULVEDA                    ST   
19999           DALTON                       AV   

                          Cross_Street  Latitude  Longitude Date_Reported  \
0                                  NaN   34.1522  -118.3910    2020-03-09   
1      ALAMEDA                      ST   34.0028  -118.2391    2020-02-27   
2                                  NaN   34.0111  -118.2653    2020-08-21   
3                                  NaN   34.2953  -118.4590    2020-11-08   
4 

In [7]:
# Convert the date columns to datetime format
test['Date_Reported'] = pd.to_datetime(test['Date_Reported'])
test['Date_Occurred'] = pd.to_datetime(test['Date_Occurred'])

# Calculate the difference in minutes
test['time_between_date_occured_and_reported'] = (test['Date_Reported'] - test['Date_Occurred']) / pd.Timedelta(minutes=1)

# Get the absolute difference in minutes
test['time_between_date_occured_and_reported'] = test['time_between_date_occured_and_reported'].abs().astype(int)

# Print the dataframe
print(test)

                                      Location  \
0      1500    LEIGHTON                     AV   
1       100 S  NORMANDIE                    AV   
2       300 E  111TH                        ST   
3      1300 S  LA BREA                      AV   
4     11000    MORRISON                     ST   
...                                        ...   
4995   4600    MASCOT                       ST   
4996   2200 E  7TH                          ST   
4997           LANGDON                      AV   
4998    400 E  5TH                          ST   
4999  10100 S  SAN PEDRO                    ST   

                         Cross_Street  Latitude  Longitude Date_Reported  \
0                                 NaN   34.0128  -118.3045    2020-03-03   
1                                 NaN   34.0726  -118.3029    2020-06-01   
2                                 NaN   33.9348  -118.2695    2020-08-28   
3                                 NaN   34.0497  -118.3442    2020-12-23   
4                  

In [8]:
X_train = train.drop(['Location', 'Date_Reported',
       'Date_Occurred', 'Area_Name',
       'Modus_Operandi',
        'Premise_Description',
       'Weapon_Description', 'Status_Description','Crime_Category','Cross_Street'], axis=1)
X_test = test.drop(['Location', 'Date_Reported',
       'Date_Occurred', 'Area_Name',
       'Modus_Operandi',
        'Premise_Description',
       'Weapon_Description', 'Status_Description','Cross_Street'], axis=1)
Y_train = train[['Crime_Category']]

In [9]:
train['Victim_Descent'].value_counts()

H    6143
W    4245
B    2881
X    1849
O    1603
A     427
K      83
C      51
F      48
J      16
I      11
V       8
P       4
Z       3
U       2
D       1
G       1
Name: Victim_Descent, dtype: int64

In [10]:
test['Victim_Descent'].value_counts()

H    1536
W    1088
B     711
X     440
O     405
A     132
K      19
F      10
C       4
V       4
J       3
Z       1
I       1
G       1
P       1
S       1
Name: Victim_Descent, dtype: int64

In [11]:
set(train['Victim_Descent'].unique())

{'A',
 'B',
 'C',
 'D',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'O',
 'P',
 'U',
 'V',
 'W',
 'X',
 'Z',
 nan}

In [12]:
set(test['Victim_Descent'].unique())

{'A',
 'B',
 'C',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'O',
 'P',
 'S',
 'V',
 'W',
 'X',
 'Z',
 nan}

In [13]:
set(test['Victim_Descent'].unique()) - set(train['Victim_Descent'].unique())

{'S'}

In [14]:
new_categories = set(test['Victim_Descent'].unique()) - set(train['Victim_Descent'].unique())

In [15]:
X_test['Victim_Descent'] = X_test['Victim_Descent'].replace(to_replace=new_categories, value=np.nan)

In [16]:
set(X_test['Victim_Descent'].unique()) - set(X_train['Victim_Descent'].unique())

set()

In [17]:
set(X_test['Victim_Descent'].unique())

{'A',
 'B',
 'C',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'O',
 'P',
 'V',
 'W',
 'X',
 'Z',
 nan}

In [18]:
'''new_categories = set(test['Cross_Street'].unique()) - set(train['Cross_Street'].unique())
X_test['Cross_Street'] = X_test['Cross_Street'].replace(to_replace=new_categories, value=np.nan)'''

"new_categories = set(test['Cross_Street'].unique()) - set(train['Cross_Street'].unique())\nX_test['Cross_Street'] = X_test['Cross_Street'].replace(to_replace=new_categories, value=np.nan)"

In [19]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

'''label_cross_street  = LabelEncoder()
X_train['Cross_Street'].fillna("missing")
X_test['Cross_Street'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Cross_Street'] = label_cross_street.fit_transform(X_train['Cross_Street'])
X_test['Cross_Street'] = label_cross_street.transform(X_test['Cross_Street'])'''

label_status  = LabelEncoder()
X_train['Status'].fillna("missing")
X_test['Status'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Status'] = label_status.fit_transform(X_train['Status'])
X_test['Status'] = label_status.transform(X_test['Status'])

label_victim_descent  = LabelEncoder()
X_train['Victim_Descent'].fillna("missing")
X_test['Victim_Descent'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Victim_Descent'] = label_victim_descent.fit_transform(X_train['Victim_Descent'])
X_test['Victim_Descent'] = label_victim_descent.transform(X_test['Victim_Descent'])

label_victim_sex  = LabelEncoder()
X_train['Victim_Sex'].fillna("missing")
X_test['Victim_Sex'].fillna("missing")
label_cross_street  = LabelEncoder()
X_train['Victim_Sex'] = label_status.fit_transform(X_train['Victim_Sex'])
X_test['Victim_Sex'] = label_status.transform(X_test['Victim_Sex'])

In [20]:
from sklearn.impute import SimpleImputer
value = 99999
# Specify the strategy as 'constant' and the fill_value as your desired static value
imputer = SimpleImputer(strategy='constant', fill_value=value)


In [21]:
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_test)

In [22]:
import xgboost as xg
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import Binarizer
#### Calculating Sample Weights
n_samples, n_labels = Y_train.shape
from collections import Counter
# Calculate sample weights for each label column
sample_weights = []

for label_idx,name in zip(range(n_labels), Y_train.columns):
    
    label_col = Y_train.values[:, label_idx]
    print(name, end = ' ')
    
    
    class_counts = Counter(label_col)
    print(class_counts, end = '  ')
    
    total_instances = sum(class_counts.values())
    
    class_weights = {cls: total_instances / (count * len(class_counts)) for cls, count in class_counts.items()}
    print(class_weights)
    
    label_weights = [class_weights[cls] for cls in label_col]
    sample_weights.append(label_weights)
    
sum(label_weights)/len(label_weights)
np.array(sample_weights).T.shape
# Combine the sample weights for all label columns
sample_weights = np.array(sample_weights).T
sample_weights = np.mean(sample_weights, axis=1)
sample_weights.shape

Crime_Category Counter({0: 11666, 1: 4767, 2: 1808, 3: 1355, 4: 225, 5: 179})  {0: 0.2857306131778959, 1: 0.6992518005733864, 5: 18.6219739292365, 2: 1.8436578171091444, 3: 2.4600246002460024, 4: 14.814814814814815}


(20000,)

In [23]:
# Convert Crime_Category to category type
Y_train['Crime_Category'] = Y_train['Crime_Category'].astype('category')
# Encode the categorical column to numeric values
Y_train['Crime_Category'] = Y_train['Crime_Category'].cat.codes
print(Y_train.dtypes)

Crime_Category    int8
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_train['Crime_Category'] = Y_train['Crime_Category'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_train['Crime_Category'] = Y_train['Crime_Category'].cat.codes


In [24]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [25]:
import xgboost as xgb 
from sklearn.multiclass import OneVsOneClassifier, OneVsOneClassifier

xgb_model = xgb.XGBClassifier(max_depth=7, n_estimators=200, min_child_weight=3, gamma=0.5, subsample=1.0, colsample_bytree=1.0, scale_pos_weight=1, learning_rate=0.1,objective='multi:softprob')
xgb_model.fit(X_train, Y_train)

Parameters: { "scale_pos_weight" } are not used.



In [None]:
preds = xgb_model.predict(X_test)


In [None]:
preds

In [None]:
preds = pd.DataFrame(preds, columns=['Crime_Category'])

In [None]:

replacement_dict = {0:  'Property Crimes',
                    1 : 'Violent Crimes',
                    2 : 'Crimes against Public Order',
                    3 : 'Fraud and White-Collar Crimes',
                    4 : 'Crimes against Persons',  # Add placeholders for missing values (if needed)
                    5 : 'Other Crimes'} 
preds['Replaced_Crime_Category'] = preds['Crime_Category'].replace(replacement_dict)



In [None]:
preds

In [None]:
#preds are my final output.
submission = pd.DataFrame({'ID':range(1,5001), 'Crime_Category': preds['Replaced_Crime_Category']})
submission.to_csv("DATA/submission.csv", index=False)

In [None]:
submission