In [None]:
# Dataset for this project:
# https://drive.google.com/drive/folders/1cGIrzjs_aIfucCFGwGPLA4tXUAlR8b8H?usp=sharing

# Importing necessary libraries

In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split


from sklearn.metrics import recall_score
from imblearn.over_sampling import ADASYN
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Reading the CSV file, filling NaN values, and dropping unnecessary columns

In [None]:
# Load the labelled training set from the working directory.
data = pd.read_csv('Train.csv')

# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,MULTIPLE_OFFENSE
0,CR_102659,04-JUL-04,0,36,34,2,1,5,6,1,6,1,174,1.0,92,29,36,0
1,CR_189752,18-JUL-17,1,37,37,0,0,11,17,1,6,1,236,1.0,103,142,34,1
2,CR_184637,15-MAR-17,0,3,2,3,5,1,0,2,3,1,174,1.0,110,93,34,1
3,CR_139071,13-FEB-09,0,33,32,2,1,7,1,1,6,1,249,1.0,72,29,34,1
4,CR_109335,13-APR-05,0,33,32,2,1,8,3,0,5,1,174,0.0,112,29,43,1


In [None]:
# Count missing values per column (`isna` is the canonical name of `isnull`).
data.isna().sum()

INCIDENT_ID           0
DATE                  0
X_1                   0
X_2                   0
X_3                   0
X_4                   0
X_5                   0
X_6                   0
X_7                   0
X_8                   0
X_9                   0
X_10                  0
X_11                  0
X_12                182
X_13                  0
X_14                  0
X_15                  0
MULTIPLE_OFFENSE      0
dtype: int64

In [None]:
# Drop the columns we do not model on and impute the one column with gaps.
# Assumptions:
#   1. INCIDENT_ID and DATE are not needed to predict the target.
#   2. Missing X_12 values are filled with the column median, which is less
#      sensitive to outliers than the mean.
data = data.drop(columns=['INCIDENT_ID', 'DATE'])

# The dict form restricts the fill to X_12 only.
data = data.fillna({'X_12': data['X_12'].median()})

In [None]:
# Separate the target column; DataFrame.pop removes it from `data` in place
# and returns it, so `data` holds only features afterwards.
y = data.pop(item='MULTIPLE_OFFENSE')

In [None]:
# Hold out the validation rows BEFORE oversampling. ADASYN creates synthetic
# minority-class rows from existing rows and their neighbours, so resampling
# the full frame first would put near-copies of validation rows into the
# training set and inflate the validation score.
# `stratify=y` keeps the original class ratio in both partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    data, y, test_size=0.2, random_state=42, stratify=y
)

# Balance only the training partition; the validation partition stays
# un-augmented so its metric reflects real data.
ada = ADASYN(random_state=42)
xtrain, ytrain = ada.fit_resample(xtrain, ytrain)
print('Resampled dataset shape %s' % Counter(ytrain))

# Fitting the classifier and scoring it on the validation set

In [None]:
import lightgbm

# Wrap the splits in LightGBM Dataset objects. The validation set is created
# with `reference=train_data`, as the LightGBM docs recommend for validation
# data.
train_data = lightgbm.Dataset(xtrain, label=ytrain)
test_data = lightgbm.Dataset(xtest, label=ytest, reference=train_data)

# Booster hyper-parameters.
# - 'application' was dropped: it is an alias of 'objective', and passing both
#   only triggers an alias-conflict warning.
# - 'is_unbalance' is now a real boolean instead of the string 'true'.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

# Train for up to 5000 rounds, stopping once the validation AUC has not
# improved for 100 consecutive rounds. Early stopping is passed as a callback
# because the `early_stopping_rounds=` keyword of `lightgbm.train` was removed
# in LightGBM 4.0; the callback form also works on older releases.
model = lightgbm.train(
    parameters,
    train_data,
    num_boost_round=5000,
    valid_sets=[test_data],
    callbacks=[lightgbm.early_stopping(stopping_rounds=100)],
)

[1]	valid_0's auc: 0.982837
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.999367
[3]	valid_0's auc: 0.998959
[4]	valid_0's auc: 0.999518
[5]	valid_0's auc: 0.999675
[6]	valid_0's auc: 0.999698
[7]	valid_0's auc: 0.999765
[8]	valid_0's auc: 0.999872
[9]	valid_0's auc: 0.999844
[10]	valid_0's auc: 0.999831
[11]	valid_0's auc: 0.999852
[12]	valid_0's auc: 0.999834
[13]	valid_0's auc: 0.999811
[14]	valid_0's auc: 0.999832
[15]	valid_0's auc: 0.999814
[16]	valid_0's auc: 0.999831
[17]	valid_0's auc: 0.999825
[18]	valid_0's auc: 0.99985
[19]	valid_0's auc: 0.999862
[20]	valid_0's auc: 0.999878
[21]	valid_0's auc: 0.999873
[22]	valid_0's auc: 0.999873
[23]	valid_0's auc: 0.999875
[24]	valid_0's auc: 0.999874
[25]	valid_0's auc: 0.999874
[26]	valid_0's auc: 0.999875
[27]	valid_0's auc: 0.999878
[28]	valid_0's auc: 0.999878
[29]	valid_0's auc: 0.999885
[30]	valid_0's auc: 0.999878
[31]	valid_0's auc: 0.999891
[32]	valid_0's auc: 0.999895
[33]	valid_0's auc: 

[302]	valid_0's auc: 0.999997
[303]	valid_0's auc: 0.999997
[304]	valid_0's auc: 0.999997
[305]	valid_0's auc: 0.999997
[306]	valid_0's auc: 0.999997
[307]	valid_0's auc: 0.999997
[308]	valid_0's auc: 0.999997
[309]	valid_0's auc: 0.999997
[310]	valid_0's auc: 0.999997
[311]	valid_0's auc: 0.999997
[312]	valid_0's auc: 0.999998
[313]	valid_0's auc: 0.999997
[314]	valid_0's auc: 0.999998
[315]	valid_0's auc: 0.999998
[316]	valid_0's auc: 0.999998
[317]	valid_0's auc: 0.999997
[318]	valid_0's auc: 0.999997
[319]	valid_0's auc: 0.999997
[320]	valid_0's auc: 0.999997
[321]	valid_0's auc: 0.999998
[322]	valid_0's auc: 0.999998
[323]	valid_0's auc: 0.999998
[324]	valid_0's auc: 0.999998
[325]	valid_0's auc: 0.999998
[326]	valid_0's auc: 0.999998
[327]	valid_0's auc: 0.999998
[328]	valid_0's auc: 0.999998
[329]	valid_0's auc: 0.999998
[330]	valid_0's auc: 0.999998
[331]	valid_0's auc: 0.999998
[332]	valid_0's auc: 0.999998
[333]	valid_0's auc: 0.999998
[334]	valid_0's auc: 0.999998
[335]	vali

# Reading the test data and predicting on it

In [None]:
# Load the unlabelled test set that the final predictions are made for.
test = pd.read_csv('Test.csv')

In [None]:
# Count missing values per column in the test frame.
test.isna().sum()

INCIDENT_ID      0
DATE             0
X_1              0
X_2              0
X_3              0
X_4              0
X_5              0
X_6              0
X_7              0
X_8              0
X_9              0
X_10             0
X_11             0
X_12           127
X_13             0
X_14             0
X_15             0
dtype: int64

In [None]:
# Mirror the training preprocessing: drop the identifier and date columns.
test = test.drop(columns=['INCIDENT_ID', 'DATE'])

# Impute missing X_12 with the median from the training-side frame `data`
# rather than from the test file itself, so train and test rows are filled
# with the same statistic.
# NOTE(review): if `data` was oversampled earlier in the notebook, this median
# includes synthetic rows — confirm that is acceptable.
test['X_12'] = test['X_12'].fillna(data['X_12'].median())

In [None]:
# Score every test row with the trained booster; the raw scores are turned
# into 0/1 labels in the following step.
ypredictions = model.predict(data=test)

In [None]:
# Turn the predicted scores into hard class labels in one vectorised step:
# scores strictly above 0.5 become 1, everything else 0. The result is an
# integer array, matching the integer MULTIPLE_OFFENSE label in the training
# data (the previous element-wise loop left the labels as floats 0.0/1.0).
ypredictions = np.where(np.asarray(ypredictions) > 0.5, 1, 0)
    


In [None]:
# Re-read the raw test file to recover INCIDENT_ID, which was dropped from
# `test` before prediction. The name matches the earlier 'Test.csv' read
# exactly; the previous lower-case 'test.csv' fails on case-sensitive
# filesystems.
sub = pd.read_csv('Test.csv')

# Predictions are attached positionally, relying on the file being read in the
# same row order as before.
sub['MULTIPLE_OFFENSE'] = ypredictions

# Keep only the id and the predicted label, using the id as the index so it
# is written as the first CSV column.
SUB = sub[['INCIDENT_ID', 'MULTIPLE_OFFENSE']].set_index('INCIDENT_ID', drop=True)

# to_csv returns None when given a path, so the result is not captured.
SUB.to_csv('sol.csv')