In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#In this competition we will analyse a dataset of crime incident records from the city of San Francisco spanning 12 years.
#So let's get started!
#First we will import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
#Load the train and test datasets.
#'Dates' is parsed into datetimes in both files; the test file's 'Id' column becomes its index.
date_columns=['Dates']
train=pd.read_csv('../input/sfcrime/train.csv/train.csv',parse_dates=date_columns)
test=pd.read_csv('../input/sfcrime/test.csv/test.csv',index_col=['Id'],parse_dates=date_columns)

In [None]:
#Let's check the dimensions of the input datasets to get an idea of their size.
#The bare expression is displayed by the notebook as a (rows, columns) tuple.
train.shape

In [None]:
#Dimensions of the test dataset, displayed as a (rows, columns) tuple.
test.shape

In [None]:
#Preview the first 10 rows of the train dataset.
train.head(10)

In [None]:
#Preview the first 10 rows of the test dataset (indexed by 'Id').
test.head(10)

In [None]:
#Checking the datatypes of each column in the train dataset.
#'Dates' was parsed as a datetime when the file was loaded (parse_dates=['Dates']).
train.dtypes

In [None]:
#Generic cleaning of the dataset starts here: nulls, duplicated rows and outliers.

#True if at least one cell anywhere in the train dataset is null
any_null=train.isnull().values.any()
print(any_null)
#Count rows that repeat an earlier row (True) versus first occurrences (False)
dup_flags=train.duplicated(keep='first')
print(dup_flags.value_counts())

In [None]:
#Removing duplicated rows from the train dataset.
#inplace=True mutates `train` itself; the remaining rows keep their original index labels.
train.drop_duplicates(inplace=True)
#Row/column count after de-duplication
print(train.shape)

In [None]:
#X and Y seem to hold the longitude and latitude of the crime scene respectively;
#print their summary statistics for the train dataset first, then for the test dataset.
coord_cols=['X','Y']
for frame in (train,test):
    print(frame[coord_cols].describe())

In [None]:
#Something weird! The X column seems fine, but the Y column (latitude) has a max value of 90 degrees (North Pole!)
#These are clearly outliers and need to be dealt with.
#Let's check how many such rows are present.
#Y > 38 is used as the outlier cutoff because, per the author's reading of the summary
#statistics, Y is still 37.78 (< 38) at the 75th percentile of its distribution.
lat_cutoff=38
for frame in (train,test):
    too_far_north=frame['Y']>lat_cutoff
    print(frame.loc[too_far_north,['PdDistrict','X','Y']])

In [None]:
#Rather than dropping the outlier rows (and losing their other columns), blank out the bad
#coordinates and fill them with the mean X,Y of the corresponding police district.

#Column-specific sentinel values: -120.5 is replaced only in X, 90.0 only in Y
outlier_values={'X':-120.5,'Y':90.0}
for frame in (train,test):
    frame.replace(outlier_values,np.nan,inplace=True)

simp=SimpleImputer(strategy='mean')
coord_cols=['X','Y']

for dist in train['PdDistrict'].unique():
    in_dist_train=train['PdDistrict']==dist
    in_dist_test=test['PdDistrict']==dist
    #Means are learned from the train rows of this district only, then reused for the test rows
    train.loc[in_dist_train,coord_cols]=simp.fit_transform(train.loc[in_dist_train,coord_cols])
    test.loc[in_dist_test,coord_cols]=simp.transform(test.loc[in_dist_test,coord_cols])

In [None]:
#Let's have a look at the timespan of this dataset.
#Summary statistics of the parsed 'Dates' column of the train dataset.
train['Dates'].describe()

In [None]:
#Checking the description of important categorical variables like Category, PdDistrict etc.
#will give us some useful insights related to the data.
#include='object' restricts the summary to the object-dtype (string) columns.
train.describe(include='object')

In [None]:
#The previous table gave us the top category of crime, the day of the week with the most crimes,
#the district with the most crimes and more.
#Let's analyze them one by one.

#First we will analyze crime by district with the help of a bar plot.
distcrime=train['PdDistrict'].value_counts().sort_values(ascending=False)
#Share of all incidents recorded in each district, in percent
distcrime_pct=distcrime.values/(distcrime.values.sum())*100.0

plt.figure(figsize=(12,12))
with sns.axes_style('darkgrid'):
    #x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
    ax=sns.barplot(x=distcrime_pct,y=distcrime.index,orient='h',palette='Blues_r')

plt.title('Crime percentage by District')
plt.xlabel('Crime Percentage')
plt.ylabel('District')
plt.show()

In [None]:
#The previous plot clearly shows that Southern district has the worst record in overall crime incidents.
#Now let's analyze crime by category.
catcrime=train['Category'].value_counts().sort_values(ascending=False)
#Share of all incidents falling into each category, in percent
catcrime_pct=catcrime.values/(catcrime.values.sum())*100.0

plt.figure(figsize=(12,12))
with sns.axes_style('darkgrid'):
    #x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
    ax=sns.barplot(x=catcrime_pct,y=catcrime.index,orient='h',palette='Greens_r')

plt.xlabel('Crime Percentage')
plt.ylabel('Category')
plt.title('Crime distribution by category')
plt.show()

In [None]:
#The previous plot clearly shows that Larceny/theft is the most common category of crime.
#Now let's see which district records the most cases of theft.

#A boolean mask selects the theft rows directly, without grouping the whole frame first
df=train.loc[train['Category']=='LARCENY/THEFT']
#value_counts already orders districts by descending count, so no prior sort is needed
theft=df['PdDistrict'].value_counts()
#Share of all thefts recorded in each district, in percent
theft_pct=theft.values/(theft.values.sum())*100.0

plt.figure(figsize=(12,12))
with sns.axes_style('darkgrid'):
    #x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
    ax=sns.barplot(x=theft_pct,y=theft.index,orient='h',palette='Reds_r')

plt.xlabel('Larceny/Theft Percentage')
plt.ylabel('District')
plt.title('Theft percentage by district')
plt.show()

In [None]:
#Hence Southern is the district to look out for if you don't want your pockets picked :P
#Now, finding crimes sorted by days of the week...

crimeday=train['DayOfWeek'].value_counts()
#Share of all incidents that happened on each weekday, in percent
crimeday_pct=crimeday.values/(crimeday.values.sum())*100.0

with sns.axes_style('darkgrid'):
    #x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
    ax=sns.barplot(x=crimeday_pct,y=crimeday.index,orient='h',palette='Greens_r')

plt.xlabel('Crime Percentage')
plt.ylabel('Day of the Week')
plt.title('Crime percentage by day of the week')
plt.show()

In [None]:
#Now that we have done a decent amount of data visualization, 
#we will do some feature engineering to make the models work.
def feature_engineering(df, ref_date=None):
    """Add date, address and coordinate features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'Dates' column, a string 'Address' column and
        numeric 'X' and 'Y' columns. The new columns are written into it.
    ref_date : datetime-like, optional
        Origin that 'n_days' is counted from. Defaults to the earliest value
        of ``df['Dates']``. Pass the same value for every frame that feeds the
        same model, otherwise equal 'n_days' values refer to different
        calendar days in different frames.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    dates=df['Dates']
    if ref_date is None:
        ref_date=dates.min()
    #Whole days elapsed since the reference date
    df['n_days']=(dates-ref_date).dt.days
    df['Day']=dates.dt.day
    df['Month']=dates.dt.month
    df['Year']=dates.dt.year
    df['Hour']=dates.dt.hour
    df['Minute']=dates.dt.minute
    #0/1 flags from case-insensitive substring matches; a missing Address yields 0.
    #NOTE: 'St' matches any address containing the letters "st", not only the street suffix.
    df['Block']=df['Address'].str.contains('Block',case=False,na=False).astype('int64')
    df['Street']=df['Address'].str.contains('St',case=False,na=False).astype('int64')
    df['X-Y']=df['X']-df['Y']
    df['X+Y']=df['X']+df['Y']


#Count n_days from one shared origin so that a given value means the same calendar day in train and test
first_date=min(train['Dates'].min(),test['Dates'].min())
feature_engineering(train,ref_date=first_date)
feature_engineering(test,ref_date=first_date)

In [None]:
#LabelEncoder replaces the categorical variables with integer codes.
le=LabelEncoder()   #shared by the two predictor columns and refit for each, so it ends up fitted on DayOfWeek
le1=LabelEncoder()  #dedicated to the target column

#Target: encode Category and keep the encoded column aside as y
train['Category']=le1.fit_transform(train['Category'])
y=train['Category']

#Predictors: fit the mapping on train, then apply the same mapping to test
for col in ('PdDistrict','DayOfWeek'):
    train[col]=le.fit_transform(train[col])
    test[col]=le.transform(test[col])

#Drop the target and the raw columns that were already turned into features.
#Descript and Resolution are dropped from train only (presumably absent from test - confirm).
train.drop(columns=['Category','Dates','Descript','Resolution','Address'],inplace=True)
test.drop(columns=['Dates','Address'],inplace=True)

In [None]:
#Finally, we will train the LGBMClassifier with the train dataset and use it to
#predict class probabilities for the test dataset.

#Take the number of classes from the fitted Category encoder instead of hard-coding it
n_classes=len(le1.classes_)
lgb=LGBMClassifier(objective='multiclass',num_class=n_classes,max_bin=465,max_delta_step=0.9,
                   learning_rate=0.4,num_leaves=40,n_estimators=100)
lgb.fit(train,y,categorical_feature=['DayOfWeek','PdDistrict'],eval_metric='logloss')
prediction=lgb.predict_proba(test)
#predict_proba columns follow lgb.classes_ (the encoded labels); map them back to category names
#so the header always matches the probability columns, whatever the number of classes
subm=pd.DataFrame(prediction,columns=le1.inverse_transform(lgb.classes_),index=test.index)
subm.to_csv('submission.csv',index_label='Id')