In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
!pip install squarify

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the zipped competition CSVs; the 'Dates' column is parsed to datetime
# on read, and the test frame is indexed by its 'Id' column.
train = pd.read_csv(
    '/kaggle/input/sf-crime/train.csv.zip',
    parse_dates=['Dates'],
)
test = pd.read_csv(
    '/kaggle/input/sf-crime/test.csv.zip',
    index_col='Id',
    parse_dates=['Dates'],
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame (indexed by 'Id').
test.head()

In [None]:
# Bar chart of the number of incidents in each crime category.
plt.rcParams['figure.figsize'] = (20, 9)
plt.style.use('dark_background')

# The column must be passed as `x=`: current seaborn releases no longer accept
# the data vector as a positional argument.
sns.countplot(x = train['Category'], palette = 'gnuplot')

plt.title('Major Crimes in Sanfrancisco', fontweight = 30, fontsize = 20)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# plotting a tree map
# squarify is pip-installed earlier in the notebook but was never imported,
# so `squarify.plot` raised a NameError on a fresh kernel.
import squarify

# The 25 most frequent crime categories and their counts.
y = train['Category'].value_counts().head(25)

plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fivethirtyeight')

# One colour per rectangle, so no two categories share a colour.
color = plt.cm.magma(np.linspace(0, 1, len(y)))
squarify.plot(sizes = y.values, label = y.index, alpha=.8, color = color)
plt.title('Tree Map for Top 25 Crimes', fontsize = 20)

plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud

# Word cloud of the free-text crime descriptions.
plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fast')

# str(Series) only yields pandas' truncated repr (a handful of rows plus the
# Name/Length/dtype footer), so concatenate the actual descriptions instead.
descriptions = ' '.join(train['Descript'].dropna().astype(str))
wc = WordCloud(background_color = 'orange', width = 1500, height = 1500).generate(descriptions)
plt.title('Description of the Crime', fontsize = 20)

plt.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
# Bar chart of incident counts per police district.
plt.rcParams['figure.figsize'] = (20, 9)
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# old names were later removed; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

color = plt.cm.spring(np.linspace(0, 1, 15))
train['PdDistrict'].value_counts().plot.bar(color = color, figsize = (15, 10))

plt.title('District with Most Crime',fontsize = 30)

plt.xticks(rotation = 90)
plt.show()

In [None]:
# Bar chart of the 15 addresses with the most recorded incidents.
# (The original cell contained zero-width-space characters on its "blank"
# lines, which Python rejects as a SyntaxError; they are removed here.)
plt.rcParams['figure.figsize'] = (20, 9)
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# old names were later removed; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

color = plt.cm.ocean(np.linspace(0, 1, 15))
train['Address'].value_counts().head(15).plot.bar(color = color, figsize = (15, 10))

plt.title('Top 15 Regions in Crime',fontsize = 20)

plt.xticks(rotation = 90)
plt.show()

In [None]:

# Pie chart of the share of incidents falling on each day of the week.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# old names were later removed; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

day_counts = train['DayOfWeek'].value_counts()
# One explode offset per slice, derived from the data so the lengths always
# match instead of relying on a hard-coded 7-tuple.
day_counts.plot.pie(figsize = (15, 8), explode = [0.1] * len(day_counts))

plt.title('Crime count on each day',fontsize = 20)

plt.show()

In [None]:
# Stack the training rows (restricted to the columns the test set provides)
# on top of the test rows, and work on a copy of that combined frame.
y = test.columns.tolist()
t_data = train[y]
join = pd.concat([t_data, test], axis = 0)
merge = join.copy()


In [None]:
# Preview the first rows of the combined train + test feature frame.
merge.head()

In [None]:
# Set the target column aside, then show the number of missing values in each
# column of the combined feature frame.
t = train['Category']
merge.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the crime categories; LB is kept so the codes can be mapped
# back to category names later on.
LB = LabelEncoder()
LB.fit(t)
tar = LB.transform(t)
print(LB.classes_)

In [None]:
# Break the timestamp into calendar components, then drop the raw column.
date = pd.to_datetime(join['Dates'])
for column, attribute in (
    ('Date', 'date'),
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
):
    merge[column] = getattr(date.dt, attribute)
merge.drop(columns = ['Dates'], inplace = True)

In [None]:
# Per calendar date, the count of non-null entries in the first remaining
# column of the combined (train + test) frame. Note that this rebinds `date`,
# which previously held the parsed timestamps.
date = merge.groupby('Date').count().iloc[:, 0]

In [None]:
# Density of the per-date record counts, with the median marked.
median_per_day = date.median()

# `shade=` is deprecated in seaborn; `fill=` is its replacement.
sns.kdeplot(data = date, fill = True)
plt.axvline(x = median_per_day, ymax = 0.95, linestyle = '-')
# Separator added so the label reads "Median: <value>" instead of running the
# word and the number together.
plt.annotate('Median: ' + str(median_per_day), xy = (median_per_day, 0.005))

In [None]:
# Integer-encode the police district and plot how often each code occurs.
lb = LabelEncoder()
merge['PdDis'] = lb.fit_transform(merge['PdDistrict'])
# The column must be passed as `x=`: current seaborn releases no longer accept
# the data vector as a positional argument.
sns.countplot(x = merge['PdDis'])

In [None]:
# Show the original district names next to their integer codes.
merge[['PdDistrict', 'PdDis']].head(10)

In [None]:
# The text column is no longer needed once its integer code exists.
merge.drop(columns = ['PdDistrict'], inplace = True)

In [None]:
# Integer-encode the weekday names (the existing encoder is refitted on this
# column) and show names and codes side by side.
day_names = merge['DayOfWeek']
merge['DayWeek'] = lb.fit_transform(day_names)
merge.loc[:, ['DayOfWeek', 'DayWeek']].head(10)

In [None]:
# The text column is no longer needed once its integer code exists.
merge.drop(columns = ['DayOfWeek'], inplace = True)

In [None]:
# Derive two boolean flags from the address text, then drop the raw text.
address = merge['Address']
merge['Block'] = address.str.contains('block', case = False)
# NOTE(review): this matches "st" anywhere in the address, case-insensitively
# (e.g. inside a longer word) -- confirm that a whole-word street suffix match
# is not what was intended.
merge['ST'] = address.str.contains('ST', case = False)
merge.drop(columns = ['Address'], inplace = True)

In [None]:
# Print the minimum and maximum of each coordinate column to spot outliers.
for axis in ('X', 'Y'):
    print(merge[axis].min(), merge[axis].max())

In [None]:
# Replace coordinates at or beyond the cut-offs (X >= -120.5, Y >= 90) with the
# median of the values below the cut-off. Both medians are computed before
# either column is modified.
x_out_of_range = merge['X'] >= -120.5
y_out_of_range = merge['Y'] >= 90
medX = merge.loc[merge['X'] < -120.5, 'X'].median()
medY = merge.loc[merge['Y'] < 90, 'Y'].median()
merge.loc[x_out_of_range, 'X'] = medX
merge.loc[y_out_of_range, 'Y'] = medY

In [None]:
# Add the sum and the difference of the two coordinates as extra features.
merge['X+Y'] = merge['X'].add(merge['Y'])
merge['X-Y'] = merge['X'].sub(merge['Y'])

In [None]:
# The calendar-date column was only needed for the per-day counts above.
merge.drop(columns = ['Date'], inplace = True)

In [None]:
# Undo the earlier concatenation: the first len(train) rows are the training
# features, everything after them is the test features.
n_train = len(train)
data_train = merge.iloc[:n_train]
data_test = merge.iloc[n_train:]

In [None]:
import lightgbm as lg

# Training set: engineered features with the integer-encoded crime category as
# label; the two label-encoded columns are declared categorical.
trains = lg.Dataset(data_train, label = tar, categorical_feature=['PdDis', 'DayWeek'])
params = {
    'boosting':'gbdt',
    'objective':'multiclass',
    # Derived from the fitted label encoder instead of a hard-coded 39, so it
    # always matches the number of categories actually present in the labels.
    'num_class': len(LB.classes_),
    'max_delta_step':0.9,
    'min_data_in_leaf': 20,
    'learning_rate': 0.4,
    'max_bin': 480,
    'num_leaves': 45,
    'verbose' : 1
}
# 120 boosting rounds, passed by keyword to make the meaning explicit.
bst = lg.train(params, trains, num_boost_round = 120)

In [None]:
# Score the test features with the trained booster; the result is later
# wrapped in a DataFrame with one column per crime category.
prediction = bst.predict(data_test)

In [None]:
# Build the submission table: one column per crime category (in the label
# encoder's class order, which is the order of the model's output columns) and
# one row per test Id.
result1 = pd.DataFrame(prediction, columns = LB.classes_, index = data_test.index)
# Write with a .csv extension so the file is recognised as a submission file.
result1.to_csv('submission.csv', index_label = 'Id')
# Last expression of the cell so the preview is actually displayed.
result1.head()