In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from collections import defaultdict
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pylab
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training set from the zipped CSV (single assignment; the
# original bound the same name twice in one statement).
train_data = pd.read_csv('/kaggle/input/sf-crime/train.csv.zip')
train_data.head()

In [None]:
# Read the unlabelled test set from the zipped CSV (single assignment; the
# original bound the same name twice in one statement).
test_data = pd.read_csv('/kaggle/input/sf-crime/test.csv.zip')
test_data.head()

In [None]:
# Print column dtypes and non-null counts, then show (rows, columns) as the cell output.
train_data.info()
train_data.shape

In [None]:
# Print column dtypes and non-null counts, then show (rows, columns) as the cell output.
test_data.info()
test_data.shape

In [None]:
# Call describe() -- without the parentheses the cell only displays the
# bound-method repr, not the summary statistics.
train_data.describe()

In [None]:
# Integer-encode every training column (one LabelEncoder per column name) so
# that non-numeric fields can take part in a correlation matrix; the raw
# coordinates X and Y are left out of the matrix.
d = defaultdict(LabelEncoder)
encoded_columns = train_data.apply(lambda column: d[column.name].fit_transform(column))
sf_encode = encoded_columns.drop(columns=['X', 'Y'])

corrmat = sf_encode.corr()
f, ax = plt.subplots(figsize=(12, 12))
plot2 = sns.heatmap(corrmat, vmax=.8)
plot2.axes.set_title('Correlation Heat Map')
plt.xticks(rotation=90)
plt.yticks(rotation=45)
plt.show()

In [None]:
# Same correlation view for the test set: integer-encode every column with a
# fresh set of LabelEncoders, leave out X and Y, and plot the correlations.
d = defaultdict(LabelEncoder)
encoded_columns = test_data.apply(lambda column: d[column.name].fit_transform(column))
sf_encode = encoded_columns.drop(columns=['X', 'Y'])

corrmat = sf_encode.corr()
f, ax = plt.subplots(figsize=(12, 12))
plot2 = sns.heatmap(corrmat, vmax=.8)
plot2.axes.set_title('Correlation Heat Map')
plt.xticks(rotation=90)
plt.yticks(rotation=45)
plt.show()

In [None]:
# Incidents per crime category, most frequent first (value_counts sorts
# descending). The original also ran an equivalent groupby/count whose result
# was thrown away; it is dropped here.
number_of_crimes = train_data.Category.value_counts()
_n_crime_plot = sns.barplot(x=number_of_crimes.index, y=number_of_crimes)
# Rotate the labels seaborn already attached to the bars instead of
# overwriting them with set_xticklabels (which requires fixed tick positions).
_n_crime_plot.tick_params(axis='x', labelrotation=90)

In [None]:
# Incidents per police district in the training data, most frequent first.
# The original also ran an equivalent groupby/count whose result was thrown
# away; it is dropped here.
most_dangerous_districts = train_data.PdDistrict.value_counts()
_n_crime_plot = sns.barplot(x=most_dangerous_districts.index, y=most_dangerous_districts)
# Rotate the labels seaborn already attached to the bars instead of
# overwriting them with set_xticklabels (which requires fixed tick positions).
_n_crime_plot.tick_params(axis='x', labelrotation=90)

In [None]:
# Incidents per police district in the TEST data. The tallies get their own
# name: reusing `most_dangerous_districts` here would overwrite the training
# counts that the district/category heatmap cell reads afterwards.
test_district_counts = test_data.PdDistrict.value_counts()
_n_crime_plot = sns.barplot(x=test_district_counts.index, y=test_district_counts)
# Rotate the labels seaborn already attached to the bars instead of
# overwriting them with set_xticklabels (which requires fixed tick positions).
_n_crime_plot.tick_params(axis='x', labelrotation=90)

In [None]:
# District x category incident counts. Only one value column is aggregated
# (values="Dates") rather than running len() over every column and then
# discarding all but "Dates".
pt = pd.pivot_table(
    train_data,
    values="Dates",
    index="PdDistrict",
    columns="Category",
    aggfunc=len,
    fill_value=0,
)
# Order rows/columns by overall frequency. A descriptive name is used instead
# of `_`, which IPython reserves for the previous cell's output.
district_by_crime = pt.loc[most_dangerous_districts.index, number_of_crimes.index]
ax = sns.heatmap(district_by_crime)
ax.set_title("Number of Crimes per District");

In [None]:
# Convert the timestamp column to datetime and derive the hour of day on BOTH
# frames. The original added 'Hour' to the training data only, leaving the
# test set without a feature the models are trained on.
train_data['Dates'] = train_data['Dates'].astype('datetime64[ns]')
train_data['Hour'] = train_data.Dates.dt.hour
test_data['Dates'] = test_data['Dates'].astype('datetime64[ns]')
test_data['Hour'] = test_data.Dates.dt.hour


In [None]:
pylab.rcParams['figure.figsize'] = (14.0, 8.0)

# Per-category subsets of the training incidents.
larceny = train_data[train_data['Category'] == "LARCENY/THEFT"]
assault = train_data[train_data['Category'] == "ASSAULT"]
drug = train_data[train_data['Category'] == "DRUG/NARCOTIC"]
vehicle = train_data[train_data['Category'] == "VEHICLE THEFT"]
vandalism = train_data[train_data['Category'] == "VANDALISM"]
burglary = train_data[train_data['Category'] == "BURGLARY"]

# (panel title, incident subset, (row, col) slot in the 3x3 grid)
category_panels = [
    ('Larceny/Theft', larceny, (1, 0)),
    ('Assault', assault, (1, 1)),
    ('Drug/Narcotic', drug, (1, 2)),
    ('Vehicle', vehicle, (2, 0)),
    ('Vandalism', vandalism, (2, 1)),
    ('Burglary', burglary, (2, 2)),
]

with plt.style.context('fivethirtyeight'):
    # Top row: all incidents by hour, spanning the full width.
    ax1 = plt.subplot2grid((3, 3), (0, 0), colspan=3)
    ax1.plot(train_data.groupby('Hour').size(), 'ro-')
    ax1.set_title('All crimes')
    # One tick per hour of the day. The original stepped from the padded axis
    # limit, which puts the ticks at fractional positions between the hours.
    ax1.xaxis.set_ticks(np.arange(24))

    # Remaining rows: one small panel per selected category.
    for panel_title, subset, grid_slot in category_panels:
        panel_ax = plt.subplot2grid((3, 3), grid_slot)
        panel_ax.plot(subset.groupby('Hour').size(), 'o-')
        panel_ax.set_title(panel_title)

    # Figure-level heading placed just above the top edge of the figure.
    pylab.gcf().text(0.5, 1.03,
                     'San Franciso Crime Occurence by Hour',
                     horizontalalignment='center',
                     verticalalignment='top',
                     fontsize = 28)

# `pad` must be passed by keyword; current Matplotlib rejects a positional pad.
plt.tight_layout(pad=2)
plt.show()

In [None]:
# Derive calendar month and year from the timestamp on both frames. The
# datetime cast is repeated here so the cell also works on string timestamps.
for frame in (train_data, test_data):
    frame['Dates'] = frame['Dates'].astype('datetime64[ns]')
    stamps = frame.Dates.dt
    frame['Month'] = stamps.month
    frame['Year'] = stamps.year

In [None]:
# Number of training incidents per (Year, Month), drawn as one line.
pylab.rcParams['figure.figsize'] = (16.0, 5.0)
yearMonth = train_data.groupby(['Year', 'Month']).size()
ax = yearMonth.plot(lw=2)
ax.set_title('San Franciso Crimes Trend by Month&Year', fontsize=24)
plt.show()

In [None]:
# Number of test incidents per (Year, Month), drawn as one line.
pylab.rcParams['figure.figsize'] = (16.0, 5.0)
yearMonth = test_data.groupby(['Year', 'Month']).size()
ax = yearMonth.plot(lw=2)
ax.set_title('San Franciso Crimes Trend by Month&Year', fontsize=24)
plt.show()

In [None]:
# Training incidents per day of the week, most frequent first.
Days = train_data.DayOfWeek.value_counts()
Day_plot = sns.barplot(x=Days.index, y=Days)
# Rotate the labels seaborn already attached to the bars instead of
# overwriting them with set_xticklabels (which requires fixed tick positions).
Day_plot.tick_params(axis='x', labelrotation=90)

In [None]:
# Test incidents per day of the week, most frequent first.
test_Days = test_data.DayOfWeek.value_counts()
test_Day_plot = sns.barplot(x=test_Days.index, y=test_Days)
# Rotate the labels seaborn already attached to the bars instead of
# overwriting them with set_xticklabels (which requires fixed tick positions).
test_Day_plot.tick_params(axis='x', labelrotation=90)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
train_data.describe(include='all')

In [None]:
# Count missing values per training column to confirm there are none.
train_data.isnull().sum()

In [None]:
# Count missing values per test column.
test_data.isnull().sum()

In [None]:
# Inspect the distinct values of each training column, starting with Category.
train_data.Category.unique()

In [None]:
# Distinct values of the PdDistrict column.
train_data.PdDistrict.unique()


In [None]:
# Distinct values of the Resolution column.
train_data.Resolution.unique()


In [None]:
# Distinct values of the Address column.
train_data.Address.unique()


In [None]:
# Distinct values of the X coordinate column.
train_data.X.unique()


In [None]:
# Distinct values of the Y coordinate column.
train_data.Y.unique()


In [None]:
# Distinct values of the derived Hour column.
train_data.Hour.unique()


In [None]:
# Distinct values of the derived Month column.
train_data.Month.unique()


In [None]:
# Distinct values of the derived Year column.
train_data.Year.unique()
# End of the per-column checks; no column is dropped at this point.

In [None]:
# Report how many Resolution values are missing. The original displayed the
# raw boolean Series (one entry per row), which is unreadable at this size.
train_data.Resolution.isna().sum()

In [None]:
# Box plot of Hour to look for outliers.
sns.boxplot(data=train_data, x='Hour')  # author's reading: no outliers

In [None]:
# Box plot of Year to look for outliers.
sns.boxplot(data=train_data, x='Year')  # author's reading: no outliers

In [None]:
# Box plot of Month to look for outliers.
sns.boxplot(data=train_data, x='Month')



In [None]:
# Box plot of the X coordinate before any outlier handling.
sns.boxplot(data=train_data, x='X')

In [None]:
# Box plot of the Y coordinate before any outlier handling.
sns.boxplot(data=train_data, x='Y')

In [None]:
#function for outliers
def fix_outliers(df_, colName, factor=1.5):
    """Clamp one column of a DataFrame to its interquartile-range fences.

    Values above ``Q3 + factor * IQR`` are replaced by that upper fence and
    values below ``Q1 - factor * IQR`` by the lower fence; everything in
    between is left as is.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding the column. It is modified IN PLACE (the column is
        overwritten) and the same object is returned, as in the original.
    colName : str
        Name of the numeric column to clamp.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        ``df_`` itself, with ``colName`` clamped.
    """
    column = df_[colName]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    # Named `iqr` rather than `range`, which shadowed the Python builtin.
    iqr = q3 - q1
    whisker_lower = q1 - factor * iqr
    whisker_upper = q3 + factor * iqr
    # clip() expresses the two-sided clamp directly (replaces nested np.where).
    df_[colName] = column.clip(lower=whisker_lower, upper=whisker_upper)
    return df_

In [None]:
# Clamp both training coordinate columns to their IQR fences, X first, then Y.
for coord in ('X', 'Y'):
    train_data = fix_outliers(train_data, coord)

In [None]:
# Re-plot Y after fix_outliers has clamped it.
sns.boxplot(data=train_data, x='Y')  # outliers fixed

In [None]:
# drop_duplicates() returns a new frame; the original discarded it, so no row
# was ever removed. Keep the de-duplicated result.
train_data = train_data.drop_duplicates()
train_data.shape

In [None]:
# Drop the columns that are not needed for modelling.
train_data = train_data.drop(
    columns=['Resolution', 'Descript', 'Address', 'Dates', 'DayOfWeek']
)

In [None]:
# Re-plot X after fix_outliers has clamped it.
sns.boxplot(data=train_data, x='X')  # outliers fixed

In [None]:
# The test data has no null values and its unique values need no checking,
# so the only clean-up is dropping the columns that are not used.
test_data = test_data.drop(columns=['Id', 'Address', 'Dates', 'DayOfWeek'])

In [None]:
# Box plot of the test-set X coordinate before outlier handling.
sns.boxplot(data=test_data, x='X')

In [None]:
# Box plot of the test-set Y coordinate before outlier handling.
sns.boxplot(data=test_data, x='Y')

In [None]:
# Clamp both test coordinate columns to their own IQR fences, X first, then Y.
for coord in ('X', 'Y'):
    test_data = fix_outliers(test_data, coord)

In [None]:
# Re-plot the test-set X coordinate after fix_outliers has clamped it.
sns.boxplot(data=test_data, x='X')

In [None]:
# Re-plot the test-set Y coordinate after fix_outliers has clamped it.
sns.boxplot(data=test_data, x='Y')

In [None]:
# Call head() -- without the parentheses the cell only displays the
# bound-method repr, not the first rows.
test_data.head()

In [None]:
# Integer-encode the district names: the mapping is learned from the training
# data and the very same mapping is then applied to the test data.
le1 = LabelEncoder()
le1.fit(train_data['PdDistrict'])
train_data['PdDistrict'] = le1.transform(train_data['PdDistrict'])
test_data['PdDistrict'] = le1.transform(test_data['PdDistrict'])

# Features are every column except the target; the target is integer-encoded
# with its own encoder (le2).
X = train_data.drop('Category', axis=1)
le2 = LabelEncoder()
y = le2.fit_transform(train_data['Category'])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. A fixed random_state makes
# the split (and every score computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
# Baseline model: a decision tree with default settings, seeded so that
# repeated runs build the same tree.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the held-out split.
print (classification_report(y_test,predictions))

In [None]:
# Random forest: 40 trees, a node needs at least 100 samples to be split.
# Seeded for reproducibility; n_jobs=-1 builds the trees on all cores.
rfc = RandomForestClassifier(
    n_estimators=40, min_samples_split=100, random_state=42, n_jobs=-1
)
rfc.fit(X_train, y_train)

In [None]:
# Compare accuracy on the data the forest was fitted on with accuracy on the
# held-out split.
rfc_pred = rfc.predict(X_test)
rfc_train_pred = rfc.predict(X_train)
print("Train Accuracy: ", accuracy_score(y_train, rfc_train_pred))
print("Test Accuracy: ", accuracy_score(y_test, rfc_pred))

In [None]:
# Per-class precision / recall / F1 of the random forest on the held-out split.
print (classification_report(y_test,rfc_pred))

In [None]:
# Model tuning: more trees (60) and a lower split threshold (80 samples).
# Seeded so the comparison with the first forest is not affected by run-to-run
# randomness; n_jobs=-1 builds the trees on all cores.
rfc = RandomForestClassifier(
    n_estimators=60, min_samples_split=80, random_state=42, n_jobs=-1
)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print("Train Accuracy: ", accuracy_score(y_train, rfc.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, rfc_pred))

In [None]:
# Confusion matrix of the held-out split, drawn as an un-annotated heatmap.
# NOTE(review): `predictions` is the decision-tree output, not the random
# forest's `rfc_pred` -- confirm which model this figure is meant to show.
cm = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=False, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

In [None]:
# Linear-kernel SVM fitted on the first 3000 training rows only
# (presumably to keep SVC training time manageable -- TODO confirm).
svm_clf = SVC(C=0.1, kernel='linear')
# .iloc makes the positional row slice explicit for the feature frame.
svm_clf = svm_clf.fit(X_train.iloc[:3000], y_train[:3000])
y_pred = svm_clf.predict(X_test)
# Arguments in (y_true, y_pred) order, matching the other accuracy_score calls.
accuracy_score(y_test, y_pred)

In [None]:
# Model tuning: RBF kernel with a much larger C, again fitted on the first
# 3000 training rows only (presumably to keep SVC training time manageable --
# TODO confirm).
svm_clf_2 = SVC(C=1000, kernel='rbf')
# .iloc makes the positional row slice explicit for the feature frame.
svm_clf_2 = svm_clf_2.fit(X_train.iloc[:3000], y_train[:3000])
y_pred_2 = svm_clf_2.predict(X_test)
# Arguments in (y_true, y_pred) order, matching the other accuracy_score calls.
accuracy_score(y_test, y_pred_2)