In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


# The Task

* Dates - timestamp of the crime incident
* DayOfWeek - the day of the week
* PdDistrict - name of the Police Department District
* Resolution - how the crime incident was resolved (only in train.csv)
* Address - the approximate street address of the crime incident
* X - Longitude
* Y - Latitude

#### Target

* Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
* Descript - detailed description of the crime incident (only in train.csv). It is not a prediction target; it is dropped before modelling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/sf-crime/train.csv")
test_data = pd.read_csv("/kaggle/input/sf-crime/test.csv")

In [None]:
# Preview the first rows of the raw training table.
train_data.head()

### Missing Values

In [None]:
# Per-column count of missing values, first for the training table, then for the test table.
for frame in (train_data, test_data):
    null_counts = frame.isnull().sum()
    print(null_counts)

### Type of data

In [None]:
# Column dtypes, non-null counts and memory usage of the training table.
train_data.info()

### Features Selection and Engineering

* Dates transform and columns creation
* Drop not used columns
* Create Dummies for categorical features
* From the ‘Dates’ field, we extracted the Year, the Month, the Week of the year, the Weekday, the Hour, and the number of days since the first day in the data.
* From the ‘Address’ field we extracted if the incident has taken place in a crossroad or on a building block.

In [None]:
# "Descript" and "Resolution" are removed from the training table before feature engineering.
train_data = train_data.drop(columns=["Descript", "Resolution"])

In [None]:
def transformDataset(dataset, reference_date=None):
    """Build the model-ready feature table from a raw crime DataFrame.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'Dates' (anything ``pd.to_datetime`` can
        parse), 'Address' (string) and 'PdDistrict' (categorical string).
        Any other columns are passed through unchanged.
    reference_date : date-like, optional
        Day from which 'n_days' is counted. When omitted (the default), the
        earliest day found in ``dataset`` itself is used, so two frames that
        start on different days get different origins. Pass the same value
        for every frame to put them on a common scale.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` without 'Dates' and 'Address', with:

        * 'n_days'     - whole days elapsed since the reference day
        * 'Year', 'Month', 'Hour' - calendar parts of the timestamp
        * 'DayOfWeek'  - overwritten with the numeric weekday (Monday = 0)
        * 'WeekOfYear' - ISO 8601 week number
        * 'Block'      - 1 if 'Address' contains "block" (case-insensitive),
                         0 otherwise (missing addresses count as 0)
        * 'PdDistrict' one-hot encoded as uint8 columns, first level dropped
    """
    timestamps = pd.to_datetime(dataset['Dates'])
    # Midnight-normalised timestamps stand in for calendar days and keep the
    # subtraction below vectorised (no object-dtype date column needed).
    calendar_days = timestamps.dt.normalize()
    if reference_date is None:
        origin = calendar_days.min()
    else:
        origin = pd.Timestamp(reference_date).normalize()

    # `assign` returns a new frame, so the caller's DataFrame is left intact.
    # Keyword order matters: new columns are appended in this order, while an
    # existing 'DayOfWeek' column is overwritten in place.
    features = dataset.assign(
        n_days=(calendar_days - origin).dt.days,
        Year=timestamps.dt.year,
        DayOfWeek=timestamps.dt.dayofweek,
        # `Series.dt.weekofyear` was removed in pandas 2.0; isocalendar() is
        # the supported replacement and yields the same ISO week number.
        WeekOfYear=timestamps.dt.isocalendar().week.astype(int),
        Month=timestamps.dt.month,
        Hour=timestamps.dt.hour,
        Block=dataset['Address'].str.contains('block', case=False, na=False).astype(int),
    )

    features = features.drop(columns=['Dates', 'Address'])

    # dtype is pinned so the dummies stay numeric on every pandas version
    # (pandas >= 2 would otherwise produce bool columns).
    return pd.get_dummies(features, columns=['PdDistrict'], drop_first=True, dtype='uint8')
    

In [None]:
# Replace the raw training table with its engineered-feature version.
train_data = transformDataset(train_data)

In [None]:
# Apply the same feature engineering to the test table.
# NOTE(review): n_days is counted from this frame's own earliest date, which
# may differ from the training frame's origin - confirm this is intended.
test_data  = transformDataset(test_data)

# Outliers

In [None]:
# Preview the engineered training table.
train_data.head()

In [None]:
# Pairwise view of the coordinates (X = longitude, Y = latitude) to spot stray points.
sns.pairplot(train_data[["X", "Y"]])

In [None]:
# Box plot of the latitude column to make extreme values visible.
# NOTE(review): the frame is passed positionally; how seaborn interprets the
# first positional argument may depend on the installed version - confirm.
sns.boxplot(train_data[["Y"]])

In [None]:
# Keep only rows whose latitude (Y) is below 80; larger values are treated as
# outliers (presumably placeholder coordinates - TODO confirm against the data).
# `.copy()` makes the filtered frame independent of the original, so assigning
# columns to it later does not raise SettingWithCopyWarning.
train_data = train_data[train_data["Y"] < 80].copy()

# Distribution of longitude after the filter. `sns.distplot` is deprecated;
# `histplot` with a KDE overlay and density scaling is its replacement.
sns.histplot(train_data["X"], kde=True, stat="density");

## Target

In [None]:
# Number of incidents per crime category.
# Labels and bar widths are both taken from the same value_counts() result:
# pairing `unique()` (order of first appearance) with `value_counts()`
# (descending frequency) would attach counts to the wrong categories.
category_counts = train_data["Category"].value_counts()

fig, ax = plt.subplots(figsize=(9.2, 10))
ax.barh(category_counts.index, category_counts.values)
ax.set_xlabel("Number of incidents")
ax.set_ylabel("Category")
ax.set_title("Incidents per crime category");

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the category names, then swap the text labels for their
# integer codes. `le` is kept so the codes can be mapped back to names later.
le = LabelEncoder().fit(train_data["Category"])
train_data["Category"] = le.transform(train_data["Category"])


# X and y

In [None]:
# Target vector: the encoded crime category.
y = train_data["Category"].to_numpy()
# Feature matrix: every remaining column, as a plain NumPy array.
X = train_data.drop(columns="Category").to_numpy()

# Training Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split (and therefore the same scores).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline model: an unconstrained decision tree. The seed is fixed so the
# fitted tree is identical from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)

# Prediction and Evaluation

* A prediction on a new data point is made by checking which region of the partition of the feature space the point lies in, and then predicting the majority target in that region (or the single target, for pure leaves).


In [None]:
# Predicted class codes of the decision tree for the held-out rows.
predictions = dtree.predict(X_test)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Confusion matrix of the decision tree on the held-out rows, drawn as a heatmap
# (cell annotations are off; pass annot=True to print the counts).
cm = confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(cm, annot=False, ax=ax)
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix');

In [None]:
# Per-class precision / recall / F1 of the decision tree on the held-out rows.
print (classification_report(y_test,predictions))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 40 trees; a node is split only if it holds at least 100 samples.
# random_state makes the forest reproducible; n_jobs=-1 fits the trees on all
# available cores (the fitted model does not depend on the number of workers).
rfc = RandomForestClassifier(
    n_estimators=40,
    min_samples_split=100,
    random_state=42,
    n_jobs=-1,
)
rfc.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the same held-out rows as the decision tree.
rfc_pred = rfc.predict(X_test)
print (classification_report(y_test,rfc_pred))

### Feature importances

In [None]:
# Feature importances of the random forest.
# The tick labels are derived exactly the way X was built (all columns except
# "Category") instead of slicing `columns[1:]`, which is only correct when
# "Category" happens to be the first column.
n_features = X.shape[1]
feature_names = train_data.drop("Category", axis=1).columns
assert len(feature_names) == n_features, "feature labels do not match X"

plt.barh(range(n_features), rfc.feature_importances_)
plt.yticks(np.arange(n_features), feature_names)
plt.xlabel("Importance")
plt.title("Random forest feature importances");

# Submission

In [None]:
# Category names known to the encoder, and the integer code assigned to each.
keys = le.classes_
values = le.transform(keys)
keys

In [None]:
# Lookup table: category name -> encoded integer.
dictionary = {name: code for name, code in zip(keys, values)}
print(dictionary)

In [None]:
# Preview the engineered test table before scoring.
test_data.head()

In [None]:
# 'Id' is a row identifier, not a feature. `columns=` replaces the positional
# axis argument (removed in pandas 2.0); errors='ignore' keeps the cell safe
# to re-run after the column is already gone.
test_data = test_data.drop(columns='Id', errors='ignore')

In [None]:
# Score the test table with the random forest.
# The model was fitted on a bare NumPy array, so the test features are selected
# explicitly in the training column order and passed as an array too. A missing
# column raises a KeyError here instead of silently misaligning features.
feature_columns = train_data.drop("Category", axis=1).columns
y_pred_proba = rfc.predict_proba(test_data[feature_columns].values)
y_pred_proba

* for each prediction there is a vector of 39 probabilities

In [None]:
# One row per test incident, one probability column per category name.
result = pd.DataFrame(data=y_pred_proba, columns=keys)
result.head()

In [None]:
# Write the submission file; the row index is exported as the 'Id' column.
result.to_csv("rfc_predict_4.csv", index=True, index_label='Id')