In [1]:
import pandas as pd
import numpy as np

# modeling
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

# validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import classification_report


In [2]:
def describe_categorical(X):
    print(X[X.columns[X.dtypes == 'object']].describe())

def get_original_datasets(idx):
    global combined
    
    train0 = pd.read_csv('data/train.csv')
    targets = train0.Category
    train = combined.head(idx)
    test = combined.iloc[idx:]
    
    return train, test, targets

def combined_dataset():
    train = pd.read_csv("data/train.csv")
    test = pd.read_csv("data/test.csv")
    targets = train.Category
    train.drop('Category', 1, inplace=True)
    combined = train.append(test)
    combined.reset_index(inplace=True)
    combined.drop('index', inplace=True, axis=1)
    
    return combined, train.shape[0], targets

combined, idx, targets = combined_dataset()
le = LabelEncoder()
le.fit_transform(targets)

array([37, 21, 21, ..., 16, 35, 12])

In [3]:
combined['Dates'] = pd.to_datetime(combined['Dates'])
combined['Year'] = combined['Dates'].apply(lambda x: x.year)


In [4]:
combined['Hour'] = combined['Dates'].map(lambda x: x.hour)
combined['Month'] = combined['Dates'].map(lambda x: x.month)


In [5]:
import re

def extractStreetType(x):
    types = r"AV|BL|CR|CT|DR|EX|HWY|HY|LN|PL|PZ|RD|ST|TR|WY|WAY"
    return re.search(types, x, flags=re.IGNORECASE).group()
    
combined['StreetType'] = combined.Address.apply(extractStreetType)

In [6]:
def day_of_week():
    global combined
    dayofweek_dummies = pd.get_dummies(combined['DayOfWeek'],prefix='DayOfWeek')
    combined = pd.concat([combined,dayofweek_dummies],axis=1)
    combined.drop('DayOfWeek', inplace=True, axis=1)
    
day_of_week()

In [7]:
def street_type():
    global combined
    streettype_dummies = pd.get_dummies(combined['StreetType'],prefix='StreetType')
    combined = pd.concat([combined,streettype_dummies],axis=1)
    combined.drop('StreetType', inplace=True, axis=1)

street_type()

In [8]:
def pd_district():
    global combined
    pd_district_dummies = pd.get_dummies(combined['PdDistrict'],prefix='PdDistrict')
    combined = pd.concat([combined,pd_district_dummies],axis=1)
    combined.drop('PdDistrict', inplace=True, axis=1)
pd_district()

In [9]:
def street_no():
    global combined
    combined['StreetNo'] = combined['Address'].apply(lambda x: int(x.split(' ', 1)[0]) if x.split(' ', 1)[0].isdigit() else 0)

street_no()

In [10]:
def process_address():
    global combined
    combined['Address'] = combined['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)

process_address()

In [11]:
le = LabelEncoder()
features_to_encode = ['Dates', 'Address']

for c in features_to_encode:
    combined[c] = le.fit_transform(combined[c])

In [12]:
scaler = StandardScaler()
features_to_scale = ['Address', 'Dates', 'X', 'Y', 'Year', 'Hour', 'Month', 'StreetNo']
for c in features_to_scale:
    combined[c] = scaler.fit_transform(combined[c].reshape(-1, 1))




In [13]:
combined.drop(['Id','Descript', 'Resolution'], inplace=True, axis=1)

In [14]:
types = combined.columns.to_series().groupby(combined.dtypes).groups
for k,v in types.items():
    print(k, v)

float64 ['Address', 'Dates', 'X', 'Y', 'Year', 'Hour', 'Month', 'DayOfWeek_Friday', 'DayOfWeek_Monday', 'DayOfWeek_Saturday', 'DayOfWeek_Sunday', 'DayOfWeek_Thursday', 'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday', 'StreetType_AV', 'StreetType_BL', 'StreetType_Bl', 'StreetType_CR', 'StreetType_CT', 'StreetType_DR', 'StreetType_EX', 'StreetType_HWY', 'StreetType_HY', 'StreetType_LN', 'StreetType_PL', 'StreetType_PZ', 'StreetType_RD', 'StreetType_ST', 'StreetType_TR', 'StreetType_WAY', 'StreetType_WY', 'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN', 'StreetNo']


In [15]:
combined.isnull().sum()

Address                  0
Dates                    0
X                        0
Y                        0
Year                     0
Hour                     0
Month                    0
DayOfWeek_Friday         0
DayOfWeek_Monday         0
DayOfWeek_Saturday       0
DayOfWeek_Sunday         0
DayOfWeek_Thursday       0
DayOfWeek_Tuesday        0
DayOfWeek_Wednesday      0
StreetType_AV            0
StreetType_BL            0
StreetType_Bl            0
StreetType_CR            0
StreetType_CT            0
StreetType_DR            0
StreetType_EX            0
StreetType_HWY           0
StreetType_HY            0
StreetType_LN            0
StreetType_PL            0
StreetType_PZ            0
StreetType_RD            0
StreetType_ST            0
StreetType_TR            0
StreetType_WAY           0
StreetType_WY            0
PdDistrict_BAYVIEW       0
PdDistrict_CENTRAL       0
PdDistrict_INGLESIDE     0
PdDistrict_MISSION       0
PdDistrict_NORTHERN      0
PdDistrict_PARK          0
P

In [16]:
train, test, targets = get_original_datasets(idx)

In [18]:
from sklearn.feature_selection import SelectFromModel

select_model = True
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train, targets)

if select_model:
    sfm = SelectFromModel(clf, prefit=True)
    train = sfm.transform(train)
    test = sfm.transform(test)
    print(le.classes_[sfm.get_support()]) 

NameError: name 'columns' is not defined

In [19]:
X_train, X_test, y_train, y_test = train_test_split(train, targets, test_size=0.3, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(614634, 8)
(263415, 8)
(614634,)
(263415,)


In [20]:
model = RandomForestClassifier(n_estimators=10, max_depth=50, max_features=0.3, min_samples_leaf=5, random_state=42)
model.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=0.3, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [21]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [22]:
from sklearn.metrics import log_loss
clf_probs = model.predict_proba(X_test)
score = log_loss(y_test, clf_probs)
print(" * uncalibrated classifier trained on 800 datapoints: %.3f " % score)

 * uncalibrated classifier trained on 800 datapoints: 4.459 


In [23]:
print(classification_report(y_test, y_test_pred))

                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       451
                    ASSAULT       0.21      0.21      0.21     23137
                 BAD CHECKS       0.00      0.00      0.00       118
                    BRIBERY       0.00      0.00      0.00        84
                   BURGLARY       0.17      0.10      0.13     10890
         DISORDERLY CONDUCT       0.11      0.03      0.05      1226
DRIVING UNDER THE INFLUENCE       0.12      0.00      0.01       701
              DRUG/NARCOTIC       0.36      0.41      0.38     16338
                DRUNKENNESS       0.02      0.00      0.00      1260
               EMBEZZLEMENT       0.00      0.00      0.00       340
                  EXTORTION       0.00      0.00      0.00        82
            FAMILY OFFENSES       0.22      0.01      0.03       144
     FORGERY/COUNTERFEITING       0.23      0.06      0.09      3161
                      FRAUD      

  'precision', 'predicted', average, warn_for)


In [25]:
import csv

predicted = np.array(model.predict_proba(test))

labels = ['Id']
for i in model.classes_:
    labels.append(i)
with open('submissions/sn_random_forest_submission.csv', 'wt') as outf:
  fo = csv.writer(outf, lineterminator='\n')
  fo.writerow(labels)

  for i, pred in enumerate(predicted):
    fo.writerow([i] + list(pred))

In [26]:
predicted.shape

(884262, 39)