In [19]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
import datetime
    

In [20]:
df = pd.read_csv('nyc.csv',header=0)

In [21]:
df["Borough"] = df["Borough"].str.replace('\d+', '')
df["Borough"] = df["Borough"].str.replace('Unspecified', '')    
df["Borough"] = df["Borough"].str.strip()
df["borough_num"] = df["Borough"].map({"MANHATTAN":1, "BROOKLYN":2, "QUEENS":3, "STATEN ISLAND":4, "BRONX":5})
df["borough_num"].fillna(0, inplace= True)
df['borough_num'] = df['borough_num'].astype(int)
df = df[df["borough_num"] != 0]

In [22]:
df.drop("Unnamed: 0", axis=1, inplace=True)

In [23]:
df["Created Date"] = pd.to_datetime(df["Created Date"])

In [24]:
df["Created Date"].dtype

dtype('<M8[ns]')

In [25]:
df['month'] = df['Created Date'].apply(lambda x:x.month)

In [26]:
df['hour'] = df['Created Date'].apply(lambda x:x.hour)

In [27]:
df['weekday'] = df['Created Date'].apply(lambda x:x.weekday())

In [28]:
df.head()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Descriptor,Borough,processing_time,borough_num,month,hour,weekday
0,32305299,2016-01-01 00:00:09,2016-01-01 01:57:32,NYPD,Loud Music/Party,BROOKLYN,0.081516,2,1,0,4
1,32310343,2016-01-01 00:00:40,2016-01-01 03:12:53,NYPD,Loud Music/Party,BRONX,0.133484,5,1,0,4
2,32309107,2016-01-01 00:01:09,2016-01-21 09:20:55,HPD,NO LIGHTING,BRONX,20.388727,5,1,0,4
4,32309212,2016-01-01 00:03:03,2016-01-08 01:13:00,HPD,ENTIRE BUILDING,BRONX,7.048576,5,1,0,4
5,32305983,2016-01-01 00:03:03,2016-01-01 03:24:46,NYPD,Loud Music/Party,QUEENS,0.140081,3,1,0,4


In [29]:
df["Agency"].unique()

array(['NYPD', 'HPD', 'DOHMH', 'DOT', 'DSNY', 'DEP', 'DOB', 'TLC', 'FDNY',
       'DPR', 'EDC', 'DOF', 'DCA', 'DFTA', 'DOE', '3-1-1'], dtype=object)

In [30]:
agency_num = {}
for num, agency in enumerate((df['Agency'].unique())): 
    agency_num[agency] = num
df['agency_num'] = df['Agency'].apply(lambda x: agency_num[x])

In [31]:
def transform_processing(time):
    if time > 1:
        return 1
    else:
        return 0

In [32]:
df["processing_time_window"] = df["processing_time"].apply(transform_processing)

In [33]:
df.head()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Descriptor,Borough,processing_time,borough_num,month,hour,weekday,agency_num,processing_time_window
0,32305299,2016-01-01 00:00:09,2016-01-01 01:57:32,NYPD,Loud Music/Party,BROOKLYN,0.081516,2,1,0,4,0,0
1,32310343,2016-01-01 00:00:40,2016-01-01 03:12:53,NYPD,Loud Music/Party,BRONX,0.133484,5,1,0,4,0,0
2,32309107,2016-01-01 00:01:09,2016-01-21 09:20:55,HPD,NO LIGHTING,BRONX,20.388727,5,1,0,4,1,1
4,32309212,2016-01-01 00:03:03,2016-01-08 01:13:00,HPD,ENTIRE BUILDING,BRONX,7.048576,5,1,0,4,1,1
5,32305983,2016-01-01 00:03:03,2016-01-01 03:24:46,NYPD,Loud Music/Party,QUEENS,0.140081,3,1,0,4,0,0


In [48]:
# Question 1: transforming the data
def filter_nyc():
    df = pd.read_csv('nyc.csv',header=0)

    # Your work here
    df.drop("Unnamed: 0", axis=1, inplace=True)
    df["Created Date"] = pd.to_datetime(df["Created Date"])
    df['month'] = df['Created Date'].apply(lambda x:x.month)
    df['hour'] = df['Created Date'].apply(lambda x:x.hour)
    df['weekday'] = df['Created Date'].apply(lambda x:x.isoweekday())

    agency_num = {}
    for num, agency in enumerate((df['Agency'].unique())): 
         agency_num[agency] = num
    df['agency_num'] = df['Agency'].apply(lambda x: agency_num[x])
    
    df["Borough"] = df["Borough"].str.replace('\d+', '')
    df["Borough"] = df["Borough"].str.replace('Unspecified', '')    
    df["Borough"] = df["Borough"].str.strip()
    df["borough_num"] = df["Borough"].map({"MANHATTAN":1, "BROOKLYN":2, "QUEENS":3, "STATEN ISLAND":4, "BRONX":5})
    df["borough_num"].fillna(0, inplace= True)
    df['borough_num'] = df['borough_num'].astype(int)
    
    def transform_processing(time):
        if time > 1:
            return 1
        else:
            return 0
    
    df["processing_time_bucket"] = df["processing_time"].apply(transform_processing)
    
    
    
    df[['hour', 'month', 'weekday', 'agency_num', 'borough_num', 'processing_time_bucket']].to_csv('filtered.csv',index=False)

In [49]:
filter_nyc()

In [63]:
# Question 2: build the model and predict
def build_and_predict():
    data = pd.read_csv('filtered.csv')
    test = pd.read_csv('topredict.csv')
    y = data["processing_time_bucket"]
    X = data.drop("processing_time_bucket", axis=1)
    # Your work here
    reg = linear_model.LogisticRegression()
    reg.fit(X, y)
    predict = reg.predict(test)
    frame = pd.DataFrame(predict, columns=[["predictions"]])
    frame.to_csv('predictions.csv',index=True,index_label='index')

In [37]:
data = pd.read_csv('filtered.csv')
test = pd.read_csv('topredict.csv')

In [40]:
y = data["processing_time_bucket"]

In [42]:
X = data.drop("processing_time_bucket", axis=1)

In [50]:
reg = linear_model.LogisticRegression()

In [51]:
reg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
predict = reg.predict(test)

In [53]:
predict

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0])

In [58]:
frame = pd.DataFrame(predict, columns=[["predictions"]])

In [60]:
frame.index.name = "index"

In [61]:
frame.head()

Unnamed: 0_level_0,predictions
index,Unnamed: 1_level_1
0,1
1,0
2,0
3,1
4,1


In [64]:
build_and_predict()