In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [3]:
crime_data = pd.read_csv("/Users/HOME/Desktop/LosAngelesCrime/data/datawrangling.csv",index_col = 0)
crime_data.head()

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,crm_cd,crm_cd_desc,mocodes,...,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,location,lat,lon,day_crime_occ,no_rptd_occ
0,1307355,2010-02-20,2010-02-20,13:50,13,Newton,1385,900,VIOLATION OF COURT ORDER,0913 1814 2000,...,SINGLE FAMILY DWELLING,500.0,UNKNOWN WEAPON/OTHER WEAPON,AA,Adult Arrest,300 E GAGE AV,33.9825,-118.2695,Saturday,0 days 00:00:00.000000000
1,11401303,2010-09-13,2010-09-12,00:45,14,Pacific,1485,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329,...,STREET,500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,SEPULVEDA BL,33.9599,-118.3962,Sunday,1 days 00:00:00.000000000
2,70309629,2010-08-09,2010-08-09,15:15,13,Newton,1324,946,OTHER MISCELLANEOUS CRIME,0344,...,ALLEY,500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,1300 E 21ST ST,34.0224,-118.2524,Monday,0 days 00:00:00.000000000
3,90631215,2010-01-05,2010-01-05,01:50,6,Hollywood,646,900,VIOLATION OF COURT ORDER,1100 0400 1402,...,STREET,102.0,HAND GUN,IC,Invest Cont,CAHUENGA BL,34.1016,-118.3295,Tuesday,0 days 00:00:00.000000000
4,100100501,2010-01-03,2010-01-02,21:00,1,Central,176,122,"RAPE, ATTEMPTED",0400,...,ALLEY,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,8TH ST,34.0387,-118.2488,Saturday,1 days 00:00:00.000000000


In [4]:
df = crime_data.copy()

In [5]:
df['Hour'] = pd.to_numeric(df['time_occ'].apply(lambda x : x[:2]))

In [6]:
df['Minutes'] = pd.to_numeric(df['time_occ'].apply(lambda x : x[-2:]))

In [7]:
df['no_rptd_occ'] = pd.to_numeric(df['no_rptd_occ'].apply(lambda x: x.split(" ")[0]))

In [8]:
df['day_crime_occ'] = pd.to_datetime(crime_data['date_occ']).dt.weekday

In [9]:
df['month_crime_occ']= pd.to_datetime(crime_data['date_occ']).dt.month

In [10]:
df['year_crime_occ'] = pd.to_datetime(crime_data['date_occ']).dt.year

In [11]:
df.select_dtypes(include=['object']).columns

Index(['date_rptd', 'date_occ', 'time_occ', 'area_name', 'crm_cd_desc',
       'mocodes', 'vict_sex', 'vict_descent', 'premis_desc', 'weapon_desc',
       'status', 'status_desc', 'location'],
      dtype='object')

In [12]:
len(df.location.unique())

75303

In [13]:
df.drop(['date_rptd','date_occ','time_occ','mocodes','rpt_dist_no'],axis =1,inplace = True)

In [51]:
df.to_csv('/Users/HOME/Desktop/LosAngelesCrime/data/step3_output.csv')

In [15]:
df.columns

Index(['dr_no', 'area', 'area_name', 'crm_cd', 'crm_cd_desc', 'vict_age',
       'vict_sex', 'vict_descent', 'premis_cd', 'premis_desc',
       'weapon_used_cd', 'weapon_desc', 'status', 'status_desc', 'location',
       'lat', 'lon', 'day_crime_occ', 'no_rptd_occ', 'Hour', 'Minutes',
       'month_crime_occ', 'year_crime_occ'],
      dtype='object')

In [16]:
df.drop(['crm_cd','premis_cd','weapon_used_cd','status'],axis=1,inplace = True)

In [17]:
df.select_dtypes(include=['object']).columns

Index(['area_name', 'crm_cd_desc', 'vict_sex', 'vict_descent', 'premis_desc',
       'weapon_desc', 'status_desc', 'location'],
      dtype='object')

#### print the categorical columns and their associated levels

In [16]:
dfo = df.select_dtypes(include=['object'])
dfo.shape
#get levels for all variables
vn = pd.DataFrame(dfo.nunique()).reset_index()
vn.columns = ['VarName', 'LevelsCount']
vn.sort_values(by='LevelsCount', ascending =False)
vn

Unnamed: 0,VarName,LevelsCount
0,area_name,21
1,crm_cd_desc,142
2,vict_sex,3
3,vict_descent,19
4,premis_desc,319
5,weapon_desc,79
6,status,6
7,status_desc,6
8,location,75303


### Stratified Sampling

In [17]:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
    
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)

    # controlling variable to create the dataframe or append to it
    first = True 
    for i in range(len(tmp_grpd)):
        # query generator for each iteration
        qry=''
        for s in range(len(strata)):
            stratum = strata[s]
            value = tmp_grpd.iloc[i][stratum]
            n = tmp_grpd.iloc[i]['samp_size']

            if type(value) == str:
                value = "'" + str(value) + "'"
            
            if s != len(strata)-1:
                qry = qry + stratum + ' == ' + str(value) +' & '
            else:
                qry = qry + stratum + ' == ' + str(value)
        
        # final dataframe
        if first:
            stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            first = False
        else:
            tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            stratified_df = stratified_df.append(tmp_df, ignore_index=True)
    
    return stratified_df



def stratified_sample_report(df, strata, size=None):
    
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
    return tmp_grpd


def __smpl_size(population, size):
    
    if size is None:
        cochran_n = round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
        n = round(cochran_n/(1+((cochran_n -1) /population)))
    elif size >= 0 and size < 1:
        n = round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n

#### Subsampling(20000 observations) the dataset because of its large size(2000000) using stratified sampling 

In [20]:
sample_df = stratified_sample(df, ['crm_cd_desc','area_name'], size=20000, seed=123, keep_index= True)
sample_df.head()

Unnamed: 0,index,dr_no,area,area_name,crm_cd,crm_cd_desc,vict_age,vict_sex,vict_descent,premis_cd,...,status_desc,location,lat,lon,day_crime_occ,no_rptd_occ,Hour,Minutes,month_crime_occ,year_crime_occ
0,1110886,151200099,12,77th Street,648,ARSON,48,M,Black,501.0,...,Invest Cont,900 W 69TH ST,33.9773,-118.2892,4,1,1,17,2,2015
1,349876,111212546,12,77th Street,648,ARSON,35,M,Hispanic/Latin/Mexican,502.0,...,Invest Cont,7900 S HOOVER ST,33.9678,-118.2871,6,0,6,0,5,2011
2,104180,101217118,12,77th Street,648,ARSON,37,F,Hispanic/Latin/Mexican,501.0,...,Invest Cont,800 E 82ND ST,33.9647,-118.2608,1,3,23,0,6,2010
3,2052286,190109104,1,Central,648,ARSON,0,M,Other,502.0,...,Invest Cont,HOPE,34.0423,-118.2631,4,0,6,30,3,2019
4,1594714,170105069,1,Central,648,ARSON,16,M,Other,753.0,...,Invest Cont,00100 N LOS ANGELES ST,34.0515,-118.2424,3,0,9,30,1,2017


In [21]:
sample_dfo = sample_df.select_dtypes(include=['object'])
sample_dfo.shape
#get levels for all variables
vn = pd.DataFrame(sample_dfo.nunique()).reset_index()
vn.columns = ['VarName', 'LevelsCount']
vn.sort_values(by='LevelsCount', ascending =False)
vn

Unnamed: 0,VarName,LevelsCount
0,area_name,21
1,crm_cd_desc,93
2,vict_sex,3
3,vict_descent,15
4,premis_desc,200
5,weapon_desc,63
6,status,6
7,status_desc,6
8,location,12522


In [22]:
from collections import Counter 
# summarize the class distribution
target = sample_df['crm_cd_desc'].values
counter = Counter(target)
for k,v in counter.items():
    per = v / len(target) * 100
    print('Class=%s, Count=%d, Percentage  =  %.5f%%' % (k, v, per))

Class=ARSON, Count=33, Percentage  =  0.16568%
Class=ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER, Count=13, Percentage  =  0.06527%
Class=ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT, Count=878, Percentage  =  4.40807%
Class=ATTEMPTED ROBBERY, Count=113, Percentage  =  0.56733%
Class=BATTERY - SIMPLE ASSAULT, Count=1802, Percentage  =  9.04709%
Class=BATTERY POLICE (SIMPLE), Count=46, Percentage  =  0.23095%
Class=BATTERY WITH SEXUAL CONTACT, Count=108, Percentage  =  0.54222%
Class=BIKE - STOLEN, Count=137, Percentage  =  0.68782%
Class=BOMB SCARE, Count=9, Percentage  =  0.04519%
Class=BRANDISH WEAPON, Count=144, Percentage  =  0.72296%
Class=BUNCO, ATTEMPT, Count=3, Percentage  =  0.01506%
Class=BUNCO, GRAND THEFT, Count=85, Percentage  =  0.42675%
Class=BUNCO, PETTY THEFT, Count=51, Percentage  =  0.25605%
Class=BURGLARY, Count=1396, Percentage  =  7.00874%
Class=BURGLARY FROM VEHICLE, Count=1535, Percentage  =  7.70660%
Class=BURGLARY FROM VEHICLE, ATTEMPTED, Count=24, Percenta

### Handling Rare categories

##### Categories that are occures less than 6 times of the total observations are moved to OTHER CRIME category

In [30]:
value_counts = sample_df['crm_cd_desc'].value_counts() # Specific column 
to_remove = value_counts[value_counts <= 6].index
sample_df['crm_cd_desc'].replace(to_remove, 'OTHER MISCELLANEOUS CRIME', inplace=True)

In [31]:
sample_dfo = sample_df.select_dtypes(include=['object'])
sample_dfo.shape
#get levels for all variables
vn = pd.DataFrame(sample_dfo.nunique()).reset_index()
vn.columns = ['VarName', 'LevelsCount']
vn.sort_values(by='LevelsCount', ascending =False)
vn

Unnamed: 0,VarName,LevelsCount
0,area_name,21
1,crm_cd_desc,76
2,vict_sex,3
3,vict_descent,15
4,premis_desc,200
5,weapon_desc,63
6,status,6
7,status_desc,6
8,location,12522


In [33]:
len(sample_df[sample_df.crm_cd_desc == 'OTHER MISCELLANEOUS CRIME'])

232

## Create dummy features for categorical variables

### Binary encoding

#### Encoding the categorical variables using category_encoders

In [44]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['area_name', 'vict_sex', 'vict_descent', 'premis_desc',
                                 'weapon_desc', 'status_desc', 'location'])
sample_df_binary = encoder.fit_transform(sample_df)

sample_df_binary.head()

Unnamed: 0,index,dr_no,area,area_name_0,area_name_1,area_name_2,area_name_3,area_name_4,area_name_5,crm_cd,...,location_13,location_14,lat,lon,day_crime_occ,no_rptd_occ,Hour,Minutes,month_crime_occ,year_crime_occ
0,1110886,151200099,12,0,0,0,0,0,1,648,...,0,1,33.9773,-118.2892,4,1,1,17,2,2015
1,349876,111212546,12,0,0,0,0,0,1,648,...,1,0,33.9678,-118.2871,6,0,6,0,5,2011
2,104180,101217118,12,0,0,0,0,0,1,648,...,1,1,33.9647,-118.2608,1,3,23,0,6,2010
3,2052286,190109104,1,0,0,0,0,1,0,648,...,0,0,34.0423,-118.2631,4,0,6,30,3,2019
4,1594714,170105069,1,0,0,0,0,1,0,648,...,0,1,34.0515,-118.2424,3,0,9,30,1,2017


In [45]:
#deleting the unnecessary features
sample_df_binary.drop(['index','dr_no','area','status'],axis=1,inplace = True)

#### Idendifying correlated variables using pearson correlation

In [46]:
# Step 1. Call the variable corr_matrix
corr_matrix= sample_df_binary.corr().abs()

# Step 2. Uncomment the following code to get the upper triangle of the correlation matrix 
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Step 3. This code iterates through our columns and gets the index of any that have a correlation > 0.95
# Call the variable to_drop, get the columns of our 'upper' variable, make sure the threshold is 0.95.
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Let's see those features! 
print('Features selected to drop include:',to_drop)
print('Reduced dataframe size: ',sample_df_binary.drop(sample_df_binary[to_drop], axis=1).shape)

Features selected to drop include: ['lon']
Reduced dataframe size:  (19918, 61)


#### Split the data into test and train 

In [47]:
sample_df_binary.select_dtypes(include=['object']).columns

Index(['crm_cd_desc'], dtype='object')

In [48]:
# Split the data 
from sklearn.model_selection import train_test_split
X = sample_df_binary.drop(['crm_cd_desc'],axis= 1)
y = sample_df_binary.crm_cd_desc

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123,stratify = y)

#### Scale the data to prepare for model creation

In [50]:
#Scale the data
from sklearn.preprocessing import StandardScaler
import numpy as np
# build scaler based on training data and apply it to test data to then also scale the test data
scaler = StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [53]:
sample_df_binary.to_csv("/Users/HOME/Desktop/LosAngelesCrime/data/preprocess_output.csv")