In [78]:
import pandas as pd
import numpy as np

Load csv into dataframe

In [79]:
landslide = pd.read_csv('landslide_database.csv')

Coverage column defined by numbers, convert to coverage names:

In [80]:
cover = {11:'Open Water',12:'Perennial Ice/Snow',21:'Developed, Open',22:'Developed, Low',23:'Developed, Medium',24:'Developed, High',31:'Barren Land',41:'Deciduous Forest',42:'Evergreen Forest',43:'Mixed Forest',52:'Shrub',71:'Grassland',72:'Sedge',73:'Lichens',74:'Moss',51:'Dwarf Scrub',81:'Pasture/Hay',82:'Cultivated Crops',90:'Woody Wetlands',95:'Herbaceous Wetlands'}
landslide = landslide.replace({'COVERAGE': cover})

Precipitation is a string since some entries are 'T' for trace, convert T to 0:

In [81]:
landslide['precipi'][landslide['precipi'] == 'T'] = 0
landslide['precipi'] = landslide['precipi'].convert_objects(convert_numeric=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
  from ipykernel import kernelapp as app


Select features to include in final analysis

In [82]:
features = landslide[['SLIDE','AGE_NAME','GEO_GENL_U','COVERAGE','Slope','precipi','Precip_Mon','Soil_Moist','Soil_Moist_Depth','Vegetation','Soil_Temp','maxwspdi','maxtempi','mintempi','maxhumidity','maxpressurei']]


Get Dummy Variables for categorical columns

In [83]:
add = pd.get_dummies(landslide[['AGE_NAME','GEO_GENL_U','COVERAGE']])

Add dummy variables to dataframe

In [84]:
lands = [features,add]
landslides = pd.concat(lands,axis=1)

In [85]:
list(landslides)

['SLIDE',
 'AGE_NAME',
 'GEO_GENL_U',
 'COVERAGE',
 'Slope',
 'precipi',
 'Precip_Mon',
 'Soil_Moist',
 'Soil_Moist_Depth',
 'Vegetation',
 'Soil_Temp',
 'maxwspdi',
 'maxtempi',
 'mintempi',
 'maxhumidity',
 'maxpressurei',
 'AGE_NAME_Cretaceous',
 'AGE_NAME_Eocene',
 'AGE_NAME_Eocene/Miocene',
 'AGE_NAME_Eocene/Oligocene',
 'AGE_NAME_Jurassic',
 'AGE_NAME_Jurassic/Cretaceous',
 'AGE_NAME_Miocene',
 'AGE_NAME_Miocene/Pleistocene',
 'AGE_NAME_Miocene/Pliocene',
 'AGE_NAME_Oligocene',
 'AGE_NAME_Oligocene/Miocene',
 'AGE_NAME_Paleocene/Eocene',
 'AGE_NAME_Pleistocene',
 'AGE_NAME_Pliocene',
 'AGE_NAME_Pliocene/Pleistocene',
 'AGE_NAME_Quaternary',
 'AGE_NAME_Recent',
 'AGE_NAME_no data',
 'GEO_GENL_U_intrusive rocks',
 'GEO_GENL_U_invasive extrusive rocks',
 'GEO_GENL_U_marine sedimentary rocks',
 'GEO_GENL_U_marine volcanic rocks',
 'GEO_GENL_U_melange rocks',
 'GEO_GENL_U_metamorphic rocks',
 'GEO_GENL_U_no data',
 'GEO_GENL_U_sediments',
 'GEO_GENL_U_terrestrial sedimentary rocks',
 

Split dataframe in training and testing data

In [86]:
msk = np.random.rand(len(landslides)) < 0.7
train = landslides[msk]
test = landslides[~msk]

Check length of training and testing data

In [87]:
len(train)

1690

In [88]:
len(test)

741

Make sure it's an even aplit between slide and no slide

In [89]:
len(test[test['SLIDE']==1])

379

Define X and y for models

In [90]:
X_train = train[['Slope',
 'precipi',
 'Precip_Mon',
 'Soil_Moist',
 'Soil_Moist_Depth',
 'Vegetation',
 'Soil_Temp',
 'maxwspdi',
 'maxtempi',
 'mintempi',
 'maxhumidity',
 'maxpressurei',
 'AGE_NAME_Cretaceous',
 'AGE_NAME_Eocene',
 'AGE_NAME_Eocene/Miocene',
 'AGE_NAME_Eocene/Oligocene',
 'AGE_NAME_Jurassic',
 'AGE_NAME_Jurassic/Cretaceous',
 'AGE_NAME_Miocene',
 'AGE_NAME_Miocene/Pleistocene',
 'AGE_NAME_Miocene/Pliocene',
 'AGE_NAME_Oligocene',
 'AGE_NAME_Oligocene/Miocene',
 'AGE_NAME_Paleocene/Eocene',
 'AGE_NAME_Pleistocene',
 'AGE_NAME_Pliocene',
 'AGE_NAME_Pliocene/Pleistocene',
 'AGE_NAME_Quaternary',
 'AGE_NAME_Recent',
 'AGE_NAME_no data',
 'GEO_GENL_U_intrusive rocks',
 'GEO_GENL_U_invasive extrusive rocks',
 'GEO_GENL_U_marine sedimentary rocks',
 'GEO_GENL_U_marine volcanic rocks',
 'GEO_GENL_U_melange rocks',
 'GEO_GENL_U_metamorphic rocks',
 'GEO_GENL_U_no data',
 'GEO_GENL_U_sediments',
 'GEO_GENL_U_terrestrial sedimentary rocks',
 'GEO_GENL_U_vent and pyroclastic rocks',
 'GEO_GENL_U_volcanic rocks',
 'GEO_GENL_U_volcaniclastic rocks',
 'COVERAGE_Barren Land',
 'COVERAGE_Cultivated Crops',
 'COVERAGE_Deciduous Forest',
 'COVERAGE_Developed, High',
 'COVERAGE_Developed, Low',
 'COVERAGE_Developed, Medium',
 'COVERAGE_Developed, Open',
 'COVERAGE_Evergreen Forest',
 'COVERAGE_Grassland',
 'COVERAGE_Herbaceous Wetlands',
 'COVERAGE_Mixed Forest',
 'COVERAGE_Open Water',
 'COVERAGE_Pasture/Hay',
 'COVERAGE_Shrub',
 'COVERAGE_Woody Wetlands']]
X_test = test[['Slope',
 'precipi',
 'Precip_Mon',
 'Soil_Moist',
 'Soil_Moist_Depth',
 'Vegetation',
 'Soil_Temp',
 'maxwspdi',
 'maxtempi',
 'mintempi',
 'maxhumidity',
 'maxpressurei',
 'AGE_NAME_Cretaceous',
 'AGE_NAME_Eocene',
 'AGE_NAME_Eocene/Miocene',
 'AGE_NAME_Eocene/Oligocene',
 'AGE_NAME_Jurassic',
 'AGE_NAME_Jurassic/Cretaceous',
 'AGE_NAME_Miocene',
 'AGE_NAME_Miocene/Pleistocene',
 'AGE_NAME_Miocene/Pliocene',
 'AGE_NAME_Oligocene',
 'AGE_NAME_Oligocene/Miocene',
 'AGE_NAME_Paleocene/Eocene',
 'AGE_NAME_Pleistocene',
 'AGE_NAME_Pliocene',
 'AGE_NAME_Pliocene/Pleistocene',
 'AGE_NAME_Quaternary',
 'AGE_NAME_Recent',
 'AGE_NAME_no data',
 'GEO_GENL_U_intrusive rocks',
 'GEO_GENL_U_invasive extrusive rocks',
 'GEO_GENL_U_marine sedimentary rocks',
 'GEO_GENL_U_marine volcanic rocks',
 'GEO_GENL_U_melange rocks',
 'GEO_GENL_U_metamorphic rocks',
 'GEO_GENL_U_no data',
 'GEO_GENL_U_sediments',
 'GEO_GENL_U_terrestrial sedimentary rocks',
 'GEO_GENL_U_vent and pyroclastic rocks',
 'GEO_GENL_U_volcanic rocks',
 'GEO_GENL_U_volcaniclastic rocks',
 'COVERAGE_Barren Land',
 'COVERAGE_Cultivated Crops',
 'COVERAGE_Deciduous Forest',
 'COVERAGE_Developed, High',
 'COVERAGE_Developed, Low',
 'COVERAGE_Developed, Medium',
 'COVERAGE_Developed, Open',
 'COVERAGE_Evergreen Forest',
 'COVERAGE_Grassland',
 'COVERAGE_Herbaceous Wetlands',
 'COVERAGE_Mixed Forest',
 'COVERAGE_Open Water',
 'COVERAGE_Pasture/Hay',
 'COVERAGE_Shrub',
 'COVERAGE_Woody Wetlands']]

In [91]:
y_train = train[['SLIDE']]
y_test = test[['SLIDE']]

Save to csv

In [92]:
X_train.to_csv('X.csv',index=False)
X_test.to_csv('X_test.csv',index=False)
y_train.to_csv('y.csv',index=False)
y_test.to_csv('y_test.csv',index=False)

In [93]:
X_whole = landslides[['Slope',
 'precipi',
 'Precip_Mon',
 'Soil_Moist',
 'Soil_Moist_Depth',
 'Vegetation',
 'Soil_Temp',
 'maxwspdi',
 'maxtempi',
 'mintempi',
 'maxhumidity',
 'maxpressurei',
 'AGE_NAME_Cretaceous',
 'AGE_NAME_Eocene',
 'AGE_NAME_Eocene/Miocene',
 'AGE_NAME_Eocene/Oligocene',
 'AGE_NAME_Jurassic',
 'AGE_NAME_Jurassic/Cretaceous',
 'AGE_NAME_Miocene',
 'AGE_NAME_Miocene/Pleistocene',
 'AGE_NAME_Miocene/Pliocene',
 'AGE_NAME_Oligocene',
 'AGE_NAME_Oligocene/Miocene',
 'AGE_NAME_Paleocene/Eocene',
 'AGE_NAME_Pleistocene',
 'AGE_NAME_Pliocene',
 'AGE_NAME_Pliocene/Pleistocene',
 'AGE_NAME_Quaternary',
 'AGE_NAME_Recent',
 'AGE_NAME_no data',
 'GEO_GENL_U_intrusive rocks',
 'GEO_GENL_U_invasive extrusive rocks',
 'GEO_GENL_U_marine sedimentary rocks',
 'GEO_GENL_U_marine volcanic rocks',
 'GEO_GENL_U_melange rocks',
 'GEO_GENL_U_metamorphic rocks',
 'GEO_GENL_U_no data',
 'GEO_GENL_U_sediments',
 'GEO_GENL_U_terrestrial sedimentary rocks',
 'GEO_GENL_U_vent and pyroclastic rocks',
 'GEO_GENL_U_volcanic rocks',
 'GEO_GENL_U_volcaniclastic rocks',
 'COVERAGE_Barren Land',
 'COVERAGE_Cultivated Crops',
 'COVERAGE_Deciduous Forest',
 'COVERAGE_Developed, High',
 'COVERAGE_Developed, Low',
 'COVERAGE_Developed, Medium',
 'COVERAGE_Developed, Open',
 'COVERAGE_Evergreen Forest',
 'COVERAGE_Grassland',
 'COVERAGE_Herbaceous Wetlands',
 'COVERAGE_Mixed Forest',
 'COVERAGE_Open Water',
 'COVERAGE_Pasture/Hay',
 'COVERAGE_Shrub',
 'COVERAGE_Woody Wetlands']]
y_whole = landslides[['SLIDE']]

In [94]:
X_whole.to_csv("X_whole.csv",index=False)
y_whole.to_csv("y_whole.csv",index=False)