# Data Cleaning and Exploratory Analysis

In [2]:
#import packages
import pandas as pd
import numpy as np

In [3]:
#read in data
landslide_df = pd.read_csv('Global_Landslide_Catalog_Export.csv')
#look at first 3 rows
landslide_df.head(3).T

Unnamed: 0,0,1,2
source_name,AGU,Oregonian,CBS News
source_link,https://blogs.agu.org/landslideblog/2008/10/14...,http://www.oregonlive.com/news/index.ssf/2009/...,https://www.cbsnews.com/news/dozens-missing-af...
event_id,684,956,973
event_date,08/01/2008 12:00:00 AM,01/02/2009 02:00:00 AM,01/19/2007 12:00:00 AM
event_time,,,
event_title,"Sigou Village, Loufan County, Shanxi Province","Lake Oswego, Oregon","San Ramon district, 195 miles northeast of the..."
event_description,"occurred early in morning, 11 villagers buried...",Hours of heavy rain are to blame for an overni...,(CBS/AP) At least 10 people died and as many a...
location_description,"Sigou Village, Loufan County, Shanxi Province","Lake Oswego, Oregon","San Ramon district, 195 miles northeast of the..."
location_accuracy,unknown,5km,10km
landslide_category,landslide,mudslide,landslide


In [4]:
# Remove columns not useful for modeling
landslide_df2 = landslide_df.drop(columns=['source_link', 'event_id', 'photo_link', 'submitted_date','created_date','last_edited_date'])
#look at first 3 rows
landslide_df2.head(3).T

Unnamed: 0,0,1,2
source_name,AGU,Oregonian,CBS News
event_date,08/01/2008 12:00:00 AM,01/02/2009 02:00:00 AM,01/19/2007 12:00:00 AM
event_time,,,
event_title,"Sigou Village, Loufan County, Shanxi Province","Lake Oswego, Oregon","San Ramon district, 195 miles northeast of the..."
event_description,"occurred early in morning, 11 villagers buried...",Hours of heavy rain are to blame for an overni...,(CBS/AP) At least 10 people died and as many a...
location_description,"Sigou Village, Loufan County, Shanxi Province","Lake Oswego, Oregon","San Ramon district, 195 miles northeast of the..."
location_accuracy,unknown,5km,10km
landslide_category,landslide,mudslide,landslide
landslide_trigger,rain,downpour,downpour
landslide_size,large,small,large


In [5]:
#look at missing data per column
landslide_df2.isna().sum()

source_name                      0
event_date                       0
event_time                   11033
event_title                      0
event_description              862
location_description           102
location_accuracy                2
landslide_category               1
landslide_trigger               23
landslide_size                   9
landslide_setting               69
fatality_count                1385
injury_count                  5674
storm_name                   10456
notes                        10716
event_import_source           1563
event_import_id               1562
country_name                  1562
country_code                  1564
admin_division_name           1637
admin_division_population     1562
gazeteer_closest_point        1563
gazeteer_distance             1562
longitude                        0
latitude                         0
dtype: int64

Since `event_time`has only missing data, we will drop this column.  
`storm_name` and `notes` have a lot of missing data so we will drop these columns as well.  
`injury_count` has almost 50% missing data but I am hesitant to drop this column since I think it might be a good predictor.

In [6]:
landslide_df3 = landslide_df2.drop(columns = ['event_time', 'storm_name', 'notes', 'event_import_source', 'event_import_id'])
#look at first 3 rows
landslide_df3.head(3).T

Unnamed: 0,0,1,2
source_name,AGU,Oregonian,CBS News
event_date,08/01/2008 12:00:00 AM,01/02/2009 02:00:00 AM,01/19/2007 12:00:00 AM
event_title,"Sigou Village, Loufan County, Shanxi Province","Lake Oswego, Oregon","San Ramon district, 195 miles northeast of the..."
event_description,"occurred early in morning, 11 villagers buried...",Hours of heavy rain are to blame for an overni...,(CBS/AP) At least 10 people died and as many a...
location_description,"Sigou Village, Loufan County, Shanxi Province","Lake Oswego, Oregon","San Ramon district, 195 miles northeast of the..."
location_accuracy,unknown,5km,10km
landslide_category,landslide,mudslide,landslide
landslide_trigger,rain,downpour,downpour
landslide_size,large,small,large
landslide_setting,mine,unknown,unknown


In [7]:
#look at missing data per column
landslide_df3.isna().sum()

source_name                     0
event_date                      0
event_title                     0
event_description             862
location_description          102
location_accuracy               2
landslide_category              1
landslide_trigger              23
landslide_size                  9
landslide_setting              69
fatality_count               1385
injury_count                 5674
country_name                 1562
country_code                 1564
admin_division_name          1637
admin_division_population    1562
gazeteer_closest_point       1563
gazeteer_distance            1562
longitude                       0
latitude                        0
dtype: int64

In [8]:
#view data frame
landslide_df3


Unnamed: 0,source_name,event_date,event_title,event_description,location_description,location_accuracy,landslide_category,landslide_trigger,landslide_size,landslide_setting,fatality_count,injury_count,country_name,country_code,admin_division_name,admin_division_population,gazeteer_closest_point,gazeteer_distance,longitude,latitude
0,AGU,08/01/2008 12:00:00 AM,"Sigou Village, Loufan County, Shanxi Province","occurred early in morning, 11 villagers buried...","Sigou Village, Loufan County, Shanxi Province",unknown,landslide,rain,large,mine,11.0,,China,CN,Shaanxi,0.0,Jingyang,41.02145,107.450000,32.562500
1,Oregonian,01/02/2009 02:00:00 AM,"Lake Oswego, Oregon",Hours of heavy rain are to blame for an overni...,"Lake Oswego, Oregon",5km,mudslide,downpour,small,unknown,0.0,,United States,US,Oregon,36619.0,Lake Oswego,0.60342,-122.663000,45.420000
2,CBS News,01/19/2007 12:00:00 AM,"San Ramon district, 195 miles northeast of the...",(CBS/AP) At least 10 people died and as many a...,"San Ramon district, 195 miles northeast of the...",10km,landslide,downpour,large,unknown,10.0,,Peru,PE,Junín,14708.0,San Ramón,0.85548,-75.358700,-11.129500
3,Reuters,07/31/2009 12:00:00 AM,Dailekh district,"One person was killed in Dailekh district, pol...",Dailekh district,unknown,landslide,monsoon,medium,unknown,1.0,,Nepal,NP,Mid Western,20908.0,Dailekh,0.75395,81.708000,28.837800
4,The Freeman,10/16/2010 12:00:00 PM,sitio Bakilid in barangay Lahug,Another landslide in sitio Bakilid in barangay...,sitio Bakilid in barangay Lahug,5km,landslide,tropical_cyclone,medium,unknown,0.0,,Philippines,PH,Central Visayas,798634.0,Cebu City,2.02204,123.897800,10.333600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11028,The Jakarta Post,04/01/2017 01:34:00 PM,Major landslide in Banaran,Landslide exacerbated by deforestation and bad...,"Banaran, Ponorogo, Jawa Timur, Indonesia",5km,landslide,rain,medium,natural_slope,27.0,0.0,,,,,,,111.679944,-7.853409
11029,Greater Kashmir,03/25/2017 05:32:00 PM,Barnari Sigdi Landslide,Two teenage girls died after they were buried ...,"Barnari Sigdi area, Tehsil Mughalmaidan, Kisht...",5km,landslide,other,small,natural_slope,2.0,0.0,,,,,,,75.680611,33.403080
11030,NBC Daily,12/15/2016 05:00:00 AM,Landslide at Pub Sarania Hill,An octogenarian was killed when a sudden lands...,"Pub Sarania Hill, Guwahati, Assam, India",1km,landslide,unknown,small,urban,1.0,0.0,,,,,,,91.772042,26.181606
11031,AGU Landslide Blog,04/29/2017 07:03:00 PM,Mayor landslide at Ayu village,Landslide triggered by heavy rainfall buried 1...,"Ayu, Ozgon, Osh, Kyrgyzstan",1km,translational_slide,downpour,large,natural_slope,24.0,,,,,,,,73.472379,40.886395


In [10]:
#impute some data
#we will replace missing event descriptions with the event titles and missing location descriptions with the country name, while this may cause some columns to be correlated, it makes the most sense for imputing the data as I don't wish to drop any data here
landslide_df3['event_description'] = landslide_df3['event_description'].fillna(landslide_df3['event_title'])
landslide_df3['location_description'] = landslide_df3['country_name'].fillna(landslide_df3['event_title'])

#view missing counts again:
landslide_df3.isna().sum()

source_name                     0
event_date                      0
event_title                     0
event_description               0
location_description            0
location_accuracy               2
landslide_category              1
landslide_trigger              23
landslide_size                  9
landslide_setting              69
fatality_count               1385
injury_count                 5674
country_name                 1562
country_code                 1564
admin_division_name          1637
admin_division_population    1562
gazeteer_closest_point       1563
gazeteer_distance            1562
longitude                       0
latitude                        0
dtype: int64

In [11]:
#change location accuracy to numeric by dropping 'km'
landslide_df4 = landslide_df3
landslide_df4['location_accuracy'] = landslide_df3['location_accuracy'].str.replace('km', '').replace('unknown', np.nan)
#force to numeric
landslide_df4['location_accuracy']= pd.to_numeric(landslide_df4['location_accuracy'], errors= 'coerce')
#get mean 
loc_acc_avg = landslide_df4['location_accuracy'].mean()
#replace missing values with mean
landslide_df4['location_accuracy'] = landslide_df4['location_accuracy'].fillna(loc_acc_avg)


In [36]:
#impute landslide cateory with mode
landslide_df5 = landslide_df4
landslide_df5['landslide_category'] = landslide_df5['landslide_category'].fillna(landslide_df4['landslide_category'].mode()[0])
#impute landslide trigger with mode
landslide_df5['landslide_trigger'] = landslide_df5['landslide_trigger'].fillna(landslide_df4['landslide_trigger'].mode()[0])
#impute landslide size with mode
landslide_df5['landslide_size'] = landslide_df5['landslide_size'].fillna(landslide_df4['landslide_size'].mode()[0])

In [43]:
landslide_df5['landslide_setting'] = landslide_df5['landslide_setting'].astype('category')
landslide_df5['landslide_setting'].cat.categories

Index(['complex', 'creep', 'debris_flow', 'earth_flow', 'lahar', 'landslide',
       'mudslide', 'other', 'riverbank_collapse', 'rock_fall',
       'snow_avalanche', 'topple', 'translational_slide', 'unknown'],
      dtype='object')

In [44]:
#replace missing setting with level unknown
landslide_df5['landslide_setting'] = landslide_df5['landslide_setting'].fillna('unknown')

In [52]:
# the remaining missing data seems to overlap a lot. While I could impute the country names/codes from the latitude and longitude I don't know enough about the data to imput the admin_division_names populations, and gazeeter attributes so I will drop all of these columns.
cols_drop = ['admin_division_name', 'admin_division_population', 'gazeteer_closest_point', 'gazeteer_distance']
landslide_df6 = landslide_df5.dropna(subset=cols_drop)

In [63]:
#view missing counts again
landslide_df6.isna().sum()

source_name                     0
event_date                      0
event_title                     0
event_description               0
location_description            0
location_accuracy               0
landslide_category              0
landslide_trigger               0
landslide_size                  0
landslide_setting               0
fatality_count               1323
injury_count                 5524
country_name                    0
country_code                    0
admin_division_name             0
admin_division_population       0
gazeteer_closest_point          0
gazeteer_distance               0
longitude                       0
latitude                        0
dtype: int64

In [57]:
#impute the 2 missing country codes from their country names
landslide_df6[landslide_df6['country_code'].isna()]
landslide_df6[landslide_df6['country_name'] == 'Namibia']
#since there are only 2 namibia entries with no country code I will replace the na with NAM
landslide_df6['country_code'] = landslide_df6['country_code'].fillna('NAM')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  landslide_df6['country_code'] = landslide_df6['country_code'].fillna('NAM')


In [64]:
#separate out the data frame with unknow fatality and unknown injury

fatality_unknown = landslide_df6[landslide_df6['fatality_count'].isna()]
fatality_unknown = fatality_unknown.drop(columns=['fatality_count', 'injury_count'])

injury_unknown = landslide_df6[landslide_df6['injury_count'].isna()]
injury_unknown = injury_unknown.drop(columns=['fatality_count', 'injury_count'])

In [None]:
# get dataframe for predicting with no missing values for injury or fatality

cols_drop = ['fatality_count', 'injury_count']

landslide_clean = landslide_df6.dropna(subset=cols_drop)



In [68]:
landslide_clean.isna().sum()

source_name                  0
event_date                   0
event_title                  0
event_description            0
location_description         0
location_accuracy            0
landslide_category           0
landslide_trigger            0
landslide_size               0
landslide_setting            0
fatality_count               0
injury_count                 0
country_name                 0
country_code                 0
admin_division_name          0
admin_division_population    0
gazeteer_closest_point       0
gazeteer_distance            0
longitude                    0
latitude                     0
dtype: int64

Next I need to deal with some of the columns that aren't categorical but are more of decriptions to see if I  can sort them into categories. For example, if a bunch of `event_descriptions` mention deaths, I and put them all in a category called deaths or something like that. This may be feasible, It may not be. we shall see...

In [74]:
#import packages for tokenizing and extracting words from descriptions
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize

In [84]:
landslide_clean

Unnamed: 0,source_name,event_date,event_title,event_description,location_description,location_accuracy,landslide_category,landslide_trigger,landslide_size,landslide_setting,fatality_count,injury_count,country_name,country_code,admin_division_name,admin_division_population,gazeteer_closest_point,gazeteer_distance,longitude,latitude
11,Oregon DOT,12/21/2014 08:10:00 AM,"US 30, milepost 29","US 30, milepost 29",United States,12.674393,landslide,unknown,unknown,landslide,0.0,0.0,United States,US,Oregon,12883.0,Saint Helens,0.87786,-122.817800,45.864700
14,ABS-CBN News,01/02/2009 08:30:00 PM,"Gadgaron village of Matnog town, Sorsogon",A landslide hit the village in barangay Gadgar...,Philippines,5.000000,landslide,downpour,medium,landslide,0.0,1.0,Philippines,PH,Bicol,6721.0,Matnog,5.53530,124.041900,12.565500
56,CNN,01/01/2010 12:00:00 AM,Mudslide crushed the Sankay Inn,Lodge and nearby houses were hit by a 300 m wi...,Brazil,25.000000,mudslide,downpour,large,mudslide,22.0,28.0,Brazil,BR,Rio de Janeiro,153635.0,Angra dos Reis,2.58880,-44.322498,-23.013744
57,Oregon DOT,02/06/2015 01:18:00 PM,"OR 42, milepost 42.2","OR 42, milepost 42.2",United States,12.674393,landslide,unknown,unknown,landslide,0.0,0.0,United States,US,Oregon,3742.0,Lafayette,3.64565,-123.158700,45.233500
121,Austrian Times,10/09/2013 12:00:00 AM,Kitzbühel,A German worker has been killed in a work acci...,Austria,10.000000,landslide,rain,medium,landslide,1.0,1.0,Austria,AT,Tyrol,8818.0,Kitzbühel,0.33828,12.396600,47.446000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9868,Huffington Post,12/02/2014 12:00:00 AM,Camarillo Springs,"On Tuesday, gushing water and muddy debris pou...",United States,5.000000,debris_flow,rain,medium,debris_flow,0.0,0.0,United States,US,California,3249.0,Casa Conejo,4.09154,-118.985800,34.194600
9869,Global News,10/24/2016 11:00:00 PM,"Golden : Two contractors working near Field, B...","Two contractors working near Field, B.C. were ...",Canada,10.000000,rock_fall,construction,medium,rock_fall,0.0,2.0,Canada,CA,British Columbia,4038.0,Golden,39.42196,-116.436200,51.419700
9871,Astro Awani,11/16/2015 09:00:00 AM,La Vie En Rose Restaurant,A landslip which occurred along a hilltop area...,Malaysia,12.674393,landslide,downpour,medium,landslide,0.0,0.0,Malaysia,MY,Kuala Lumpur,1453975.0,Kuala Lumpur,2.11669,101.703600,3.149700
9873,Dawn,01/20/2015 12:00:00 AM,the chromite mine of Khanozai,“A group of miners working in the chromite min...,Pakistan,5.000000,landslide,downpour,medium,landslide,2.0,3.0,Pakistan,PK,Balochistān,7630.0,Alik Ghund,20.05530,67.379900,30.621800


In [85]:
landslide_clean.event_description[11]

'US 30, milepost 29'

In [87]:
sentences = sent_tokenize(landslide_clean.event_description[11])
word = [word_tokenize(sentence) for sentence in sentences]

In [92]:
!pip install spacy

Collecting spacy
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[441 lines of output][0m
  [31m   [0m Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
  [31m   [0m Collecting setuptools
  [31m   [0m   Using cached setuptools-75.6.0-py3-none-any.whl.metadata (6.7 kB)
  [31m   [0m Collecting cython<3.0,>=0.25
  [31m   [0m   Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
  [31m   [0m Collecting cymem<2.1.0,>=2.0.2
  [31m   [0m   Using cached cymem-2.0.10-cp313-cp313-macosx_11_0_arm64.whl.metadata (8.4 kB)
  [31m   [0m Collecting preshed<3.1.0,>=3.0.2
  [31m   [0m   Using cached preshed-3.0.9-cp313-cp313-macosx_10_13_universal2.whl
  [31m   [0m Collecting murmurhash<1.1.0

In [90]:
import spacy

ModuleNotFoundError: No module named 'spacy'