In [12]:
from preprocessing import data_collection_and_merging, data_preprocessing
from scaling_and_sampling import scaling_train_test_split, sampling_and_baseline_model
from model_selection import models_and_hyperparameter_tuning

In [2]:
# Data Collection and Merging
# data_collection_and_merging is used directly here (never instantiated); alias it for brevity.
data_collection = data_collection_and_merging

# Tornado records. NOTE(review): this value has no file extension, so it is
# presumably a filename prefix that get_tornado_data expands -- confirm.
storm_events_file = 'StormEvents_details-ftp_v1.0_d'
tornado_data = data_collection.get_tornado_data(storm_events_file)

# Income input.
income_data_file = 'Median_Household_Income.csv'
income_data = data_collection.get_income_data(income_data_file)

# Population input.
population_data_file = 'Population_Density.csv'
population_data = data_collection.get_population_data(population_data_file)

# Combine the three loaded sources into the single dataset handed to preprocessing.
merged_data = data_collection.merge_data(tornado_data, income_data, population_data)

In [3]:
# Data Preprocessing
# Wrap the merged dataset in a data_preprocessing object. Every call below discards
# its return value, and later cells read the result from data_processing.data, so the
# steps presumably transform that attribute in place -- confirm in the preprocessing module.
data_processing = data_preprocessing(merged_data)
# NOTE(review): the steps run in this fixed order; later steps may depend on columns
# produced or renamed by earlier ones, so do not reorder without checking the module.
data_processing.day_of_year_week_weekend()
data_processing.drop_and_rename_columns()
data_processing.create_duration()
data_processing.create_casualties_column()
data_processing.calc_tornado_area()
data_processing.calc_min_and_avg_range()
data_processing.calc_avg_lat_and_long()
data_processing.calc_percentage_land()
data_processing.extract_multi_vortex_ref()
data_processing.fillna()
data_processing.sin_and_cosine_time()
data_processing.binary_tornado_intensity_estimate()

Casualties are 0 up to the 90th percentile and only reach 1 at the 93rd percentile, so the distribution is dominated by zeros. We'll therefore binarize casualties and predict whether or not a tornado causes any casualties.

In [4]:
# Summary statistics for the casualties column. The 90th/93rd/95th percentiles are
# requested in addition to the quartiles to see where the counts first become non-zero.
data_processing.data['casualties'].describe(
    percentiles=[0.25, 0.5, 0.75, 0.9, 0.93, 0.95]
)

count    14557.000000
mean         0.941952
std         16.158968
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
90%          0.000000
93%          1.000000
95%          2.000000
max       1311.000000
Name: casualties, dtype: float64

In [5]:
# Apply the casualties binarization step; its return value is discarded. The later
# cells read a 'binary_casualties' column, presumably created here -- confirm.
data_processing.binarize_casualties()
# Keep a handle on the preprocessed dataset for the split/scaling cells. This is a
# plain attribute read; no explicit copy is made in this cell.
processed_data = data_processing.data

In [14]:
# Alias the imported scaling_train_test_split; its methods are called directly on it
# (it is not instantiated in this notebook).
scaling_and_train_test_split = scaling_train_test_split
# Split the processed data into three parts, unpacked as train / validation / test.
# NOTE(review): no seed or stratification argument is passed here -- confirm that
# train_and_test_split fixes a random state internally so the split is reproducible.
train, val, test = scaling_and_train_test_split.train_and_test_split(processed_data)

Check the percentage of class 1 samples in the training, validation and test sets. Since the percentages are fairly similar (roughly 5–8%), we don't have to stratify the data further and can go ahead and scale the data.

In [15]:
# Fraction of training rows whose binary_casualties value equals 1.
int(train['binary_casualties'].eq(1).sum()) / len(train)

0.08321167883211679

In [16]:
# Fraction of validation rows whose binary_casualties value equals 1.
int(val['binary_casualties'].eq(1).sum()) / len(val)

0.053595968850206135

In [17]:
# Fraction of test rows whose binary_casualties value equals 1.
int(test['binary_casualties'].eq(1).sum()) / len(test)

0.07402618393405527

In [18]:
# Scale the three splits in one call. The result is unpacked as a
# (features, labels) pair for each of train, validation and test, in that order.
(
    scaled_trainX, trainY,
    scaled_valX, valY,
    scaled_testX, testY,
) = scaling_and_train_test_split.scaling(train, val, test)

In [19]:
# Alias the imported sampling_and_baseline_model; the resampling baselines in the
# following cells are called directly on it (it is not instantiated here).
sampling = sampling_and_baseline_model

In [20]:
# Random-oversampling baseline, given the scaled training features/labels and the
# scaled validation features/labels. The recorded output is one score followed by a
# 2x2 matrix (presumably a validation metric and confusion matrix -- confirm in
# scaling_and_sampling).
sampling.random_oversampling(scaled_trainX, trainY, scaled_valX, valY)

0.7658115521135851
[[1787  279]
 [  39   78]]


In [21]:
# SMOTE-oversampling baseline on the same train/validation inputs as the other
# sampling cells; the recorded output has the same score-plus-2x2-matrix shape.
sampling.SMOTE_oversampling(scaled_trainX, trainY, scaled_valX, valY)

0.7571052696899744
[[1804  262]
 [  42   75]]


In [22]:
# ADASYN-oversampling baseline on the same train/validation inputs as the other
# sampling cells; the recorded output has the same score-plus-2x2-matrix shape.
sampling.ADASYN_oversampling(scaled_trainX, trainY, scaled_valX, valY)

0.7549912709641655
[[1654  412]
 [  34   83]]


In [23]:
# Random-undersampling baseline on the same train/validation inputs as the other
# sampling cells; the recorded output has the same score-plus-2x2-matrix shape.
sampling.random_undersampling(scaled_trainX, trainY, scaled_valX, valY)

0.7529848338173604
[[1734  332]
 [  39   78]]


In [24]:
# Edited-NN undersampling baseline on the same train/validation inputs as the other
# sampling cells; the recorded output has the same score-plus-2x2-matrix shape.
sampling.Edited_NN_undersampling(scaled_trainX, trainY, scaled_valX, valY)

0.739138349012502
[[1924  142]
 [  53   64]]


Using Random Oversampling, as it performs the best of the five sampling strategies and because undersampling might reduce the size of the dataset (12K rows) even further.

In [25]:
# Alias the imported models_and_hyperparameter_tuning; the model searches in the
# following cells are called directly on it (it is not instantiated here).
models = models_and_hyperparameter_tuning

In [26]:
# Search grid for logistic regression: eight C values spanning 1e-4 to 1e3,
# each tried with both penalty types.
C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
penalty = ['l1', 'l2']
# The call is the cell's last expression, so its result is displayed.
models.Logistic_Regression(
    scaled_trainX,
    trainY,
    C_values,
    penalty,
)

5it [00:06,  1.19s/it]


Unnamed: 0,C_values,Penalty,Mean_AUC,Standard_Deviation_AUC
2,0.001,l1,0.780239,0.032806


In [27]:
# Search grid for decision trees. Both parameters use the same ten values:
# 2, 42, 82, ..., 362.
min_samples_split_values = list(range(2, 400, 40))
maxdepth = list(range(2, 400, 40))
# The call is the cell's last expression, so its result is displayed.
models.Decision_Trees(
    scaled_trainX,
    trainY,
    min_samples_split_values,
    maxdepth,
)

5it [00:55, 11.00s/it]


Unnamed: 0,Min_Samples_Split,Max_Depth,Mean_AUC,Standard_Deviation_AUC
0,2,2,0.778499,0.031487
10,42,2,0.778499,0.031487
20,82,2,0.778499,0.031487
30,122,2,0.778499,0.031487
40,162,2,0.778499,0.031487
50,202,2,0.778499,0.031487
60,242,2,0.778499,0.031487
70,282,2,0.778499,0.031487
80,322,2,0.778499,0.031487
90,362,2,0.778499,0.031487


In [28]:
# Search grid for random forests: four candidate values per parameter.
estimators = [1, 10, 50, 100]
max_depth = [2, 5, 10, 20]
min_split = [2, 42, 162, 362]
# The call is the cell's last expression, so its result is displayed.
models.Random_Forest(
    scaled_trainX,
    trainY,
    estimators,
    max_depth,
    min_split,
)

3it [01:24, 28.14s/it]


Unnamed: 0,No_of_Estimators,Max_Depth,Min_Split,Mean_AUC,Standard_Deviation_AUC
20,10,5,2,0.786436,0.033457


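The three models perform similarly: the random forest has the highest mean AUC (0.786), but logistic regression (0.780) is well within one standard deviation (about 0.03) of it. The logistic regression model is selected and will be evaluated on the test set.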

Combine training and validation sets