# Machine Learning Project
## Scott Burgholzer

Data Name: Storm Events Database

Data source: https://www.ncdc.noaa.gov/stormevents/ftp.jsp

used the files that start with StormEvents_details

Only used years 2007 - 2018

In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn import cross_validation
from sklearn import preprocessing

# these are the fields in the files that we are intersted in
fields = ['YEAR', 'EVENT_TYPE', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT', 
          'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH']

# Loop through all the files and create a dataframe object
path ='/Users/scottburgholzer/Desktop/Project'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []

for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0, usecols=fields)
    list_.append(df)
frame = pd.concat(list_)

# sort by year for confirmation
frame = frame.sort_values('YEAR')

# keep only the tornado data
frame = frame[frame.EVENT_TYPE == 'Tornado']

# we no longer need the year or event type
frame = frame.drop(['YEAR', 'EVENT_TYPE'], axis=1)

# remove EFU (unknown), F0, F1, F2 as we only want EF
frame = frame[frame.TOR_F_SCALE != "EFU"]
frame = frame[frame.TOR_F_SCALE != "F0"]
frame = frame[frame.TOR_F_SCALE != "F1"]
frame = frame[frame.TOR_F_SCALE != "F2"]

frame2 = frame

# randomize the dataframe
np.random.seed(47)
frame = frame.reindex(np.random.permutation(frame.index))

print frame.head()
print frame.tail()



       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
52239                0                  0              0                0   
37149                0                  0              0                0   
8412                 0                  0              0                0   
59566                0                  0              0                0   
70432                0                  0              0                0   

      DAMAGE_PROPERTY DAMAGE_CROPS TOR_F_SCALE  TOR_LENGTH  TOR_WIDTH  
52239           0.00K        0.00K         EF0        2.50       50.0  
37149          30.00K        0.00K         EF0        1.19      100.0  
8412            0.00K        0.00K         EF2        9.00      250.0  
59566           0.00K        0.00K         EF0        4.88       80.0  
70432          10.00K        0.00K         EF0        0.10      100.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
28595                0      

In [2]:
# Make all NaN values in DAMAGE_PROPERTY and DAMAGE_CROPS columns set to 0.0K
frame.DAMAGE_PROPERTY = frame.DAMAGE_PROPERTY.fillna('0.0K')
frame.DAMAGE_CROPS = frame.DAMAGE_CROPS.fillna('0.0K')

# this function takes values such as 10.00K and makes it 10000.0
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    if 'B' in x:
        return float(x.replace('B', '')) * 1000000000
    return 0.0

# convert all money values to float
frame.DAMAGE_PROPERTY = frame.DAMAGE_PROPERTY.apply(value_to_float)
frame.DAMAGE_CROPS = frame.DAMAGE_CROPS.apply(value_to_float)

# Just to be safe, make sure any additional NaN are set to 0
frame = frame.fillna(0)

print frame.head()
print frame.tail()


       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
52239                0                  0              0                0   
37149                0                  0              0                0   
8412                 0                  0              0                0   
59566                0                  0              0                0   
70432                0                  0              0                0   

       DAMAGE_PROPERTY  DAMAGE_CROPS TOR_F_SCALE  TOR_LENGTH  TOR_WIDTH  
52239              0.0           0.0         EF0        2.50       50.0  
37149          30000.0           0.0         EF0        1.19      100.0  
8412               0.0           0.0         EF2        9.00      250.0  
59566              0.0           0.0         EF0        4.88       80.0  
70432          10000.0           0.0         EF0        0.10      100.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
28595           

In [3]:
#One hot encoding
y_orig = frame['TOR_F_SCALE']
y = pd.get_dummies(frame['TOR_F_SCALE']).as_matrix()
frame = frame.drop('TOR_F_SCALE', axis=1)

print y.shape
print frame.shape

print frame.head()
print frame.tail()
print y


(15685, 6)
(15685, 8)
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
52239                0                  0              0                0   
37149                0                  0              0                0   
8412                 0                  0              0                0   
59566                0                  0              0                0   
70432                0                  0              0                0   

       DAMAGE_PROPERTY  DAMAGE_CROPS  TOR_LENGTH  TOR_WIDTH  
52239              0.0           0.0        2.50       50.0  
37149          30000.0           0.0        1.19      100.0  
8412               0.0           0.0        9.00      250.0  
59566              0.0           0.0        4.88       80.0  
70432          10000.0           0.0        0.10      100.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
28595                0                  0              0          

In [4]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    frame, y, test_size=0.2, random_state=50)
print X_train.shape
print y_train.shape

(12548, 8)
(12548, 6)


In [5]:
print X_train.head()
print X_train.tail()
print y_train

       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
1592                 0                  0              0                0   
42078                0                  0              0                0   
7770                 0                  0              0                0   
29772                0                  0              0                0   
34924                0                  0              0                0   

       DAMAGE_PROPERTY  DAMAGE_CROPS  TOR_LENGTH  TOR_WIDTH  
1592           50000.0           0.0        0.47       75.0  
42078           4000.0           0.0        0.15      300.0  
7770          200000.0      500000.0       11.92      100.0  
29772              0.0           0.0        0.75       75.0  
34924          30000.0           0.0        3.16      150.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
2014                 0                  0              0                0   
15394      

In [6]:
stdscaler = preprocessing.StandardScaler()
X_scaled  = stdscaler.fit_transform(frame)
X_train_scaled = stdscaler.transform(X_train)
X_test_scaled  = stdscaler.transform(X_test)

# Week 7 Update

# Naive Bayes classifier

In [7]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import classification_report, confusion_matrix
gnb = GaussianNB().fit(X_train_scaled, y_train.argmax(axis=1)) 
gnb_predictions = gnb.predict(X_test_scaled) 

print "Naive Bayes Classifier Results\nConfusion Matrix:"
print confusion_matrix(y_test.argmax(axis=1), gnb_predictions) 
print "\n Classifcation Report"
print classification_report(y_test.argmax(axis=1),gnb_predictions)

Naive Bayes Classifier Results
Confusion Matrix:
[[1629   29    3    0    0    0]
 [ 876  146   18    4    1    3]
 [ 137  112   55    5    0    3]
 [   9   22   34    3    0   17]
 [   0    6    8    1    1   13]
 [   0    0    0    0    0    2]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.61      0.98      0.76      1661
          1       0.46      0.14      0.21      1048
          2       0.47      0.18      0.26       312
          3       0.23      0.04      0.06        85
          4       0.50      0.03      0.06        29
          5       0.05      1.00      0.10         2

avg / total       0.54      0.59      0.50      3137



# Logistic Regression

In [8]:
from sklearn import linear_model
from sklearn import metrics
from mlclass2 import simplemetrics, plot_decision_2d_lda
from sklearn.decomposition import PCA

lr = linear_model.LogisticRegression()
lr.fit(X_train_scaled, y_train.argmax(axis=1))
predicted = lr.predict(X_test_scaled)

In [9]:
print "Logistic Regression Results\nConfusion Matrix:"
print confusion_matrix(y_test.argmax(axis=1), predicted)

print "\n Classifcation Report"
print classification_report(y_test.argmax(axis=1),predicted)

Logistic Regression Results
Confusion Matrix:
[[1563   98    0    0    0    0]
 [ 634  410    4    0    0    0]
 [  59  236   15    2    0    0]
 [   2   46   32    4    1    0]
 [   0    9   10    9    1    0]
 [   0    0    0    1    0    1]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.69      0.94      0.80      1661
          1       0.51      0.39      0.44      1048
          2       0.25      0.05      0.08       312
          3       0.25      0.05      0.08        85
          4       0.50      0.03      0.06        29
          5       1.00      0.50      0.67         2

avg / total       0.57      0.64      0.58      3137



# SVM

In [10]:
from sklearn.svm import SVC 
svm_model = SVC(kernel = 'rbf', gamma = 1, C = 1).fit(X_train_scaled, y_train.argmax(axis=1)) 
svm_predictions = svm_model.predict(X_test_scaled) 

In [11]:
from sklearn.metrics import hinge_loss
# model accuracy for X_test   
#accuracy = svm_model_linear.score(X_test, y_test) 
  
print "SVM Results\nConfusion Matrix:" 
print confusion_matrix(y_test.argmax(axis=1), svm_predictions) 
print "\n Classifcation Report"
print classification_report(y_test.argmax(axis=1),svm_predictions)

SVM Results
Confusion Matrix:
[[1451  209    1    0    0    0]
 [ 440  576   29    3    0    0]
 [  30  211   65    6    0    0]
 [   1   33   27   24    0    0]
 [   0    4    7   17    1    0]
 [   0    0    0    2    0    0]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.75      0.87      0.81      1661
          1       0.56      0.55      0.55      1048
          2       0.50      0.21      0.29       312
          3       0.46      0.28      0.35        85
          4       1.00      0.03      0.07        29
          5       0.00      0.00      0.00         2

avg / total       0.66      0.67      0.65      3137



  'precision', 'predicted', average, warn_for)


# Decision Tree

In [12]:
from sklearn import tree 
dtc = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
dtc.fit(X_train_scaled, y_train)
predicted = dtc.predict(X_test_scaled)


print "Decision Tree Results\nConfusion Matrix:"
print confusion_matrix(y_test.argmax(axis=1), predicted.argmax(axis=1))

print "\n Classifcation Report"
print classification_report(y_test,predicted)

Decision Tree Results
Confusion Matrix:
[[1540  121    0    0    0    0]
 [ 494  547    7    0    0    0]
 [ 119  177   16    0    0    0]
 [  44   26    6    9    0    0]
 [  16    5    1    7    0    0]
 [   0    0    0    2    0    0]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.83      0.77      0.80      1661
          1       0.62      0.52      0.57      1048
          2       0.53      0.05      0.09       312
          3       0.50      0.11      0.17        85
          4       0.00      0.00      0.00        29
          5       0.00      0.00      0.00         2

avg / total       0.71      0.59      0.63      3137



# Removing EF4 and EF5 from the data

In [13]:
frame2 = frame2[frame2.TOR_F_SCALE != "EF4"]
frame2 = frame2[frame2.TOR_F_SCALE != "EF5"]
print frame2

       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
52239                0                  0              0                0   
37149                0                  0              0                0   
8412                 0                  0              0                0   
59566                0                  0              0                0   
70432                0                  0              0                0   
28067                0                  0              0                0   
35483                0                  0              0                0   
68088                0                  0              0                0   
59942                0                  0              0                0   
31952                0                  0              0                0   
29813                0                  0              0                0   
58                   0                  0              0                0   

In [14]:
# randomize the dataframe
np.random.seed(47)
frame2 = frame2.reindex(np.random.permutation(frame2.index))

print frame2.head()
print frame2.tail()

       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
44756                0                  0              0                0   
32900                0                  0              0                0   
25651                0                  0              0                0   
42544                0                  0              0                0   
35546                0                  0              0                0   

      DAMAGE_PROPERTY DAMAGE_CROPS TOR_F_SCALE  TOR_LENGTH  TOR_WIDTH  
44756           0.00K        0.00K         EF0        2.50       50.0  
32900          30.00K        0.00K         EF0        1.19      100.0  
25651           0.00K        0.00K         EF2        9.00      250.0  
42544           0.00K        0.00K         EF0        4.88       80.0  
35546          10.00K        0.00K         EF0        0.10      100.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
3597                 0      

In [15]:
# Make all NaN values in DAMAGE_PROPERTY and DAMAGE_CROPS columns set to 0.0K
frame2.DAMAGE_PROPERTY = frame2.DAMAGE_PROPERTY.fillna('0.0K')
frame2.DAMAGE_CROPS = frame2.DAMAGE_CROPS.fillna('0.0K')

# this function takes values such as 10.00K and makes it 10000.0
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    if 'B' in x:
        return float(x.replace('B', '')) * 1000000000
    return 0.0

# convert all money values to float
frame2.DAMAGE_PROPERTY = frame2.DAMAGE_PROPERTY.apply(value_to_float)
frame2.DAMAGE_CROPS = frame2.DAMAGE_CROPS.apply(value_to_float)

# Just to be safe, make sure any additional NaN are set to 0
frame2 = frame2.fillna(0)

print frame2.head()
print frame2.tail()

       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
44756                0                  0              0                0   
32900                0                  0              0                0   
25651                0                  0              0                0   
42544                0                  0              0                0   
35546                0                  0              0                0   

       DAMAGE_PROPERTY  DAMAGE_CROPS TOR_F_SCALE  TOR_LENGTH  TOR_WIDTH  
44756              0.0           0.0         EF0        2.50       50.0  
32900          30000.0           0.0         EF0        1.19      100.0  
25651              0.0           0.0         EF2        9.00      250.0  
42544              0.0           0.0         EF0        4.88       80.0  
35546          10000.0           0.0         EF0        0.10      100.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
3597            

In [16]:
#One hot encoding
y_orig = frame2['TOR_F_SCALE']
y = pd.get_dummies(frame2['TOR_F_SCALE']).as_matrix()
frame2 = frame2.drop('TOR_F_SCALE', axis=1)

print y.shape
print frame2.shape

print frame2.head()
print frame2.tail()
print y

(15568, 4)
(15568, 8)
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
44756                0                  0              0                0   
32900                0                  0              0                0   
25651                0                  0              0                0   
42544                0                  0              0                0   
35546                0                  0              0                0   

       DAMAGE_PROPERTY  DAMAGE_CROPS  TOR_LENGTH  TOR_WIDTH  
44756              0.0           0.0        2.50       50.0  
32900          30000.0           0.0        1.19      100.0  
25651              0.0           0.0        9.00      250.0  
42544              0.0           0.0        4.88       80.0  
35546          10000.0           0.0        0.10      100.0  
       INJURIES_DIRECT  INJURIES_INDIRECT  DEATHS_DIRECT  DEATHS_INDIRECT  \
3597                 0                  0              0          

In [17]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    frame2, y, test_size=0.2, random_state=50)
print X_train.shape
print y_train.shape

(12454, 8)
(12454, 4)


In [18]:
stdscaler = preprocessing.StandardScaler()
X_scaled  = stdscaler.fit_transform(frame2)
X_train_scaled = stdscaler.transform(X_train)
X_test_scaled  = stdscaler.transform(X_test)

# Naive Bayes classifier

In [19]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import classification_report, confusion_matrix
gnb = GaussianNB().fit(X_train_scaled, y_train.argmax(axis=1)) 
gnb_predictions = gnb.predict(X_test_scaled) 

print "Naive Bayes Classifier Results\nConfusion Matrix:"
print confusion_matrix(y_test.argmax(axis=1), gnb_predictions) 
print "\n Classifcation Report"
print classification_report(y_test.argmax(axis=1),gnb_predictions)

Naive Bayes Classifier Results
Confusion Matrix:
[[1646   15    4    3]
 [ 923   98   29   11]
 [ 133   86   61   11]
 [  16   29   29   20]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.61      0.99      0.75      1668
          1       0.43      0.09      0.15      1061
          2       0.50      0.21      0.29       291
          3       0.44      0.21      0.29        94

avg / total       0.53      0.59      0.49      3114



# Logistic Regression

In [20]:
from sklearn import linear_model
from sklearn import metrics
from mlclass2 import simplemetrics, plot_decision_2d_lda
from sklearn.decomposition import PCA

lr = linear_model.LogisticRegression()
lr.fit(X_train_scaled, y_train.argmax(axis=1))
predicted = lr.predict(X_test_scaled)

In [21]:
print "Logistic Regression Results\nConfusion Matrix:"
print confusion_matrix(y_test.argmax(axis=1), predicted)

print "\n Classifcation Report"
print classification_report(y_test.argmax(axis=1),predicted)

Logistic Regression Results
Confusion Matrix:
[[1565  103    0    0]
 [ 599  455    6    1]
 [  41  215   25   10]
 [   2   54   18   20]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.71      0.94      0.81      1668
          1       0.55      0.43      0.48      1061
          2       0.51      0.09      0.15       291
          3       0.65      0.21      0.32        94

avg / total       0.63      0.66      0.62      3114



# SVM

In [22]:
from sklearn.svm import SVC 
svm_model = SVC(kernel = 'rbf', gamma = 1, C = 1).fit(X_train_scaled, y_train.argmax(axis=1)) 
svm_predictions = svm_model.predict(X_test_scaled) 

In [23]:
from sklearn.metrics import hinge_loss
# model accuracy for X_test   
#accuracy = svm_model_linear.score(X_test, y_test) 
  
print "SVM Results\nConfusion Matrix:" 
print confusion_matrix(y_test.argmax(axis=1), svm_predictions) 
print "\n Classifcation Report"
print classification_report(y_test.argmax(axis=1),svm_predictions)

SVM Results
Confusion Matrix:
[[1461  202    5    0]
 [ 388  628   42    3]
 [  18  176   76   21]
 [   1   36   30   27]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.78      0.88      0.83      1668
          1       0.60      0.59      0.60      1061
          2       0.50      0.26      0.34       291
          3       0.53      0.29      0.37        94

avg / total       0.69      0.70      0.69      3114



# Decision Tree

In [24]:
from sklearn import tree 
dtc = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
dtc.fit(X_train_scaled, y_train)
predicted = dtc.predict(X_test_scaled)


print "Decision Tree Results\nConfusion Matrix:"
print confusion_matrix(y_test.argmax(axis=1), predicted.argmax(axis=1))

print "\n Classifcation Report"
print classification_report(y_test,predicted)

Decision Tree Results
Confusion Matrix:
[[1543  124    1    0]
 [ 516  510   33    2]
 [  77  134   63   17]
 [  24   20   26   24]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.87      0.71      0.78      1668
          1       0.65      0.48      0.55      1061
          2       0.51      0.22      0.30       291
          3       0.56      0.26      0.35        94

avg / total       0.75      0.57      0.65      3114

