In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

  import pandas.util.testing as tm


# Get Data

In [2]:
# Load the cleaned dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    "cleaned.csv",
    index_col=0,
)

In [3]:
# Encode the boolean 'Fire' column as integers: True -> 1, False -> 0.
data = data.replace({'Fire': {True: 1, False: 0}})

In [5]:
# Feature columns are chosen by position: everything from column 6 up to,
# but not including, the last column. The target is the 'Fire' column.
all_features = [*data.columns[6:21], *data.columns[21:-1]]
target = ['Fire']

# Model Training



In [107]:
class ModelsClass:
  """Fit several scikit-learn classifiers on selected columns and print their mean test accuracy.

  Each training method (``logisticReg``, ``logisticRegNormal``,
  ``RandomForest``, ``svc``) repeats the same experiment ``n_runs`` times:
  draw a fresh random train/test split with 33% of the rows held out, fit a
  new model on the training rows, and score it on the held-out rows. The
  printed figure is the mean accuracy over those runs.
  """

  def __init__(self, X, y, frame=None, n_runs=50):
    """Select the feature and target columns to model.

    Args:
      X: list of feature column names.
      y: list of target column names (for example ``['Fire']``).
      frame: DataFrame to read the columns from. When omitted, the
        notebook-level ``data`` frame is used, which is what existing
        two-argument calls rely on.
      n_runs: number of split/fit/score repetitions averaged by each
        training method.

    Raises:
      ValueError: if ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
      raise ValueError("n_runs must be at least 1")
    source = data if frame is None else frame
    self.X = source[X].values
    self.y = source[y].values
    self.n_runs = n_runs
    print(X)

  def logisticReg(self):
    """Print the mean test accuracy of logistic regression on the raw features."""
    self._evaluate(self._new_logreg)

  def logisticRegNormal(self):
    """Print the mean test accuracy of logistic regression on standardized features."""
    self._evaluate(self._new_logreg, scale=True)

  def RandomForest(self):
    """Print the mean test accuracy of a default random forest classifier."""
    self._evaluate(RandomForestClassifier)

  def svc(self):
    """Print the mean test accuracy of a default support vector classifier."""
    self._evaluate(SVC)

  def score(self, a, b, model):
    """Print the accuracy of an already-fitted model on one test set.

    Args:
      a: test features, passed straight to ``model.score``.
      b: test labels, passed straight to ``model.score``.
      model: fitted object exposing ``score(a, b)``.

    The model is scored once: re-scoring the same fitted model on the same
    data would only repeat the same number. The averaging over repeated
    runs is done by the training methods, which refit on a new split each
    time.
    """
    accuracy = model.score(a, b)
    print("The score on the given test set is : {avg:.2f}%".format(avg=accuracy * 100))

  @staticmethod
  def _new_logreg():
    """Build the logistic regression configuration shared by both logistic methods."""
    return LogisticRegression(random_state=20191016, solver='lbfgs', max_iter=1000)

  def _evaluate(self, make_model, scale=False):
    """Average held-out accuracy of freshly fitted models over ``n_runs`` random splits.

    Args:
      make_model: zero-argument callable returning a new, unfitted estimator.
      scale: when true, standardize the features before fitting.

    Returns:
      The mean accuracy as a fraction between 0 and 1 (also printed as a percentage).
    """
    total = 0.0
    for _ in range(self.n_runs):
      X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.33)
      if scale:
        # Learn the scaling statistics from the training rows only, then
        # apply them to the held-out rows, so no test information reaches
        # the model during fitting.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
      model = make_model()
      model.fit(X_train, y_train.ravel())
      total += model.score(X_test, y_test)
    mean_accuracy = total / self.n_runs
    print("The average score after running {runs} times is : {avg:.2f}%".format(
        runs=self.n_runs, avg=mean_accuracy * 100))
    return mean_accuracy

In [87]:
a = ModelsClass(all_features, target)

In [88]:
a.logisticReg()

The average score after running 50 times is : 55.81%


*Let's try normalizing the data and see if the accuracy changes.*

In [89]:
a.logisticRegNormal()

The average score after running 50 times is : 56.83%


**The accuracy is essentially unchanged, so let's try a different model and see the result.**

*Random Forest Classifier*

In [90]:
a.RandomForest()

The average score after running 50 times is : 57.35%


*Support Vector Classifier*

In [91]:
a.svc()

The average score after running 50 times is : 57.04%


**The accuracy did not change even when we tried different models. This means that we need to modify our features.**

*Note that if we increase the training dataset size, the accuracy of models like the SVC does increase.*

## Approach #1

*Instead of using all the values of the features, let us look at their averages.*

In [96]:
# Five-day mean of each temperature reading, stored as new columns on `data`.
temperature_groups = {
    'tmp avg': ["Temp Day One", 'Temp Day Two', 'Temp Day Three', 'Temp Day Four', 'Temp Day Five'],
    'tmpMax avg': ["MaxTemp Day One", 'MaxTemp Day Two', 'MaxTemp Day Three', 'MaxTemp Day Four', 'MaxTemp Day Five'],
    'tmpMin avg': ["MinTemp Day One", 'MinTemp Day Two', 'MinTemp Day Three', 'MinTemp Day Four', 'MinTemp Day Five'],
}
for avg_name, day_names in temperature_groups.items():
    # Add the daily columns left to right, then divide by the number of days.
    running_total = data[day_names[0]]
    for day_name in day_names[1:]:
        running_total = running_total + data[day_name]
    data[avg_name] = running_total / 5


In [97]:
# Five-day mean humidity, stored as a new column on `data`.
humidity_columns = ["Humidity Day One", 'Humidity Day Two', 'Humidity Day Three', 'Humidity Day Four', 'Humidity Day Five']
humidity_total = data[humidity_columns[0]]
for humidity_column in humidity_columns[1:]:
    humidity_total = humidity_total + data[humidity_column]
data['hum avg'] = humidity_total / 5

In [103]:
# Name the averaged feature columns explicitly. A positional slice such as
# columns[27:-2] selects different columns depending on the order in which
# the averaging cells were executed, so it is not reproducible on a fresh
# top-to-bottom run. These two names are the ones the recorded experiment used.
all_features = ['hum avg', 'tmp avg']

In [102]:
# Show the feature list that is actually passed to the model, rather than
# recomputing a positional slice that could drift from it.
all_features

['hum avg', 'tmp avg']

In [108]:
b = ModelsClass(all_features, target)

['hum avg', 'tmp avg']


In [110]:
b.logisticReg()

The average score after running 50 times is : 56.42%


In [111]:
b.logisticRegNormal()

The average score after running 50 times is : 53.44%


In [112]:
b.RandomForest()

The average score after running 50 times is : 54.06%


In [113]:
b.svc()

The average score after running 50 times is : 55.40%


*Using the averages does not improve the accuracy either.*