## Flood predictor

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
rainfall=pd.read_csv("data_temp.csv")

In [3]:
rainfall.head()

Unnamed: 0,MONTH INDEX,RAINFALL,FLOOD
0,1,0.0,0
1,2,6.0,0
2,3,0.0,0
3,4,12.0,0
4,5,19.0,0


In [4]:
rainfall.describe()


Unnamed: 0,MONTH INDEX,RAINFALL,FLOOD
count,156.0,156.0,156.0
mean,6.5,107.414744,0.044872
std,3.46317,133.01099,0.207689
min,1.0,0.0,0.0
25%,3.75,3.5,0.0
50%,6.5,69.85,0.0
75%,9.25,166.025,0.0
max,12.0,995.6,1.0


In [5]:
rainfall.corr()

Unnamed: 0,MONTH INDEX,RAINFALL,FLOOD
MONTH INDEX,1.0,0.628935,0.309456
RAINFALL,0.628935,1.0,0.147785
FLOOD,0.309456,0.147785,1.0


In [6]:
# selecting all the features within our dataset
features = rainfall[['MONTH INDEX', 'RAINFALL']] 
features = features.to_numpy() # converting feature set to numpy array
target = rainfall['FLOOD'].to_numpy() # converting target column to numpy array
features.shape, len(target) 

((156, 2), 156)

In [7]:
##from sklearn.model_selection import train_test_split

##featureTrain,featureTest,targetTrain,targetTest= train_test_split(features,target,test_size=0.3)


In [8]:
##from sklearn.model_selection import StratifiedShuffleSplit
##split=StratifiedShuffleSplit()
##for train_index,test_index in split.split(rainfall,rainfall['FLOOD'])

In [9]:
##from sklearn.linear_model import LogisticRegression
##from sklearn.metrics import confusion_matrix
##from sklearn.metrics import accuracy_score

##model= LogisticRegression()
##fittedModel=model.fit(featureTrain,targetTrain)
##predictions= fittedModel.predict(featureTest)



##print(confusion_matrix(targetTest,predictions))

In [10]:
##print(accuracy_score(targetTest,predictions)*100)

In [11]:
# define input

##new_input = [[7,309]]

##example=model.predict(new_input)
##print(example)

In [12]:
# function for standardizing data
def standardScaler(feature_array):
  
    total_cols = feature_array.shape[1] # total number of columns 
    for i in range(total_cols): # iterating through each column
        feature_col = feature_array[:, i]
        mean = int(feature_col.mean()) # mean stores mean value for the column
        std = feature_col.std() # std stores standard deviation value for the column
        feature_array[:, i] = (feature_array[:, i] - mean) / std # standard scaling of each element of the column


In [13]:
standardScaler(features) # performing standardization on our feature set 

# checking if standardization worked
total_cols = features.shape[1] # total number of columns 
for i in range(total_cols):
    print(features[:, i].std())

1.0
1.0


In [14]:
# creating randomized weights for our linear predictor func
weights = np.random.rand(2, 2)
# creating randomized biases for our linear predictor func
biases = np.random.rand(2, 1)

In [15]:
def linearPredict(featureMat, weights, biases):
    """This is the linear predictor function for out MLR model. It calculates the logit scores for each possible outcome.
    
    Args-
        featureMat- A numpy array of features
        weights- A numpy array of weights for our model
        biases- A numpy array of biases for our model
    
    Returns-
        logitScores- Logit scores for each possible outcome of the target variable for each feature set in the feature matrix
    """
    logitScores = np.array([np.empty([2]) for i in range(featureMat.shape[0])]) # creating empty(garbage value) array for each feature set
    
    for i in range(featureMat.shape[0]): # iterating through each feature set
        logitScores[i] = (weights.dot(featureMat[i].reshape(-1,1)) + biases).reshape(-1) # calculates logit score for each feature set then flattens the logit vector 
    
    return logitScores

In [16]:
features = rainfall[['MONTH INDEX', 'RAINFALL']]
features = features.to_numpy() # converts feature set to numpy array
logitTest = linearPredict(features, weights, biases)
logitTest.shape


(156, 2)

In [17]:
def softmaxNormalizer(logitMatrix):
    """Converts logit scores for each possible outcome to probability values.
    
    Args-
        logitMatrix - This is the output of our logitPredict function; consists  logit scores for each feature set
    
    Returns-
        probabilities - Probability value of each outcome for each feature set
    """
    
    probabilities = np.array([np.empty([2]) for i in range(logitMatrix.shape[0])]) # creating empty(garbage value) array for each feature set

    for i in range(logitMatrix.shape[0]):
        exp = np.exp(logitMatrix[i]) # exponentiates each element of the logit array
        sumOfArr = np.sum(exp) # adds up all the values in the exponentiated array
        probabilities[i] = exp/sumOfArr # logit scores to probability values
    return probabilities

In [18]:
def multinomialLogReg(features, weights, biases):
    """Performs logistic regression on a given feature set.
    
    Args- 
        features- Numpy array of features(standardized)
        weights- A numpy array of weights for our model
        biases- A numpy array of biases for our model
    
    Returns-
        probabilities, predictions
        Here,
            probabilities: Probability values for each possible outcome for each feature set in the feature matrix
            predictions: Outcome with max probability for each feature set
    """
    logitScores = linearPredict(features, weights, biases)
    probabilities = softmaxNormalizer(logitScores)
    predictions = np.array([np.argmax(i) for i in probabilities]) #returns the outcome with max probability
    return probabilities, predictions

In [19]:
probabilities, predictions = multinomialLogReg(features, weights, biases)
print(probabilities.shape)
print(predictions)

(156, 2)
[0 0 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1 0
 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 1
 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0
 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0
 0 0 1 1 1 1 1 1]


  exp = np.exp(logitMatrix[i]) # exponentiates each element of the logit array
  probabilities[i] = exp/sumOfArr # logit scores to probability values


In [20]:
def accuracy(predictions, target):
    """Calculates total accuracy for our model.
    
    Args- 
        predictions- Predicted target outcomes as predicted by our MLR function
        target- Actual target values
    
    Returns-
        accuracy- Accuracy percentage of our model
    """
    correctPred = 0
    for i in range(len(predictions)):
        if predictions[i] == target[i]:
            correctPred += 1
    accuracy = correctPred/len(predictions)*100
    return accuracy

accuracy = accuracy(predictions, target) #calculating accuracy for our model
print(accuracy)

39.1025641025641


In [21]:
def train_test_split(dataframe, test_size = 0.2):
    """Splits dataset into training and testing sets.
    
    Args- 
        dataframe- The dataframe object you want to split
        test_size- Size of test dataset that you want
    
    Returns-
        train_features, train_target, test_features, test_target 
    """
    
    data = dataframe.to_numpy() # converts dataframe to numpy array
    totalRows = data.shape[0] # total rows in the dataset
    testRows = np.round(totalRows * test_size) # total rows in testing dataset
    randRowNum = np.random.randint(0, int(totalRows), int(testRows)) # randomly generated row numbers
    testData = np.array([data[i] for i in randRowNum]) # creates test dataset
    data = np.delete(data, randRowNum, axis = 0) # deletes test data rows from main dataset; making it training dataset
    train_features = data[:, :-1]
    train_target = data[:, -1]
    test_features = testData[:, :-1]
    test_target = testData[:, -1]
    
    return train_features, train_target, test_features, test_target    

# running train_test_split for our dataset
train_features, train_target, test_features, test_target = train_test_split(rainfall, test_size = 0.17)
standardScaler(train_features) # standard scaling training set 
standardScaler(test_features) # standard scaling testing set
train_features.shape, train_target.shape, test_features.shape, test_target.shape

((130, 2), (130,), (27, 2), (27,))

In [22]:
def crossEntropyLoss(probabilities, target):
    """Calculates cross entropy loss for a set of predictions and actual targets.
    
    Args-
        predictions- Probability predictions, as returned by multinomialLogReg function
        target- Actual target values
    Returns- 
        CELoss- Average cross entropy loss
    """
    n_samples = probabilities.shape[0]
    CELoss = 0
    for sample, i in zip(probabilities, target):
        CELoss += -np.log(sample[i])
    CELoss /= n_samples
    return CELoss   

In [23]:
def stochGradDes(learning_rate, epochs, target, features, weights, biases):
    """Performs stochastic gradient descent optimization on the model.
    
    Args-
        learning_rate- Size of the step the function will take during optimization
        epochs- No. of iterations the function will run for on the model
        target- Numpy array containing actual target values
        features- Numpy array of independent variables
        weights- Numpy array containing weights associated with each feature
        biases- Array containinig model biases
    
    Returns-
        weights, biases, loss_list
        where,
            weights- Latest weight calculated (Numpy array)
            bias- Latest bias calculated (Numpy array)
            loss_list- Array containing list of losses observed after each epoch    
    """
    target = target.astype(int)
    loss_list = np.array([]) #initiating an empty array
    
    for i in range(epochs):
        probabilities, _ = multinomialLogReg(features, weights, biases) # Calculates probabilities for each possible outcome
        
        CELoss = crossEntropyLoss(probabilities, target) # Calculates cross entropy loss for actual target and predictions
        loss_list = np.append(loss_list, CELoss) # Adds the CELoss value for the epoch to loss_list
        
        probabilities[np.arange(features.shape[0]),target] -= 1 # Substract 1 from the scores of the correct outcome
        
        grad_weight = probabilities.T.dot(features) # gradient of loss w.r.t. weights
        grad_biases = np.sum(probabilities, axis = 0).reshape(-1,1) # gradient of loss w.r.t. biases
        
        #updating weights and biases
        weights -= (learning_rate * grad_weight)
        biases -= (learning_rate * grad_biases)
        
    return weights, biases, loss_list

In [24]:
updatedWeights, updatedBiases, loss_list = stochGradDes(0.1, 2000, train_target, train_features, weights, biases)


In [25]:
testProbabilities, testPredictions = multinomialLogReg(test_features, weights, biases)

correctPreds = 0
for i in range(len(testPredictions)):
    if testPredictions[i] == test_target[i]:
        correctPreds += 1
acc = correctPreds / len(testPredictions) * 100
print("Model accuracy on test dataset - {}".format(acc))

Model accuracy on test dataset - 92.5925925925926
