In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data_file = pd.read_csv('/kaggle/input/kerela-flood/kerala.csv')

In [3]:
data_file.head()

Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL RAINFALL,FLOODS
0,KERALA,1901,28.7,44.7,51.6,160.0,174.7,824.6,743.0,357.5,197.7,266.9,350.8,48.4,3248.6,YES
1,KERALA,1902,6.7,2.6,57.3,83.9,134.5,390.9,1205.0,315.8,491.6,358.4,158.3,121.5,3326.6,YES
2,KERALA,1903,3.2,18.6,3.1,83.6,249.7,558.6,1022.5,420.2,341.8,354.1,157.0,59.0,3271.2,YES
3,KERALA,1904,23.7,3.0,32.2,71.5,235.7,1098.2,725.5,351.8,222.7,328.1,33.9,3.3,3129.7,YES
4,KERALA,1905,1.2,22.3,9.4,105.9,263.3,850.2,520.5,293.6,217.2,383.5,74.4,0.2,2741.6,NO


In [4]:
def refined_df(setx):
    def refine(data):
        encode = LabelEncoder()
        data['FLOODS'] = encode.fit_transform(data['FLOODS'])

        dropif = data['SUBDIVISION'].nunique()
        if dropif == 2:
            data = data.drop('SUBDIVISION',axis=1)

        column_names = data.columns

        scale = StandardScaler()
        data = scale.fit_transform(data)

        scaled_df = pd.DataFrame(data, columns=column_names)

        return data,column_names

    dfx,column_names = refine(setx)
    dfx = pd.DataFrame(dfx,columns=column_names)
    

    return dfx

In [5]:
df = refined_df(data_file)
df.head()

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL RAINFALL,FLOODS
0,-1.717434,1.069658,1.779199,0.498716,1.117577,-0.367166,0.933068,0.196388,-0.402132,-0.399623,-0.281946,2.27514,0.229752,0.717813,0.983192
1,-1.688076,-0.358166,-0.797833,0.689121,-0.594693,-0.640781,-1.406315,2.222556,-0.632255,2.021629,0.698684,-0.048415,2.231363,0.89105,0.983192
2,-1.658718,-0.58532,0.181561,-1.121396,-0.601443,0.143308,-0.501739,1.422176,-0.056121,0.787524,0.6526,-0.064106,0.519999,0.768008,0.983192
3,-1.629361,0.745153,-0.773348,-0.149329,-0.873697,0.048019,2.40887,0.119639,-0.433588,-0.193664,0.373951,-1.549974,-1.005168,0.453737,0.983192
4,-1.600003,-0.715122,0.408046,-0.910949,-0.099688,0.235874,1.071155,-0.779418,-0.754766,-0.238975,0.967688,-1.061122,-1.090051,-0.40823,-1.017095


In [6]:
def random_forest_regression(data, target_column, test_size=0.2, random_state=42, n_estimators=100):

    # Separate features and target variable
    X = data.drop(columns='FLOODS')
    y = data['FLOODS']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize the Random Forest Regressor model
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test data
    predictions = model.predict(X_test)

    # Calculate mean squared error and R-squared score
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Return predictions, actual values, and model performance metrics
    return predictions, y_test.values, mse, r2, model


In [7]:
# predictions, actual_values, mse, r2, model = random_forest_regression(data, 'target')
predictions, actual_values, mse, r2, model = random_forest_regression(df, df['FLOODS'])

In [8]:
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

Mean Squared Error: 0.0003000862068965473
R-squared Score: 0.9996914285714286
