In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the datasets
file_path1 = 'Daily_Rainfall_data_from_India_Meteorological_Department_Agency_during_January_2024.csv'
file_path2 = 'district wise rainfall normal.csv'
file_path3 = 'rainfall in india 1901-2015.csv'

daily_rainfall_data = pd.read_csv(file_path1)
monthly_rainfall_data = pd.read_csv(file_path2)
historical_rainfall_data = pd.read_csv(file_path3)

historical_rainfall_data.head()


Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec
0,ANDAMAN & NICOBAR ISLANDS,1901,49.2,87.1,29.2,2.3,528.8,517.5,365.1,481.1,332.6,388.5,558.2,33.6,3373.2,136.3,560.3,1696.3,980.3
1,ANDAMAN & NICOBAR ISLANDS,1902,0.0,159.8,12.2,0.0,446.1,537.1,228.9,753.7,666.2,197.2,359.0,160.5,3520.7,159.8,458.3,2185.9,716.7
2,ANDAMAN & NICOBAR ISLANDS,1903,12.7,144.0,0.0,1.0,235.1,479.9,728.4,326.7,339.0,181.2,284.4,225.0,2957.4,156.7,236.1,1874.0,690.6
3,ANDAMAN & NICOBAR ISLANDS,1904,9.4,14.7,0.0,202.4,304.5,495.1,502.0,160.1,820.4,222.2,308.7,40.1,3079.6,24.1,506.9,1977.6,571.0
4,ANDAMAN & NICOBAR ISLANDS,1905,1.3,0.0,3.3,26.9,279.5,628.7,368.7,330.5,297.0,260.7,25.4,344.7,2566.7,1.3,309.7,1624.9,630.8


In [10]:
# Convert monthly data to long format
monthly_long = pd.melt(monthly_rainfall_data,
                       id_vars=['STATE_UT_NAME', 'DISTRICT'],
                       var_name='Month',
                       value_name='Monthly_Rainfall')


# Convert historical data to long format
historical_long = pd.melt(historical_rainfall_data,
                           id_vars=['SUBDIVISION', 'YEAR'],
                           var_name='Month',
                           value_name='Historical_Rainfall')

# Inspect the transformed data
historical_long.head()


Unnamed: 0,SUBDIVISION,YEAR,Month,Historical_Rainfall
0,ANDAMAN & NICOBAR ISLANDS,1901,JAN,49.2
1,ANDAMAN & NICOBAR ISLANDS,1902,JAN,0.0
2,ANDAMAN & NICOBAR ISLANDS,1903,JAN,12.7
3,ANDAMAN & NICOBAR ISLANDS,1904,JAN,9.4
4,ANDAMAN & NICOBAR ISLANDS,1905,JAN,1.3


In [11]:
# Convert 'Date' column to datetime format and extract month
daily_rainfall_data['Date'] = pd.to_datetime(daily_rainfall_data['Date'])
daily_rainfall_data['Month'] = daily_rainfall_data['Date'].dt.month_name().str[:3].str.upper()

In [12]:
# Extract day of the month
daily_rainfall_data['Day'] = daily_rainfall_data['Date'].dt.day

In [13]:

# Merge daily data with monthly data
combined_data = pd.merge(daily_rainfall_data, monthly_long,
                         left_on=['State', 'District', 'Month'],
                         right_on=['STATE_UT_NAME', 'DISTRICT', 'Month'],
                         how='left')

# Merge with historical rainfall data
combined_data = pd.merge(combined_data, historical_long,
                         left_on=['STATE_UT_NAME', 'Month'],
                         right_on=['SUBDIVISION', 'Month'],
                         how='left')

# Drop or rename columns as needed
combined_data.drop(columns=['SUBDIVISION'], inplace=True)

In [14]:
# Handle missing values in the combined dataset
combined_data['Monthly_Rainfall'] = combined_data['Monthly_Rainfall'].fillna(0)
combined_data['Avg_rainfall'] = combined_data['Avg_rainfall'].fillna(0)
combined_data['Historical_Rainfall'] = combined_data['Historical_Rainfall'].fillna(0)

In [15]:
# Define the features (X) and the target (y)
X = combined_data[['Day', 'Monthly_Rainfall', 'Historical_Rainfall']]
y = combined_data['Avg_rainfall']

In [16]:
# Check for NaN values in y
if y.isna().any():
    print("Warning: Target variable 'y' contains NaN values. Check your data.")
    y = y.fillna(0)

In [17]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [19]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [20]:

# Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

In [21]:

# Print the evaluation metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R^2 Score: {r2}')

Mean Absolute Error (MAE): 0.0704471890991986
Root Mean Squared Error (RMSE): 0.222153122856489
R^2 Score: 0.014716965548670302


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the datasets
file_path1 = 'Daily_Rainfall_data_from_India_Meteorological_Department_Agency_during_January_2024.csv'
file_path2 = 'district wise rainfall normal.csv'
file_path3 = 'rainfall in india 1901-2015.csv'

daily_rainfall_data = pd.read_csv(file_path1)
monthly_rainfall_data = pd.read_csv(file_path2)
historical_rainfall_data = pd.read_csv(file_path3)

# Convert historical data to long format
historical_long = pd.melt(historical_rainfall_data,
                           id_vars=['SUBDIVISION', 'YEAR'],
                           var_name='Month',
                           value_name='Historical_Rainfall')

# Convert monthly data to long format
monthly_long = pd.melt(monthly_rainfall_data,
                       id_vars=['STATE_UT_NAME', 'DISTRICT'],
                       var_name='Month',
                       value_name='Monthly_Rainfall')

# Convert 'Date' column to datetime format and extract month
daily_rainfall_data['Date'] = pd.to_datetime(daily_rainfall_data['Date'])
daily_rainfall_data['Month'] = daily_rainfall_data['Date'].dt.month_name().str[:3].str.upper()
daily_rainfall_data['Day'] = daily_rainfall_data['Date'].dt.day

# Standardize column names for merging
daily_rainfall_data.rename(columns={'State': 'STATE_UT_NAME'}, inplace=True)

# Merge daily data with monthly data
combined_data = pd.merge(daily_rainfall_data, monthly_long,
                         left_on=['STATE_UT_NAME', 'District', 'Month'],
                         right_on=['STATE_UT_NAME', 'DISTRICT', 'Month'],
                         how='left')

# Merge with historical rainfall data
combined_data = pd.merge(combined_data, historical_long,
                         left_on=['STATE_UT_NAME', 'Month'],
                         right_on=['SUBDIVISION', 'Month'],
                         how='left')

# Drop or rename columns as needed
combined_data.drop(columns=['SUBDIVISION'], inplace=True)

# Handle missing values in the combined dataset
combined_data['Monthly_Rainfall'] = combined_data['Monthly_Rainfall'].fillna(0)
combined_data['Avg_rainfall'] = combined_data['Avg_rainfall'].fillna(0)
combined_data['Historical_Rainfall'] = combined_data['Historical_Rainfall'].fillna(0)

# Define the features (X) and the target (y)
X = combined_data[['Day', 'Monthly_Rainfall', 'Historical_Rainfall']]
y = combined_data['Avg_rainfall']

# Check for NaN values in y
if y.isna().any():
    print("Warning: Target variable 'y' contains NaN values. Check your data.")
    y = y.fillna(0)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R^2 Score: {r2}')


Mean Absolute Error (MAE): 0.0704471890991986
Root Mean Squared Error (RMSE): 0.222153122856489
R^2 Score: 0.014716965548670302
