In [14]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read every record of the rainfall CSV into memory.
# csv.DictReader takes the first line as the header, so each element of
# `data` is a dict mapping column name -> raw cell text.
with open('rainfall_data.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    data = [record for record in reader]

# Convert data into features (X) and target (y)
# Feature columns, in the order they appear in each row of X.
FEATURE_COLUMNS = ['tavg', 'tmin', 'tmax', 'wspd', 'pres']
# Column holding the value the model is trained to predict.
TARGET_COLUMN = 'prcp'
# Cell text (after stripping whitespace) that marks a missing measurement.
MISSING_MARKERS = ('', 'None')

X = []
y = []
for row in data:
    # A missing column raises KeyError here, exactly as direct row[...] access would.
    raw_values = [row[column] for column in FEATURE_COLUMNS + [TARGET_COLUMN]]
    # Skip rows with any missing value. The target is checked as well as the
    # features: float('None') raises ValueError, so a single row with a missing
    # 'prcp' would otherwise abort the whole conversion. csv.DictReader fills
    # cells absent from a short row with None, and blank cells arrive as '',
    # so both are treated as missing too.
    if any(value is None or value.strip() in MISSING_MARKERS for value in raw_values):
        continue
    # Extract features and target (the last converted value is the target).
    *features, target = (float(value) for value in raw_values)
    X.append(features)
    y.append(target)

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the training portion (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict on each split and score it with mean squared error.
y_pred_train = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred_train)

y_pred_test = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)

# Report both errors.
print(f"Mean Squared Error (Train): {mse_train}")
print(f"Mean Squared Error (Test): {mse_test}")


Mean Squared Error (Train): 135.83660020134994
Mean Squared Error (Test): 92.0895123000136


In [15]:
from sklearn.metrics import r2_score

# Coefficient of determination for each split, computed from the predictions
# that were already made on the train and test sets.
r2_train = r2_score(y_true=y_train, y_pred=y_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_pred_test)

# Report both scores, one line per split.
for split_name, split_score in (("Train", r2_train), ("Test", r2_test)):
    print(f"R^2 Score ({split_name}): {split_score}")


R^2 Score (Train): 0.12703172287605324
R^2 Score (Test): 0.2052466659579295


In [16]:
import random

# Draw a reproducible sample of 5 distinct row positions from the test set.
random.seed(42)
indices = random.sample(range(len(X_test)), 5)

# Model output and observed value for each sampled row.
predicted_rainfall = model.predict(X_test[indices])
actual_rainfall = y_test[indices]

# Tabulate predicted vs. actual rainfall together with the absolute difference.
print("Comparison of Predicted and Actual Rainfall:")
print("Day\t\tPredicted (mm)\tActual (mm)\tDifference (mm)")
for day, (predicted, actual) in enumerate(zip(predicted_rainfall, actual_rainfall), start=1):
    difference = abs(predicted - actual)
    print(f"{day}\t\t{predicted:.2f}\t\t{actual:.2f}\t\t{difference:.2f}")


Comparison of Predicted and Actual Rainfall:
Day		Predicted (mm)	Actual (mm)	Difference (mm)
1		10.55		7.90		2.65
2		4.69		0.00		4.69
3		20.26		39.60		19.34
4		5.29		3.80		1.49
5		-0.07		0.00		0.07


In [17]:
# Rainfall strictly above this amount (in mm) is labelled 'Yes'; anything else 'No'.
threshold = 0.1

# Re-run the model on the currently selected test rows.
predicted_rainfall = model.predict(X_test[indices])

# Turn the continuous predictions into 'Yes' / 'No' labels.
predicted_labels = []
for predicted_mm in predicted_rainfall:
    predicted_labels.append('Yes' if predicted_mm > threshold else 'No')

# Label the observed values the same way.
# NOTE: actual_rainfall is not assigned in this cell; it must already hold the
# observed values for the same `indices`.
actual_labels = []
for actual_mm in actual_rainfall:
    actual_labels.append('Yes' if actual_mm > threshold else 'No')

# Print the two label columns side by side, numbering the rows from 1.
print("Comparison of Predicted and Actual Rainfall:")
print("Day\t\tPredicted\tActual")
for day, (predicted_label, actual_label) in enumerate(zip(predicted_labels, actual_labels), start=1):
    print(f"{day}\t\t{predicted_label}\t\t{actual_label}")


Comparison of Predicted and Actual Rainfall:
Day		Predicted	Actual
1		Yes		Yes
2		Yes		No
3		Yes		Yes
4		Yes		Yes
5		No		No


In [18]:
# Number of rows held out in the test split.
print(X_test.shape[0])



219


In [19]:
# Re-seed and draw 100 distinct row positions from the test set.
random.seed(42)
indices = random.sample(range(len(X_test)), 100)

# Model output for the sampled rows.
predicted_rainfall = model.predict(X_test[indices])

# Label predictions and observations 'Yes' / 'No' using the existing threshold.
predicted_labels = []
for predicted_mm in predicted_rainfall:
    predicted_labels.append('Yes' if predicted_mm > threshold else 'No')

actual_labels = []
for actual_mm in y_test[indices]:
    actual_labels.append('Yes' if actual_mm > threshold else 'No')

# Accuracy = share of sampled rows whose predicted label matches the actual label.
correct_predictions = 0
for predicted_label, actual_label in zip(predicted_labels, actual_labels):
    if predicted_label == actual_label:
        correct_predictions += 1
accuracy = correct_predictions / len(indices)

print(f"Accuracy of predictions over 100 random days: {accuracy * 100:.2f}%")


Accuracy of predictions over 100 random days: 64.00%
