In [37]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the whole weather export into memory: one dict per CSV row, keyed by
# the column names taken from the file's header line.
csv_path = 'Bangalore Weather Data (Visual Crossing Weather).csv'
with open(csv_path, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    data = [row for row in reader]

# Convert data into features (X) and target (y)

# Cell contents that mean "no reading". csv.DictReader yields '' for an empty
# field; the literal text 'None' is the marker this notebook has always
# checked for.
MISSING_MARKERS = ('', 'None')

# Columns fed to the model, in the order they appear in each feature row.
FEATURE_COLUMNS = [
    'Temperature',
    'Minimum Temperature',
    'Maximum Temperature',
    'Wind Speed',
    'Sea Level Pressure',
]
TARGET_COLUMN = 'Precipitation'


def parse_reading(raw, default=0.0):
    """Convert one CSV cell to a float, substituting `default` when it is missing.

    Args:
        raw: The cell value from csv.DictReader -- a string, or None when the
            row has fewer fields than the header.
        default: Value returned for a missing reading.

    Returns:
        float(raw) for a populated cell, otherwise `default`.

    Raises:
        ValueError: If the cell holds text that is neither a number nor one
            of the missing-value markers.
    """
    if raw is None:
        return default
    text = raw.strip()
    if text in MISSING_MARKERS:
        return default
    return float(text)


# Every column goes through the same parser, so an empty cell in any of them
# is treated as missing instead of crashing float('').
# NOTE(review): filling gaps with 0.0 is kept from the original so results do
# not change; for Sea Level Pressure a zero is presumably far from any real
# reading and may skew the fit -- consider dropping or imputing those rows.
X = []
y = []
for row in data:
    X.append([parse_reading(row[column]) for column in FEATURE_COLUMNS])
    y.append(parse_reading(row[TARGET_COLUMN]))

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear regression on the training rows (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict on both portions so the errors on seen and unseen rows can be compared.
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# Mean squared error between the observed and predicted target values.
mse_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)

print(f"Mean Squared Error (Train): {mse_train}")
print(f"Mean Squared Error (Test): {mse_test}")


Mean Squared Error (Train): 0.007300175045369603
Mean Squared Error (Test): 0.01728028509637043


In [38]:
from sklearn.metrics import r2_score

# Coefficient of determination for each split, using the predictions already
# computed for the training and test rows. 1.0 is a perfect fit; a value near
# 0 means the model does little better than always predicting the mean.
r2_train = r2_score(y_true=y_train, y_pred=y_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_pred_test)

print(f"R^2 Score (Train): {r2_train}")
print(f"R^2 Score (Test): {r2_test}")


R^2 Score (Train): 0.018549906576572073
R^2 Score (Test): 0.021361685786716933


In [39]:
# Number of rows held out for testing (first axis of the test feature array).
print(X_test.shape[0])



1339


In [40]:
# Draw 5 distinct row positions from the test set; seeding makes the draw repeatable.
import random
random.seed(42)
indices = random.sample(range(len(X_test)), 5)

# Model output and observed target value for the sampled rows.
predicted_rainfall = model.predict(X_test[indices])
actual_rainfall = y_test[indices]

# Amounts strictly above this cut-off are labelled as rain.
# NOTE(review): unit assumed to match the 'Precipitation' column -- confirm.
threshold = 0.1  # Example threshold value, adjust as needed

# Compare each array against the cut-off in one step, then spell the result
# out as 'Yes' / 'No'.
predicted_labels = ['Yes' if is_rain else 'No' for is_rain in predicted_rainfall > threshold]
actual_labels = ['Yes' if is_rain else 'No' for is_rain in actual_rainfall > threshold]

# Side-by-side table; "Day" is just a 1-based counter over the sampled rows.
print("Comparison of Predicted and Actual Rainfall (Binary Prediction):")
print("Day\t\tPredicted\tActual")
for day, (predicted, actual) in enumerate(zip(predicted_labels, actual_labels), start=1):
    print(f"{day}\t\t{predicted}\t\t{actual}")


Comparison of Predicted and Actual Rainfall (Binary Prediction):
Day		Predicted	Actual
1		No		No
2		No		No
3		No		No
4		No		No
5		No		No


In [41]:
# Same check on a larger sample: 100 distinct test-set rows, drawn with the
# same seed. `random` and `threshold` come from the previous cell.
random.seed(42)
indices = random.sample(range(len(X_test)), 100)

# Model output for the sampled rows.
predicted_rainfall = model.predict(X_test[indices])

# Rain / no-rain decision for prediction and observation, then the same
# 'Yes' / 'No' labels used in the previous cell.
rain_predicted = predicted_rainfall > threshold
rain_observed = y_test[indices] > threshold
predicted_labels = ['Yes' if flag else 'No' for flag in rain_predicted]
actual_labels = ['Yes' if flag else 'No' for flag in rain_observed]

# Share of sampled rows where the two labels agree.
# NOTE(review): if most sampled days are dry, a model that always answers
# 'No' may score about the same -- compare against that baseline before
# reading this figure as predictive skill.
correct_predictions = sum(
    predicted == actual for predicted, actual in zip(predicted_labels, actual_labels)
)
accuracy = correct_predictions / len(indices)

print(f"Accuracy of predictions over 100 random days: {accuracy * 100:.2f}%")


Accuracy of predictions over 100 random days: 93.00%
