In [None]:
# Importing necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Load the training set and print a quick overview of it: the first rows,
# the summary statistics, and the number of missing values per column.
train_data = pd.read_csv('/content/train_dataset.csv')
for overview in (
    train_data.head(),
    train_data.describe(),
    train_data.isna().sum(),
):
    print(overview)

# Keep only the rows without any missing value and summarise what is left.
train_data_cleaned = train_data.dropna(axis=0, how='any')
print(train_data_cleaned.describe())

# Pairwise scatter plots of the cleaned columns.
sns.pairplot(train_data_cleaned)
plt.show()

# Annotated heatmap of the column-to-column correlation matrix.
correlation_matrix = train_data_cleaned.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

# Split the cleaned frame into the regression target ('Next_Tmax') and the
# remaining columns, which serve as features.
y_train = train_data_cleaned['Next_Tmax']
X_train = train_data_cleaned.drop(columns='Next_Tmax')

# Min-max scale the features; the fitted scaler object itself is not kept.
X_train_scaled = MinMaxScaler().fit(X_train).transform(X_train)

# Candidate regressors, all with default hyper-parameters (CatBoost is only
# silenced). The dictionary key is the label used in the report below.
models = dict(
    RandomForestRegressor=RandomForestRegressor(),
    GradientBoostingRegressor=GradientBoostingRegressor(),
    LinearRegression=LinearRegression(),
    DecisionTreeRegressor=DecisionTreeRegressor(),
    SVR=SVR(),
    XGBRegressor=XGBRegressor(),
    CatBoostRegressor=CatBoostRegressor(verbose=0),
    LGBMRegressor=LGBMRegressor(),
)

# 5-fold cross-validation per candidate. The scorer returns negated MSE, so
# the sign is flipped before taking the root; the stored value is the mean of
# the per-fold RMSEs.
results = {
    label: np.sqrt(
        -cross_val_score(
            estimator,
            X_train_scaled,
            y_train,
            scoring='neg_mean_squared_error',
            cv=5,
        )
    ).mean()
    for label, estimator in models.items()
}

print("Model Performance (Lower RMSE is better):")
for label, rmse in results.items():
    print(f"{label}: {rmse}")

# Selecting the best model (CatBoostRegressor) for final predictions.
# The scaler is fitted on the training features only and kept in a variable so
# that exactly the same min/max mapping can be applied to the test set below.
# Fitting a second scaler on the test set would map the test features with
# different ranges than the ones the model was trained on.
feature_scaler = MinMaxScaler().fit(X_train)
best_model = CatBoostRegressor(verbose=0)
best_model.fit(feature_scaler.transform(X_train), y_train)

# Reading and exploring the test dataset
test_data = pd.read_csv('/content/test_dataset.csv')
print(test_data.head())

print(test_data.describe())

# Select the test columns by the training feature names (same columns, same
# order). A KeyError here means the test file lacks a feature the model was
# trained on. Then reuse the training-fitted scaler -- transform only, no refit.
test_features = test_data[X_train.columns]
test_data_scaled = feature_scaler.transform(test_features)

# Making predictions on the test dataset
test_predictions = best_model.predict(test_data_scaled)

# Assuming a threshold of 40°C for heat wave warning
heat_wave_threshold = 40

# Boolean mask over the test rows: True where the predicted value exceeds the
# threshold. The mask is used to pick the matching rows of the test frame.
test_warnings = test_predictions > heat_wave_threshold
warning_areas = test_data[test_warnings]
num_warnings = int(np.sum(test_warnings))

print(f"Number of areas with predicted heat wave: {num_warnings}")

# Print out the specific areas
print("Areas with predicted heat wave:")
print(warning_areas)

# Final message indicating whether a heat wave warning applies
if num_warnings > 0:
    print("Warning: Heat wave predicted in the following areas:")
    print(warning_areas)
else:
    print("No heat wave predicted.")


Output hidden; open in https://colab.research.google.com to view.