In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import scipy.stats as stats

In [None]:
# Location of the weather classification CSV inside the Kaggle input directory
DATA_PATH = '/kaggle/input/weather-type-classification/weather_classification_data.csv'

# Read the raw data into a DataFrame
weather_data = pd.read_csv(DATA_PATH)

In [None]:
# Summarize the dataset: column names, dtypes, non-null counts, and memory usage
weather_data.info()

In [None]:
# Preview the first few rows of the dataset.
# Leaving the frame as the cell's last expression lets Jupyter render it as a
# rich table instead of the plain-text dump that print() produces.
weather_data.head()

In [None]:
# Handle missing values (if any) by carrying the previous row's value forward.
# DataFrame.ffill returns a new frame rather than modifying the original, so the
# result has to be assigned back for the fill to take effect.
weather_data = weather_data.ffill(axis=0)

# A forward fill cannot fill gaps in the very first rows (nothing precedes them);
# show the per-column count of values that are still missing.
weather_data.isna().sum()

In [None]:
# Replace each categorical column with integer codes, keeping one fitted
# LabelEncoder per column so the codes can be mapped back to labels later.
label_encoders = {
    name: LabelEncoder()
    for name in ['Cloud Cover', 'Season', 'Location', 'Weather Type']
}
for column, le in label_encoders.items():
    weather_data[column] = le.fit_transform(weather_data[column])

In [None]:
# Standardize the numeric columns with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split performed later in this notebook, so test rows contribute
# to the scaling statistics.
# NOTE(review): 'Temperature' is scaled here and is later used as the
# regression target, so the reported MAE/MSE are in standardized units.
numerical_features = [
    'Temperature',
    'Wind Speed',
    'Precipitation (%)',
    'Atmospheric Pressure',
    'Visibility (km)',
    'Humidity',
    'UV Index',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(weather_data[numerical_features])
weather_data[numerical_features] = scaled_values

In [None]:
# Pairwise correlations between all columns of the prepared dataset
correlation_matrix = weather_data.corr()

# Draw the matrix as an annotated heatmap on an explicit axes object,
# with the colour scale pinned to the full [-1, 1] correlation range
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="crest", vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Separate the regression target from the predictors
target_column = 'Temperature'
y = weather_data[target_column]                  # Target variable
X = weather_data.drop(columns=[target_column])   # Features: every remaining column

In [None]:
# Split the dataset into training (80%) and testing (20%) sets.
# train_test_split is already imported in the notebook's top imports cell, so it
# is not re-imported here. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one step; fit() returns the estimator itself.
regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Echo the fitted estimator as the cell output
regressor

In [None]:
# Predict the target for the held-out test features with the fitted regressor
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions against the true values
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line
for label, value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("R-squared Score", r2),
):
    print(f"{label}: {value}")

In [None]:
# Column labels of the feature matrix
feature_names = X.columns

# Per-feature importance scores exposed by the trained Random Forest model
feature_importances = regressor.feature_importances_

In [None]:
# matplotlib.pyplot (plt) and seaborn (sns) are already imported in the
# notebook's top imports cell, so they are not re-imported here.

# Pair each feature name with its importance and order from most to least
# important, built in a single chained expression.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the importances on an explicit axes object
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, ax=ax)
ax.set_title('Feature Importances')
plt.show()