In [None]:
# Install necessary libraries
!pip install pandas numpy matplotlib seaborn scikit-learn openpyxl imbalanced-learn xgboost

In [None]:
# Import essential libraries
import pandas as pd       
import numpy as np        
import matplotlib.pyplot as plt  
import seaborn as sns     
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler      
from sklearn.impute import SimpleImputer
import imblearn

In [None]:
# Load the weather dataset from a CSV file located in the working directory,
# then preview the first rows to confirm the file parsed as expected.
df = pd.read_csv('weather_data.csv')
df.head()

In [None]:
# Show the dtype pandas inferred for every column, so columns that still
# need conversion (e.g. 'date', converted to datetime below) can be spotted.
print(df.dtypes)

In [None]:
# (number of rows, number of columns) of the freshly loaded frame
df.shape

# Data Cleaning

In [None]:
# Convert 'date' to datetime, overwriting the column in place so that later
# cells (e.g. the rain-over-time plot) work with real timestamps.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Tally the NaN entries per column before any imputation is applied
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Keep every row: replace missing readings with the mean of their own column
# instead of dropping the affected observations.
for column in ('avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [None]:
# Re-count NaNs per column to confirm the imputation covered everything
remaining_missing = df.isna().sum()
print("Missing values after imputing:\n", remaining_missing)

In [None]:
# Check unique values in 'rain_or_not' before encoding: the mapping applied
# in the next cell only recognises the exact strings 'Rain' and 'No Rain'.
print(df['rain_or_not'].unique())

In [None]:
# Encode 'rain_or_not': 'Rain' -> 1, 'No Rain' -> 0
# The guard makes this cell safe to re-run: once the column is numeric it is
# left alone, whereas mapping it a second time would turn every 0/1 into NaN
# because those values are not keys of the label mapping.
if not pd.api.types.is_numeric_dtype(df['rain_or_not']):
    df['rain_or_not'] = df['rain_or_not'].map({'Rain': 1, 'No Rain': 0})

In [None]:
# Inspect the encoded target: first the distinct values it now holds ...
encoded_target = df['rain_or_not']
print(encoded_target.unique())

# ... then how many entries are NaN after the encoding step
print(encoded_target.isnull().sum())

In [None]:
# Columns inspected for negative readings
features_to_check = ['humidity', 'avg_wind_speed', 'cloud_cover', 'pressure']

# Count the negative entries of all listed columns in one vectorised pass,
# then report them one line per feature.
negative_counts = df[features_to_check].lt(0).sum()
for feature, negative_count in negative_counts.items():
    print(f"Number of negative values in {feature}: {negative_count}")

In [None]:
# Peek at up to 10 random rows. A fixed random_state keeps the preview
# identical across "Restart & Run All", and capping n at len(df) avoids the
# ValueError pandas raises when asked for more rows than exist.
df.sample(n=min(10, len(df)), random_state=42)

In [None]:
features_to_check_outliers = ['humidity', 'avg_wind_speed', 'cloud_cover', 'pressure', 'avg_temperature']

# Draw one standalone boxplot per feature so points beyond the whiskers
# (potential outliers) are easy to see.
for feature in features_to_check_outliers:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()

In [None]:
# Filter rows where avg_wind_speed is an outlier.
# Rather than a hand-tuned constant, derive the threshold from the data using
# the default boxplot rule: anything above Q3 + 1.5 * IQR lies beyond the
# upper whisker.
wind_q1, wind_q3 = df['avg_wind_speed'].quantile([0.25, 0.75])
wind_upper_fence = wind_q3 + 1.5 * (wind_q3 - wind_q1)
print(f"Upper whisker fence for avg_wind_speed: {wind_upper_fence:.2f}")

outliers = df[df['avg_wind_speed'] > wind_upper_fence]
print(outliers[['avg_wind_speed', 'rain_or_not']])

# Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many observations fall into each target class
ax = sns.countplot(data=df, x='rain_or_not')
ax.set_title('Distribution of Rain vs. No Rain')
ax.set_xlabel('Rain (1) or No Rain (0)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Numerical columns analysed throughout the EDA section
numerical_features = ['avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover', 'pressure']

# One histogram (with a KDE overlay) per numerical feature
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Compare each numerical feature's distribution between the two target
# classes with side-by-side boxplots.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x='rain_or_not', y=feature, ax=ax)
    ax.set_title(f'{feature} vs. Rain or Not')
    ax.set_xlabel('Rain (1) or No Rain (0)')
    ax.set_ylabel(feature)
    plt.show()

In [None]:
import pandas as pd

# Make sure 'date' holds datetimes before it is used as the x-axis
df['date'] = pd.to_datetime(df['date'])

# Scatter the target against time: markers only, no connecting line
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['date'], df['rain_or_not'], marker='o', linestyle='None')
ax.set_title('Rain or Not Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Rain (1) or No Rain (0)')
plt.show()

In [None]:
# Pairwise correlations between the numerical features
corr_matrix = df[numerical_features].corr()

# Render the matrix as an annotated heatmap; the colour scale is pinned to
# [-1, 1] so that zero correlation sits at the neutral midpoint.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Create pair plots for numerical features, colored by 'rain_or_not'
sns.pairplot(df, vars=numerical_features, hue='rain_or_not', palette='Set1')
# plt.suptitle targets pyplot's current figure, i.e. the one pairplot just
# created; y=1.02 places the title slightly above the top edge of the figure.
plt.suptitle('Pair Plots of Numerical Features by Rain or Not', y=1.02)
plt.show()

In [None]:
# Group by 'rain_or_not' and compute summary statistics
summary_stats = df.groupby('rain_or_not')[numerical_features].describe()

# Display the summary statistics.
# describe() yields 8 statistics for each of the 5 features, i.e. 40 columns,
# which a plain print() truncates. Showing the transposed table puts one
# (feature, statistic) pair per row and one column per target class, so the
# whole table is visible; summary_stats itself is left untransposed.
display(summary_stats.T)