In [None]:
# ================================================
# 🌦️ Weather Data Analysis (EDA & Feature Engineering)
# Repository: data-science-projects
# ================================================


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)


In [None]:
# Step 1: Load the dataset
# The CSV location is derived from the working directory: climb two levels,
# then descend into weather_eda/data/.

working_parent = os.path.split(os.getcwd())[0]
base_dir = os.path.split(working_parent)[0]
data_path = os.path.join(base_dir, *("weather_eda", "data", "weatherHistory.csv"))

df = pd.read_csv(data_path)

# Confirmation plus a first look at the frame's dimensions and column labels.
print("‚úÖ Dataset Loaded Successfully!")
print("üìä Shape:", df.shape)
print("\nüìã Columns:\n", df.columns)


‚úÖ Dataset Loaded Successfully!
üìä Shape: (96453, 12)

üìã Columns:
 Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover', 'Pressure (millibars)', 'Daily Summary'], dtype='object')


In [None]:
# ================================================
# Plot Auto-Save Utility: output locations
# ================================================

# Project root: two levels above this file when __file__ exists (script run);
# otherwise (NameError, e.g. inside a notebook kernel) the parent of the
# current working directory.
try:
    current_file = os.path.abspath(__file__)
except NameError:
    project_root = os.path.dirname(os.getcwd())
else:
    project_root = os.path.dirname(os.path.dirname(current_file))

# If the computed root is itself a folder named "src", step out of it.
root_parent, root_name = os.path.split(project_root)
if root_name == "src":
    project_root = root_parent

# Output folders: <project_root>/result and <project_root>/result/plots
result_dir = os.path.join(project_root, "result")
plots_dir = os.path.join(result_dir, "plots")
os.makedirs(plots_dir, exist_ok=True)

# Timestamp captured once here; it is used as the filename suffix for saved plots.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

def save_plot(filename):
    """
    Write the current matplotlib figure to the plots directory as a PNG.

    The file is named ``<filename>_<timestamp>.png`` and placed in the
    module-level ``plots_dir``; ``timestamp`` is likewise read from module
    scope at call time. The current figure is closed after saving.
    """
    target_name = "{}_{}.png".format(filename, timestamp)
    full_path = os.path.join(plots_dir, target_name)
    plt.savefig(full_path, dpi=300, bbox_inches="tight")
    plt.close()
    print(f"‚úÖ Plot saved to: {full_path}")


In [69]:
# Quick dataset summary
# "Formatted Date" is only converted to datetime in a later step, so in a
# top-to-bottom run it is presumably still raw text here and min()/max()
# would compare strings. Parse a local copy (df itself is not modified) so
# the reported range is chronological regardless of cell execution order.
parsed_dates = pd.to_datetime(df["Formatted Date"], utc=True)
print("üìÖ Date Range:", parsed_dates.min(), "‚Üí", parsed_dates.max())
print("üìÇ Total Records:", len(df))
print("üèôÔ∏è Unique Weather Conditions:", df["Summary"].nunique())


üìÖ Date Range: 2005-12-31 23:00:00+00:00 ‚Üí 2016-12-31 22:00:00+00:00
üìÇ Total Records: 96453
üèôÔ∏è Unique Weather Conditions: 27


In [None]:
# Step 2: Overview

print("\nüîç Basic Info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output).
df.info()

print("\nüìà Descriptive Statistics:")
# Transposed so each numeric column becomes a row of summary statistics.
# display() is assumed to come from the IPython/Jupyter environment.
display(df.describe().T)



üîç Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   Formatted Date            96453 non-null  datetime64[ns, UTC]
 1   Summary                   96453 non-null  object             
 2   Precip Type               95936 non-null  object             
 3   Temperature (C)           96453 non-null  float64            
 4   Apparent Temperature (C)  96453 non-null  float64            
 5   Humidity                  96453 non-null  float64            
 6   Wind Speed (km/h)         96453 non-null  float64            
 7   Wind Bearing (degrees)    96453 non-null  float64            
 8   Visibility (km)           96453 non-null  float64            
 9   Loud Cover                96453 non-null  float64            
 10  Pressure (millibars)      96453 non-null  float64            
 1

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature (C),96453.0,11.932678,9.551546,-21.822222,4.688889,12.0,18.838889,39.905556
Apparent Temperature (C),96453.0,10.855029,10.696847,-27.716667,2.311111,12.0,18.838889,39.344444
Humidity,96453.0,0.734899,0.195473,0.0,0.6,0.78,0.89,1.0
Wind Speed (km/h),96453.0,10.81064,6.913571,0.0,5.8282,9.9659,14.1358,63.8526
Wind Bearing (degrees),96453.0,187.509232,107.383428,0.0,116.0,180.0,290.0,359.0
Visibility (km),96453.0,10.347325,4.192123,0.0,8.3398,10.0464,14.812,16.1
Loud Cover,96453.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pressure (millibars),96453.0,1003.235956,116.969906,0.0,1011.9,1016.45,1021.09,1046.38
Year,96453.0,2011.00029,3.16216,2005.0,2008.0,2011.0,2014.0,2016.0
Month,96453.0,6.523001,3.448501,1.0,4.0,7.0,10.0,12.0


In [34]:
# Step 3: Handle missing values

# Count of null entries in each column.
missing = df.isna().sum()
print("\n‚ùó Missing Values per Column:\n", missing)

# Discard rows lacking any of the core measurements below; nulls in other
# columns are left as they are.
required_cols = ["Temperature (C)", "Apparent Temperature (C)", "Humidity"]
df = df.dropna(subset=required_cols)



‚ùó Missing Values per Column:
 Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64


In [71]:
# Step 4: Parse the date column (as UTC) and derive calendar parts from it

df["Formatted Date"] = pd.to_datetime(df["Formatted Date"], utc=True)

# New column name -> datetime accessor attribute it is filled from.
for part_name, part_attr in (("Year", "year"), ("Month", "month"), ("Hour", "hour")):
    df[part_name] = getattr(df["Formatted Date"].dt, part_attr)

df.head(3)


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Year,Month,Hour,Temp_Diff,is_Rainy
0,2006-03-31 22:00:00+00:00,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,2006,3,22,-2.083333,0
1,2006-03-31 23:00:00+00:00,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,2006,3,23,-2.127778,0
2,2006-04-01 00:00:00+00:00,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,2006,4,0,0.0,0


In [36]:
# Step 5: Feature engineering

# Temp_Diff: apparent temperature minus measured temperature.
df["Temp_Diff"] = df["Apparent Temperature (C)"].sub(df["Temperature (C)"])

# is_Rainy: 1 where the Summary text contains "rain" (case-insensitive),
# 0 otherwise; missing summaries count as 0.
rain_mentioned = df["Summary"].str.contains("Rain", case=False, na=False)
df["is_Rainy"] = rain_mentioned.astype(int)

preview_cols = ["Temperature (C)", "Apparent Temperature (C)", "Temp_Diff", "is_Rainy"]
df[preview_cols].head()


Unnamed: 0,Temperature (C),Apparent Temperature (C),Temp_Diff,is_Rainy
0,9.472222,7.388889,-2.083333,0
1,9.355556,7.227778,-2.127778,0
2,9.377778,9.377778,0.0,0
3,8.288889,5.944444,-2.344444,0
4,8.755556,6.977778,-1.777778,0


In [72]:
# Outlier detection on temperature (IQR rule, fences at 1.5 x IQR)
temps = df["Temperature (C)"]
Q1 = temps.quantile(0.25)
Q3 = temps.quantile(0.75)
IQR = Q3 - Q1

# Rows strictly below the lower fence or strictly above the upper fence.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
too_cold = temps.lt(lower_fence)
too_hot = temps.gt(upper_fence)
outliers = df.loc[too_cold | too_hot]

outlier_share = len(outliers) / len(df) * 100
print(f"‚ö†Ô∏è Outliers Detected: {len(outliers)} records ({outlier_share:.2f}%)")


‚ö†Ô∏è Outliers Detected: 44 records (0.05%)


In [38]:
# Correlation of every numeric column with temperature, largest value first
numeric_corr = df.corr(numeric_only=True)
temp_corr = numeric_corr.loc[:, "Temperature (C)"].sort_values(ascending=False)
print("üî• Correlation with Temperature:\n", temp_corr)


üî• Correlation with Temperature:
 Temperature (C)             1.000000
Apparent Temperature (C)    0.992629
Temp_Diff                   0.635285
Visibility (km)             0.392847
Month                       0.148576
Hour                        0.099540
Wind Bearing (degrees)      0.029988
Year                        0.017462
Wind Speed (km/h)           0.008957
Pressure (millibars)       -0.005447
is_Rainy                   -0.005477
Humidity                   -0.632255
Loud Cover                       NaN
Name: Temperature (C), dtype: float64


In [None]:
# Wind bearing polar histogram
wind_fig = plt.figure(figsize=(6, 6))
ax = wind_fig.add_subplot(111, polar=True)

# Bearings are converted from degrees to radians for the polar axes.
theta = np.radians(df["Wind Bearing (degrees)"])
ax.hist(theta, bins=36, color='teal', alpha=0.7)

# Zero angle at the top ('N') with angles increasing clockwise (direction -1).
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
ax.set_title("üß≠ Wind Direction Frequency")

save_plot("wind_direction")
# plt.show() stays disabled: save_plot() has already closed the figure.


  plt.savefig(full_path, bbox_inches="tight", dpi=300)


‚úÖ Plot saved to: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/plots/wind_direction_20251013_120118.png


In [60]:
# Histograms of the key variables (one panel per column)
features = ["Temperature (C)", "Humidity", "Wind Speed (km/h)", "Pressure (millibars)"]
hist_axes = df.loc[:, features].hist(
    bins=40,
    figsize=(10, 6),
    color="skyblue",
    edgecolor="black",
)
plt.gcf().suptitle("üìä Distribution of Main Weather Features", fontsize=14)
save_plot("dist_of_main_weather_features")
# plt.show() stays disabled: save_plot() has already closed the figure.


  plt.savefig(full_path, bbox_inches="tight", dpi=300)


‚úÖ Plot saved to: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/plots/dist_of_main_weather_features_20251013_120118.png


In [61]:
# Step 6: Correlation analysis

# Pairwise correlations between the numeric columns only.
numeric_cols = df.select_dtypes(include="number")
corr = numeric_cols.corr()

# Annotated heatmap of the matrix, values shown to two decimals.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=heat_ax)
heat_ax.set_title("üî• Correlation Heatmap")
save_plot("correlation_heatmap")
# plt.show() stays disabled: save_plot() has already closed the figure.


  plt.savefig(full_path, bbox_inches="tight", dpi=300)


‚úÖ Plot saved to: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/plots/correlation_heatmap_20251013_120118.png


In [62]:
# Step 7: Temperature trends over time

# Plot a random subset of at most 1,000 rows. The fixed random_state makes
# the figure reproducible across re-runs, and min() avoids the ValueError
# that sample() raises when asked for more rows than the frame holds.
trend_sample = df.sample(n=min(1000, len(df)), random_state=42)

plt.figure(figsize=(10, 5))
sns.lineplot(data=trend_sample, x="Formatted Date", y="Temperature (C)", color="orange")
plt.title("üå°Ô∏è Temperature Variation Over Time")
plt.xlabel("Date")
plt.ylabel("Temperature (¬∞C)")
save_plot("temperature_var")
# No plt.show() here: save_plot() closes the figure after writing it, so a
# subsequent show() would have nothing left to draw.


  plt.savefig(full_path, bbox_inches="tight", dpi=300)


‚úÖ Plot saved to: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/plots/temperature_var_20251013_120118.png


In [None]:
# Step 8: Monthly average temperature

# Mean temperature per calendar month, pooled over all rows in df.
monthly_avg = df["Temperature (C)"].groupby(df["Month"]).mean()

month_ax = monthly_avg.plot.bar(color="skyblue")
month_ax.set_title("üìÜ Average Monthly Temperature")
month_ax.set_xlabel("Month")
month_ax.set_ylabel("Avg Temp (¬∞C)")
save_plot("ave_temperature")
# plt.show() stays disabled: save_plot() has already closed the figure.


‚úÖ Plot saved to: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/plots/ave_temperature_20251013_120118.png


  plt.savefig(full_path, bbox_inches="tight", dpi=300)


In [None]:
# Step 9: Weather condition distribution

plt.figure(figsize=(8, 5))

# The ten most frequent Summary values, drawn as horizontal bars.
top_conditions = df["Summary"].value_counts().iloc[:10]
cond_ax = top_conditions.plot.barh(color="lightgreen")
cond_ax.set_title("‚òÅÔ∏è Top 10 Weather Conditions")
cond_ax.set_xlabel("Count")
cond_ax.set_ylabel("Condition")
save_plot("top_weather_conditions")
# plt.show() stays disabled: save_plot() has already closed the figure.


‚úÖ Plot saved to: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/plots/top_weather_conditions_20251013_120118.png


In [45]:
# Insights summary: headline figures computed from df
print("üìç Insights Summary:")

temperature = df["Temperature (C)"]
mean_temp = temperature.mean()
print(f"‚Ä¢ Average temperature: {mean_temp:.2f}¬∞C")

max_temp = temperature.max()
print(f"‚Ä¢ Highest temperature recorded: {max_temp:.2f}¬∞C")

# mode() can return several values; the first one is reported.
top_condition = df["Summary"].mode()[0]
print(f"‚Ä¢ Most common weather condition: {top_condition}")

# Disabled rainy-record line, kept for reference:
# print(f"‚Ä¢ Rainy days detected: {df['is_Rainy'].sum()} out of {len(df)} ({(df['is_Rainy'].sum()/len(df))*100:.1f}%)")


üìç Insights Summary:
‚Ä¢ Average temperature: 11.93¬∞C
‚Ä¢ Highest temperature recorded: 39.91¬∞C
‚Ä¢ Most common weather condition: Partly Cloudy


In [None]:
# ================================================
# Data Saving Utility: save results under <project root>/result
# ================================================

# Resolve the project root. When run as a script, climb two levels from this
# file, matching the plot auto-save utility. The previous three-level climb
# put the CSVs one directory above the result/ folder that holds the plots.
try:
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    # __file__ is undefined in a notebook kernel; start from the working directory.
    project_root = os.getcwd()

# If the resolved root is itself a folder named "src", step out of it.
if os.path.basename(project_root) == "src":
    project_root = os.path.dirname(project_root)

result_dir = os.path.join(project_root, "result")
os.makedirs(result_dir, exist_ok=True)

# Fresh timestamp for the CSV filenames. Rebinding this module-level name
# also changes the suffix save_plot() uses for plots saved after this cell.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

clean_path = os.path.join(result_dir, f"cleaned_weather_{timestamp}.csv")
corr_path = os.path.join(result_dir, f"correlation_{timestamp}.csv")

# Save the cleaned dataset and the temperature correlations.
# temp_corr is the Series of correlations with "Temperature (C)" from the
# earlier correlation cell, not the full pairwise matrix.
df.to_csv(clean_path, index=False)
temp_corr.to_csv(corr_path, header=True)

print("‚úÖ Files saved successfully at project root!")
print(f"üìÅ Cleaned dataset: {clean_path}")
print(f"üìÅ Correlation matrix: {corr_path}")


‚úÖ Files saved successfully at project root!
üìÅ Cleaned dataset: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/cleaned_weather_20251013_121031.csv
üìÅ Correlation matrix: /home/dennis/Desktop/my_git_project/data_science_predict/weather_eda/result/correlation_20251013_121031.csv
