In [1]:
# Import libraries
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load dataset (pandas is already imported in the first cell, so it is not
# re-imported here)
df = pd.read_csv('top100cities_weather_data.csv')
# Bare last expression so the notebook renders a preview of the frame
df

Unnamed: 0,City,Temperature (Celsius),Wind Speed (m/s),Latitude,Longitude,Description,Country
0,Paris,10.46,2.06,48.8534,2.3488,clear sky,France
1,Dubai,29.32,3.09,25.2582,55.3047,clear sky,United Arab Emirates
2,Madrid,9.97,3.60,40.4165,-3.7026,clear sky,Spain
3,Tokyo,21.58,2.06,35.6895,139.6917,broken clouds,Japan
4,Amsterdam,8.00,3.58,52.3740,4.8897,clear sky,Netherlands
...,...,...,...,...,...,...,...
95,Punta Cana,26.87,6.69,18.5818,-68.4043,few clouds,Dominican Republic
96,Guilin,24.06,0.97,25.2819,110.2864,clear sky,China
97,Hanoi,28.00,2.49,21.0245,105.8412,clear sky,Vietnam
98,Cairo,19.42,4.63,30.0626,31.2497,clear sky,Egypt


In [3]:
# Print column names, dtypes and non-null counts for the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   City                   100 non-null    object 
 1   Temperature (Celsius)  100 non-null    float64
 2   Wind Speed (m/s)       100 non-null    float64
 3   Latitude               100 non-null    float64
 4   Longitude              100 non-null    float64
 5   Description            100 non-null    object 
 6   Country                100 non-null    object 
dtypes: float64(4), object(3)
memory usage: 5.6+ KB


In [4]:
# Count missing values per column
df.isna().sum()

City                     0
Temperature (Celsius)    0
Wind Speed (m/s)         0
Latitude                 0
Longitude                0
Description              0
Country                  0
dtype: int64

In [5]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) next to the numeric ones
df.describe(include='all')


Unnamed: 0,City,Temperature (Celsius),Wind Speed (m/s),Latitude,Longitude,Description,Country
count,100,100.0,100.0,100.0,100.0,100,100
unique,100,,,,,13,47
top,Paris,,,,,clear sky,United States
freq,1,,,,,40,9
mean,,18.8617,3.5992,31.249418,18.550782,,
std,,7.017078,2.66491,19.451234,75.876596,,
min,,5.29,0.45,-34.6132,-157.8583,,
25%,,13.825,1.54,23.4896,-8.741575,,
50%,,18.005,2.925,35.17365,18.5524,,
75%,,26.12,4.63,43.825025,74.154525,,


In [6]:
# Drop rows with missing values (if any).
# Rebind the name instead of using inplace=True: the result is the same, but
# the cell no longer mutates a frame that earlier cells already displayed.
df = df.dropna()


In [7]:
# Label encoding
le_city = LabelEncoder()
le_country = LabelEncoder()
le_weather = LabelEncoder()


In [8]:
# Fit each encoder on its source column and store the integer codes in a new
# column next to it
for encoder, source_col, encoded_col in (
    (le_city, 'City', 'City_encoded'),
    (le_country, 'Country', 'Country_encoded'),
    (le_weather, 'Description', 'Weather_encoded'),
):
    df[encoded_col] = encoder.fit_transform(df[source_col])

In [9]:
# Feature matrix: encoded city and country plus the coordinates
feature_columns = ['City_encoded', 'Country_encoded', 'Latitude', 'Longitude']
X = df[feature_columns]

# Targets: temperature and wind speed (regression), weather code (classification)
y_temp, y_wind, y_weather = (
    df['Temperature (Celsius)'],
    df['Wind Speed (m/s)'],
    df['Weather_encoded'],
)

In [10]:
# Splitting: pass all three targets through ONE train_test_split call so that
# features and every target share exactly the same train/test rows. The
# previous version made two separate calls and depended on the seed and test
# size being copied identically to keep them aligned.
(X_train_reg, X_test_reg,
 y_train_temp, y_test_temp,
 y_train_wind, y_test_wind,
 y_train_cls, y_test_cls) = train_test_split(
    X, y_temp, y_wind, y_weather, test_size=0.2, random_state=42)

# The classifier uses the same feature rows as the regressors; keep the
# original names so the cells below keep working unchanged.
X_train_cls, X_test_cls = X_train_reg, X_test_reg



In [11]:
# Random-forest settings shared by all three estimators
rf_params = dict(n_estimators=200, max_depth=None, random_state=42)

# Two regressors (temperature, wind speed) and one classifier (weather description)
temp_model = RandomForestRegressor(**rf_params)
wind_model = RandomForestRegressor(**rf_params)
weather_model = RandomForestClassifier(**rf_params)

In [12]:
# Train each estimator on its own (features, target) pair, in the same order
# as before: temperature, wind speed, weather
for estimator, features, target in (
    (temp_model, X_train_reg, y_train_temp),
    (wind_model, X_train_reg, y_train_wind),
    (weather_model, X_train_cls, y_train_cls),
):
    estimator.fit(features, target)

# fit() returns the estimator itself; echo the classifier so the cell still
# ends on the same value as the original last expression
weather_model


In [13]:
# Predictions on the held-out rows
temp_preds, wind_preds = (
    temp_model.predict(X_test_reg),
    wind_model.predict(X_test_reg),
)
weather_preds = weather_model.predict(X_test_cls)

In [14]:
# Regression metrics: RMSE is the square root of the mean squared error
mse_temp = mean_squared_error(y_test_temp, temp_preds)
mse_wind = mean_squared_error(y_test_wind, wind_preds)
rmse_temp, rmse_wind = np.sqrt(mse_temp), np.sqrt(mse_wind)

# Classification metric: share of test rows whose predicted label matches
weather_acc = accuracy_score(y_test_cls, weather_preds)


In [15]:
# Report the three evaluation metrics, one per line
for label, value in (
    ("Temperature RMSE:", rmse_temp),
    ("Wind Speed RMSE:", rmse_wind),
    ("Weather Description Accuracy:", weather_acc),
):
    print(label, value)


Temperature RMSE: 3.5357876255687337
Wind Speed RMSE: 2.4401978091693697
Weather Description Accuracy: 0.3


In [16]:
# Sample input for demo (replace with user input in real usage).
# The coordinates are the "new york example" from the original cell.
# NOTE(review): City_encoded and Country_encoded are hard-coded to 0, i.e.
# whichever labels the encoders assigned code 0, not necessarily New York or
# its country. Encode real names with le_city.transform / le_country.transform
# when wiring this to user input.
sample_input = pd.DataFrame(
    [[0, 0, 40.71, -74.01]],
    columns=['City_encoded', 'Country_encoded', 'Latitude', 'Longitude'])

# Get prediction probabilities (single input row, so take the first output row)
probs = weather_model.predict_proba(sample_input)[0]

# Get top 3 predictions.
# predict_proba columns follow weather_model.classes_, which contains only the
# encoded labels present in the training split. A column position is therefore
# NOT an encoded label and must be mapped through classes_ before decoding.
top3_idx = np.argsort(probs)[-3:][::-1]
top3_labels = le_weather.inverse_transform(weather_model.classes_[top3_idx])
top3_probs = probs[top3_idx]

# Format output; zip stops early if the model knows fewer than three classes
for prefix, label, prob in zip(("Most likely", "Next", "Then"), top3_labels, top3_probs):
    print(f"{prefix}: {label} ({prob * 100:.1f}%)")



Most likely: clear sky (41.5%)
Next: scattered clouds (24.5%)
Then: heavy intensity rain (11.5%)


In [74]:
# Persist the fitted models and encoders for reuse outside this notebook.
# joblib.dump does not create missing directories, so make sure the target
# directory exists first (joblib and os are imported in the first cell).
os.makedirs("model", exist_ok=True)

joblib.dump(temp_model, "model/temp_model.pkl")
joblib.dump(wind_model, "model/wind_model.pkl")
joblib.dump(weather_model, "model/weather_model.pkl")
# The encoders are stored together as one tuple: (city, country, weather)
joblib.dump((le_city, le_country, le_weather), "model/encoders.pkl")


['model/encoders.pkl']