In [1]:
import pandas as pd 
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [2]:
from local_config import DB_USER, DB_PASSWORD, DB_SERVER_NAME, DB_DATABASE_NAME
connection_url = URL.create(
    drivername = "postgresql+pg8000", 
    username = DB_USER,
    password = DB_PASSWORD,
    host = DB_SERVER_NAME, 
    port = 5432,
    database = DB_DATABASE_NAME, 
)

engine = create_engine(connection_url)

In [3]:
atmosphere_df = pd.read_sql("atmosphere", engine)
city_df = pd.read_sql("city", engine).rename(columns={"name": "city_name"})
temperature_df = pd.read_sql("temperature", engine)

In [4]:
staging_df = pd.merge(atmosphere_df, temperature_df, on=["city_id", "datetime"], how="inner")
staging_df["date"] = staging_df["datetime"].astype('datetime64').dt.date
merged_df = pd.merge(staging_df, city_df, on=["city_id"], how="inner")
merged_df["city_name"] = merged_df["city_name"].str.lower()

In [5]:
merged_df.head()

Unnamed: 0,city_id,datetime,main_pressure,main_humidity,main_temp,main_feels_like,main_temp_min,main_temp_max,date,city_name,coord_lon,coord_lat
0,2172517,2022-02-13T06:25:27,1020.0,44.0,26.21,26.21,24.81,28.58,2022-02-13,canberra,149.1281,-35.2835
1,2172517,2022-02-13T06:58:06,1021.0,44.0,26.03,26.03,24.81,27.43,2022-02-13,canberra,149.1281,-35.2835
2,2172517,2022-02-13T07:21:27,1019.0,43.0,26.09,26.09,25.3,27.43,2022-02-13,canberra,149.1281,-35.2835
3,2172517,2022-02-13T08:39:16,1020.0,45.0,23.94,23.56,21.3,26.36,2022-02-13,canberra,149.1281,-35.2835
4,2172517,2022-02-13T07:44:19,1019.0,44.0,25.49,25.24,24.49,26.82,2022-02-13,canberra,149.1281,-35.2835


In [6]:
grouped_df = merged_df[[
    "main_pressure", 
    "main_humidity", 
    "city_name",
    "date",
    "main_temp"
]].groupby(by=["city_name", "date"]).mean().reset_index()
grouped_df = grouped_df[grouped_df["main_temp"]<50] # remove temps that were recorded in farenheit 
grouped_df.head()

Unnamed: 0,city_name,date,main_pressure,main_humidity,main_temp
0,adelaide,2022-02-13,1015.357895,43.957895,23.812842
1,adelaide,2022-02-14,1018.985401,51.474453,24.64562
2,adelaide,2022-02-15,1019.285714,46.357143,28.414464
3,brisbane,2022-02-13,1021.166667,81.4375,21.560208
4,brisbane,2022-02-14,1022.612676,88.471831,21.439155


In [7]:
clean_df = pd.get_dummies(grouped_df,columns=["city_name"])

In [8]:
X = clean_df[[
    "main_pressure", 
    "main_humidity", 
    "city_name_adelaide", 
    "city_name_brisbane", 
    "city_name_canberra", 
    "city_name_darwin", 
    "city_name_hobart", 
    "city_name_melbourne", 
    "city_name_perth", 
    "city_name_sydney"
]]

y = clean_df["main_temp"].values.reshape(-1,1)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)



In [11]:
# scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)



In [12]:
# train the model 
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)
model.score(X_test_scaled, y_test_scaled)

0.5407394133250206

In [13]:
# create a city encoder 
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoder.fit(grouped_df[[
    "city_name"
]])

OneHotEncoder()

In [18]:
# save model 
import joblib 
joblib.dump(model, "../app/static/py/model.sav")
joblib.dump(y_scaler, "../app/static/py/y_scaler.sav")
joblib.dump(X_scaler, "../app/static/py/x_scaler.sav")
joblib.dump(encoder, "../app/static/py/encoder.sav")

['../app/static/py/encoder.sav']

In [19]:
def predict(user_inputs):
    # load model binaries 
    model = joblib.load("../app/static/py/model.sav")
    encoder = joblib.load("../app/static/py/encoder.sav")
    X_scaler = joblib.load("../app/static/py/x_scaler.sav")
    y_scaler  = joblib.load("../app/static/py/y_scaler.sav")

    # get the user input data 
    pressure = user_inputs["pressure"]
    humidity = user_inputs["humidity"]
    city_name = user_inputs["city_name"]
    
    # store city names into a df 
    city_input_df = pd.DataFrame({
        "city_name": [city_name]
    })

    # use encoder to transform the city df 
    X_transformed = encoder.transform(city_input_df)
    city_df = pd.DataFrame(columns=[*encoder.categories_], data=X_transformed.toarray())
    
    # store pressure and humidty into df 
    input_df = pd.DataFrame({
        "pressure": [pressure],
        "humidity": [humidity]
    })

    # combine both df's using indexes 
    df = input_df.merge(city_df, left_index=True, right_index=True)

    # scale the X input df 
    X_scaled = X_scaler.transform(df)

    # obtain prediction (y) 
    prediction_scaled = model.predict(X_scaled)
    
    # scale prediction to human readable terms i.e. celcius 
    prediction = y_scaler.inverse_transform(prediction_scaled)
    return prediction[0][0]

In [20]:
# test a single prediction 
user_inputs = {
    "pressure": 1011, 
    "humidity": 50,
    "city_name": "perth"   
}
print(f"predicted temp is: {predict(user_inputs).round(2)} celcius")

predicted temp is: 33.99 celcius


