# Model building

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Function to load data from an SQLite database
def load_data(database_path, query):
    """Run ``query`` against an SQLite database and return the rows as a DataFrame.

    Args:
        database_path: Path handed to ``sqlite3.connect``.
        query: SQL text handed to ``pandas.read_sql_query``.

    Returns:
        A ``pandas.DataFrame`` holding the query result.

    Raises:
        Whatever ``sqlite3.connect`` or ``pandas.read_sql_query`` raises
        (for example when the query references a missing table).
    """
    conn = sqlite3.connect(database_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query fails, so the database
        # handle is not leaked on the error path.
        conn.close()

# Function to preprocess data
def preprocess_data(df):
    """Return a modelling-ready copy of ``df``.

    Three steps are applied in order:

    1. Remove the identifier, name, location and category columns listed
       in ``unused_columns``.
    2. Replace the ``price`` strings ``'$'`` .. ``'$$$$'`` with 0 .. 3.
       Values outside that mapping come back from ``map`` as NaN.
    3. Replace every remaining NaN with 0. Note that a missing price
       therefore ends up with the same code as ``'$'``.

    Args:
        df: Frame containing all of the dropped columns plus ``price``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    unused_columns = [
        "fsq_id", "category_id", "location_postcode", "location_region",
        "location_timezone", 'name_x', "name_y", "name", "chains",
        "location_country", "location_cross_street",
        "location_formatted_address", "location_locality",
        "category_name", "category",
    ]
    price_levels = {'$': 0, '$$': 1, '$$$': 2, '$$$$': 3}

    trimmed = df.drop(columns=unused_columns)
    trimmed["price"] = trimmed.price.map(price_levels)
    return trimmed.fillna(0)

# Function to train and evaluate a model
def train_and_evaluate(X, y):
    """Fit an ordinary-least-squares model of ``y`` on ``X`` and print its summary.

    An intercept column is added to ``X`` with ``sm.add_constant`` before
    fitting, so callers pass only the predictor columns.

    Args:
        X: Predictor columns.
        y: Target values.

    Returns:
        The fitted results object produced by ``sm.OLS(y, X).fit()``.
        Returning it lets callers inspect or reuse the model instead of
        only reading the printed summary.
    """
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    print(model.summary())
    return model

# Where the data lives and which table to pull
database_path = 'G:/Data/Python Project/database.sqlite'
query = "SELECT * FROM merged_data"

# Read the table and clean it in a single step
df = preprocess_data(load_data(database_path, query))

# Target is the bike count; every other remaining column is a predictor
y = df["bikes_available"]
X = df.drop(columns=["bikes_available"])

# Fit the regression and print its summary
train_and_evaluate(X, y)


                            OLS Regression Results                            
Dep. Variable:        bikes_available   R-squared:                       0.176
Model:                            OLS   Adj. R-squared:                  0.166
Method:                 Least Squares   F-statistic:                     18.17
Date:                Fri, 23 Jun 2023   Prob (F-statistic):           9.27e-25
Time:                        02:02:57   Log-Likelihood:                -2195.3
No. Observations:                 691   AIC:                             4409.
Df Residuals:                     682   BIC:                             4449.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         3442.6150    558.051      6.169   

## RESULTS: The model is not a good fit. The R-squared of 0.176 is low, which means the predictors explain only about 18% of the variance in `bikes_available`.