# Model building

In [12]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import eli5
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

# Function to load data from an SQLite database
def load_data(database_path, query):
    conn = sqlite3.connect(database_path)
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

# Function to preprocess data
def preprocess_data(df):
    df = df.drop(["fsq_id", "category_id", "location_postcode", "location_region", "location_timezone",
                  'name_x', "name_y", "name", "chains", "location_country", "location_cross_street",
                  "location_formatted_address", "location_locality", "category_name", "category"], axis=1)
    
    df.price = df.price.map({'$': 0, '$$': 1, '$$$': 2, '$$$$': 3})
    df = df.fillna(0)

    # OneHotEncode price
    price = df.price.values.reshape(-1, 1)
    encoder = OneHotEncoder()
    encoder.fit(price)
    one_hot_encoded = encoder.transform(price).toarray()

    return df

# Function to train and evaluate a model
def train_and_evaluate(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print("R-squared:", reg.score(X_test, y_test))

    X_train = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train).fit()
    print(model.summary())

database_path = 'G:/Data/Python Project/database.sqlite'
query = "SELECT * FROM merged_data"

# Load data
df = load_data(database_path, query)

# Preprocess data
df = preprocess_data(df)

# Split the data into features and target
X = df.drop("bikes_available", axis=1)
y = df["bikes_available"]

# Train and evaluate the model
train_and_evaluate(X, y)


R-squared: 0.15568935331026412
                            OLS Regression Results                            
Dep. Variable:        bikes_available   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.160
Method:                 Least Squares   F-statistic:                     14.07
Date:                Wed, 07 Jun 2023   Prob (F-statistic):           1.10e-18
Time:                        13:27:14   Log-Likelihood:                -1734.2
No. Observations:                 552   AIC:                             3486.
Df Residuals:                     543   BIC:                             3525.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         272

## RESULTS: the model is not good fit because R-square is too low which show a low correlation.

**Stretch**

In [14]:
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

# Check the balance of your classes
print(y_train.value_counts())

# Separate majority and minority classes
df_majority = df[df.bikes_available==0]
df_minority = df[df.bikes_available==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,                     # sample with replacement
                                 n_samples=df_majority.shape[0],   # to match majority class
                                 random_state=123)                 # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(df_upsampled.bikes_available.value_counts())

# Define new X and y
X_upsampled = df_upsampled.drop('bikes_available', axis=1)
y_upsampled = df_upsampled.bikes_available

# Preprocess your data differently: Standardize features
scaler = StandardScaler()
X_upsampled = scaler.fit_transform(X_upsampled)

# Split the data into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

# Tune your model: Use LogisticRegressionCV to find the best regularization strength
logreg = LogisticRegressionCV(cv=5, random_state=42)
logreg.fit(X_train, y_train)

# Make predictions
y_pred = logreg.predict(X_test)

# Evaluate your model with a different metric: AUC-ROC score
print("AUC-ROC score:", roc_auc_score(y_test, y_pred))


0    384
1    168
Name: bikes_available_high, dtype: int64
0.0    71
1.0    71
Name: bikes_available, dtype: int64
AUC-ROC score: 0.8052884615384616
