In [2]:
import pandas as pd

# Load data
# Read the three raw inputs: companies and reviews from CSV, shuttles from an
# Excel workbook. The reads are independent of each other.
# NOTE(review): the companies preview shows an "Unnamed: 0" column, which looks
# like a saved index; consider index_col=0 if it is not needed (confirm).
companies = pd.read_csv("../data/01_raw/companies.csv")
reviews = pd.read_csv("../data/01_raw/reviews.csv")
shuttles = pd.read_excel("../data/01_raw/shuttles.xlsx")

In [6]:
# Preview the first five company rows (head() defaults to n=5).
companies.head()

Unnamed: 0,id,company_rating,company_location,total_fleet_count,iata_approved
0,3888,1.0,Isle of Man,1.0,True
1,46728,1.0,,1.0,True
2,34618,0.38,Isle of Man,1.0,True
3,28619,1.0,Bosnia and Herzegovina,1.0,True
4,8240,,Chile,1.0,True


In [None]:
# Preprocess data
#
# The text-to-number conversions below are skipped when the column is already
# numeric. This keeps the cell re-runnable and lets it work on files whose
# columns were cleaned beforehand (the companies preview shows company_rating
# as floats); without the guard, `.str` raises
# "Can only use .str accessor with string values" on a numeric column.

# NOTE(review): astype(bool) maps every non-empty string and NaN to True, so it
# is only meaningful if these flag columns already hold booleans (as the
# companies preview suggests for iata_approved) - confirm for the shuttle flags.
companies['iata_approved'] = companies['iata_approved'].astype(bool)

# Text ratings: strip the '%' sign and divide by 100 (e.g. "38%" -> 0.38).
if not pd.api.types.is_numeric_dtype(companies['company_rating']):
    companies['company_rating'] = (
        companies['company_rating'].str.replace('%', '', regex=False).astype(float) / 100
    )

shuttles["d_check_complete"] = shuttles["d_check_complete"].astype(bool)
shuttles["moon_clearance_complete"] = shuttles["moon_clearance_complete"].astype(bool)

# Text prices: remove the currency sign and thousands separators before casting.
# regex=False is explicit because "$" is a regex metacharacter (end-of-string
# anchor) and the default of `regex` differs between pandas versions.
if not pd.api.types.is_numeric_dtype(shuttles["price"]):
    shuttles["price"] = (
        shuttles["price"]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

In [3]:
# Create model input table
# Attach each review to its shuttle, then discard the shuttle's own "id" column
# so that it cannot clash with the companies "id" used in the next join.
rated_shuttles = (
    shuttles
    .merge(reviews, left_on="id", right_on="shuttle_id")
    .drop(columns="id")
)

# Bring in the owning company's attributes and keep only fully populated rows.
model_input_table = (
    rated_shuttles
    .merge(companies, left_on="company_id", right_on="id")
    .dropna()
)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Split data
# Predictor columns passed to the model; "price" is the regression target.
features = [
    "engines",
    "passenger_capacity",
    "crew",
    "d_check_complete",
    "moon_clearance_complete",
    "iata_approved",
    "company_rating",
    "review_scores_rating",
]
X, y = model_input_table[features], model_input_table["price"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

In [5]:
# Train model
# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(X_train, y_train)

# Evaluate model
# Predict on the held-out rows and report the R^2 score; the bare `score`
# expression on the last line is what the cell displays.
y_pred = regressor.predict(X_test)
score = r2_score(y_test, y_pred)
score

0.38788878620813416

In [None]:
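# Step 1: Modularise the code: wrap each stage above (load, preprocess, create the
# model input table, split, train, evaluate) in its own function.

# Step 2: Call those functions from a main() function, for example: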

# def main():
#     # Load data
#     companies, shuttles, reviews = load_data()

#     # Preprocess data
#     preprocessed_companies = preprocess_companies(companies)
#     preprocessed_shuttles = preprocess_shuttles(shuttles)

#     # Create model input table
#     model_input_table = create_model_input_table(preprocessed_companies, preprocessed_shuttles, reviews)

#     # Split data
#     features = ["engines", "passenger_capacity", "crew", "d_check_complete", "moon_clearance_complete", "iata_approved", "company_rating", "review_scores_rating"]
#     X_train, X_test, y_train, y_test = split_data(model_input_table, features)

#     # Train model
#     regressor = train_model(X_train, y_train)

#     # Evaluate model
#     score = evaluate_model(regressor, X_test, y_test)
#     print(f"The model's R2 score is {score}")