# Live Data Pipeline

In [1]:
import os

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Connection settings. Each value can be overridden through the standard
# libpq environment variables; the literals are the original notebook
# defaults and are used when the variable is not set.
host = os.environ.get('PGHOST', r'127.0.0.1')
db = os.environ.get('PGDATABASE', r'MSDS610')
user = os.environ.get('PGUSER', r'postgres')
# NOTE(review): the fallback password is still stored in the notebook --
# set PGPASSWORD and remove this default before sharing or committing.
pw = os.environ.get('PGPASSWORD', r'8751')
port = os.environ.get('PGPORT', r'5432')

# URL.create escapes special characters in the credentials (e.g. '@', ':',
# '/'), which plain f-string interpolation into the URL does not.
engine = create_engine(
    URL.create(
        drivername='postgresql',
        username=user,
        password=pw,
        host=host,
        port=int(port),
        database=db,
    )
)

In [2]:
import pandas as pd
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by the
# cells below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Read the recorded cleaning operations from the "cleaned" schema and the
# vehicles table from the "raw" schema.
operations_df = pd.read_sql_table('operations', con=engine, schema='cleaned')
vehicles_df = pd.read_sql_table('vehicles', con=engine, schema='raw')

In [3]:
# Display the recorded cleaning operations (operation, feature, action, value).
operations_df

Unnamed: 0,operation,feature,action,value
0,Dropping the features that are not relevant fo...,"id, url, region, region_url, VIN, image_url, d...",drop,
1,Dropping the rows with missing values,"condition, cylinders, drive, paint_color",drop,"condition, cylinders, drive, paint_color"
2,"Replacing null values of manufacturer, model, ...","manufacturer, type",fillna,unknown
3,Replacing null values of odometer with the median,odometer,fillna,103928.0
4,Replacing null values of cylinders with the mode,cylinders,fillna,6 cylinders
5,Replacing null values of transmission with the...,transmission,fillna,automatic
6,Applying log transformation to price,price,log1p,log1p
7,Mapping cylinders to ordinal values,cylinders,map,"{'3 cylinders': 3, '4 cylinders': 4, '5 cylind..."
8,Mapping condition to ordinal values,condition,map,"{'new': 6, 'excellent': 5, 'like new': 4, 'goo..."
9,Creating new features for car price prediction,car_age,create,current_year - year


In [4]:
import numpy as np
import datetime

def clean_data(df, operations_df, current_year=None):
    """Replay the recorded cleaning operations on a vehicles DataFrame.

    Each row of ``operations_df`` is matched on its exact ``operation`` text
    and the corresponding transformation is applied, in row order. Rows whose
    operation (or feature, for the feature-creation step) is not recognised
    are reported on stdout instead of being skipped silently.

    Args:
        df: Vehicles data to clean. It is copied first, so the caller's
            frame is left unchanged and the call can safely be repeated.
        operations_df: Table with ``operation``, ``feature`` and ``value``
            columns describing the steps to replay.
        current_year: Reference year used to derive ``car_age``. Defaults to
            the current calendar year; pass an explicit year to make the
            result reproducible.

    Returns:
        The cleaned DataFrame. When the 'Adding the final features to the
        operations' step is present, the result has exactly the columns
        listed in that step, in that order (absent ones filled with 0);
        otherwise all columns produced by the replayed steps are returned.
    """
    # Work on a copy: the original mutated the caller's frame in place, so
    # running the function twice on the same input failed.
    df = df.copy()
    final_features = None

    for _, row in operations_df.iterrows():
        operation = row['operation']
        value = row['value']
        feature = row['feature']

        if operation == 'Dropping the features that are not relevant for the analysis':
            df = df.drop(columns=feature.split(', '))
            print("Dropped the features that are not relevant for the analysis")
        elif operation == 'Dropping the rows with missing values':
            df = df.dropna(subset=value.split(', '))
            print("Dropped the rows with missing values")
        elif operation == 'Replacing null values of manufacturer, model, type with unknown':
            fill_columns = feature.split(', ')
            df[fill_columns] = df[fill_columns].fillna('unknown')
            # Report the columns actually filled; the previous hard-coded
            # message named columns this step does not touch.
            print(f"Replaced null values of {feature} with unknown")
        elif operation == 'Replacing null values of odometer with the median':
            # Assign the result back: chained ``df[col].fillna(inplace=True)``
            # is not guaranteed to update ``df`` (pandas copy-on-write).
            df['odometer'] = df['odometer'].fillna(float(value))
            print("Replaced null values of odometer with the median")
        elif operation == 'Replacing null values of cylinders with the mode':
            df['cylinders'] = df['cylinders'].fillna(value)
            print("Replaced null values of cylinders with the mode")
        elif operation == 'Replacing null values of transmission with the most frequent value':
            df['transmission'] = df['transmission'].fillna(value)
            print("Replaced null values of transmission with the most frequent value")
        elif operation == 'Applying log transformation to price':
            df['price'] = np.log1p(df['price'])
            print("Applied log transformation to price")
        elif operation == 'Mapping cylinders to ordinal values':
            # SECURITY NOTE(review): eval() executes text read from the
            # operations table. If that table can be written by untrusted
            # users, switch to ast.literal_eval.
            mapping = eval(value)
            df['cylinders'] = df['cylinders'].map(mapping)
            print("Mapped cylinders to ordinal values")
        elif operation == 'Mapping condition to ordinal values':
            # SECURITY NOTE(review): same eval() caveat as for cylinders.
            mapping = eval(value)
            df['condition'] = df['condition'].map(mapping)
            print("Mapped condition to ordinal values")
        elif operation == 'Creating new features for car price prediction':
            if feature == 'car_age':
                reference_year = (
                    datetime.datetime.now().year
                    if current_year is None
                    else current_year
                )
                print(reference_year)
                df['car_age'] = reference_year - df['year']
                print("Created car_age feature")
            elif feature == 'odometer_binned':
                # NOTE(review): this parsing assumes a value shaped like
                # "bins=<expr>, labels=[...]" with no ', ' inside <expr> --
                # confirm against the stored value. Same eval() caveat.
                bins = eval(value.split(', ')[0].split('=')[1])
                labels = value.split(', ')[1].split('=')[1].strip('[]').split(', ')
                df['odometer_binned'] = pd.cut(df['odometer'], bins=bins, labels=labels)
                print("Created odometer_binned feature")
            elif feature == 'is_premium_brand':
                luxury_brands = value.split(', ')
                df['is_premium_brand'] = (
                    df['manufacturer'].isin(luxury_brands).astype('int64')
                )
                print("Created is_premium_brand feature")
            else:
                print(f"Skipped unrecognized feature to create: {feature}")
        elif operation == 'Dropping Year column':
            df = df.drop(columns=value)
            print("Dropped Year column")
        elif operation == 'Applying one-hot encoding to categorical features':
            df = pd.get_dummies(df, columns=value.split(', '), dtype=int)
            print("Applied one-hot encoding to categorical features")
        elif operation == 'Adding the final features to the operations':
            final_features = value.split(', ')

            # Add the trained-on columns that this data did not produce
            # (e.g. one-hot categories absent from the live sample).
            missing_features = set(final_features) - set(df.columns)
            print(missing_features)
            if missing_features:
                filler = pd.DataFrame(
                    0, index=df.index, columns=sorted(missing_features)
                )
                df = pd.concat([df, filler], axis=1)

            # Keep only the trained feature list, in its recorded order;
            # this also removes any extra columns.
            df = df[final_features]
            print("Added the final features to the operations")
        else:
            print(f"Skipped unrecognized operation: {operation}")

    # Without the final step there is no feature list to select; return
    # everything instead of failing on an unbound name.
    return df

In [5]:
# Show the full sequence of operation names recorded in operations_df.
operations_df.operation

0     Dropping the features that are not relevant fo...
1                 Dropping the rows with missing values
2     Replacing null values of manufacturer, model, ...
3     Replacing null values of odometer with the median
4      Replacing null values of cylinders with the mode
5     Replacing null values of transmission with the...
6                  Applying log transformation to price
7                   Mapping cylinders to ordinal values
8                   Mapping condition to ordinal values
9        Creating new features for car price prediction
10       Creating new features for car price prediction
11       Creating new features for car price prediction
12                                 Dropping Year column
13    Applying one-hot encoding to categorical features
14          Adding the final features to the operations
Name: operation, dtype: object

In [6]:
# Replay the recorded cleaning operations on the vehicles table read from
# the "raw" schema.
cleaned_vehicles_df = clean_data(vehicles_df, operations_df)

Dropped the features that are not relevant for the analysis
Dropped the rows with missing values
Replaced null values of manufacturer, model, and paint_color with unknown
Replaced null values of odometer with the median
Replaced null values of cylinders with the mode
Applied log transformation to price
Mapped cylinders to ordinal values
Mapped condition to ordinal values
2025
Created car_age feature
Created is_premium_brand feature
Dropped Year column
Applied one-hot encoding to categorical features
{'manufacturer_mercury', 'manufacturer_ferrari', 'manufacturer_infiniti', 'title_status_lien', 'type_hatchback', 'paint_color_purple', 'manufacturer_cadillac', 'manufacturer_alfa-romeo', 'manufacturer_volvo', 'manufacturer_morgan', 'paint_color_custom', 'manufacturer_nissan', 'manufacturer_land rover', 'type_offroad', 'manufacturer_jaguar', 'manufacturer_fiat', 'manufacturer_mazda', 'manufacturer_pontiac', 'manufacturer_tesla', 'manufacturer_mini', 'fuel_hybrid', 'type_other', 'manufacturer

In [7]:
# Save the cleaned data to cleaned_vehicles_df.csv (relative path), without
# writing the index as a column.
cleaned_vehicles_df.to_csv('cleaned_vehicles_df.csv', index=False)

In [8]:
# Print the row count, column names, non-null counts and dtypes.
cleaned_vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39 entries, 0 to 93
Data columns (total 93 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   price                         39 non-null     float64
 1   condition                     39 non-null     int64  
 2   cylinders                     39 non-null     int64  
 3   odometer                      39 non-null     float64
 4   car_age                       39 non-null     float64
 5   odometer_category             39 non-null     int64  
 6   is_premium_brand              39 non-null     int64  
 7   manufacturer_acura            39 non-null     int64  
 8   manufacturer_alfa-romeo       39 non-null     int64  
 9   manufacturer_aston-martin     39 non-null     int64  
 10  manufacturer_audi             39 non-null     int64  
 11  manufacturer_bmw              39 non-null     int32  
 12  manufacturer_buick            39 non-null     int64  
 13  manufacturer

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Normalise the column labels: cast them to str, then strip any surrounding
# single or double quote characters.
column_labels = cleaned_vehicles_df.columns.astype(str)
cleaned_vehicles_df.columns = column_labels.str.strip("'").str.strip('"')

# Restrict the frame to numeric dtypes only.
cleaned_vehicles_df = cleaned_vehicles_df.select_dtypes(include=['number'])

# Target is the price column; every other column is a feature.
y = cleaned_vehicles_df['price']
X = cleaned_vehicles_df.drop(columns=['price'])

# Min-max scale the features.
# NOTE(review): the scaler is fitted on this data set; if the model was
# trained with a different fitted scaler, the feature ranges will not match.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
scaled_X = pd.DataFrame(scaled_values, columns=X.columns)

# Put the scaled features and the (index-reset) target side by side.
scaled_vehicles_df = pd.concat(
    [scaled_X, y.reset_index(drop=True)],
    axis=1,
)

scaled_vehicles_df.head()  # Verify output

Unnamed: 0,condition,cylinders,odometer,car_age,odometer_category,is_premium_brand,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,...,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow,price
0,0.333333,0.75,0.093797,0.044776,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.51843
1,0.333333,1.0,0.272467,0.208955,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.239817
2,1.0,0.75,0.61199,0.268657,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.917536
3,1.0,0.5,0.089867,0.074627,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.635216
4,0.333333,0.5,0.29166,0.179104,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.412055


In [12]:
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Restore the persisted model from disk.
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
model = joblib.load('best_rf_model.pkl')

# Score the scaled features with the restored model.
y_pred = model.predict(scaled_X)

# Regression metrics. y is the log1p-transformed price, so RMSE and MAE are
# expressed on that log scale.
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

for label, score in (("RMSE", rmse), ("MAE", mae), ("R2", r2)):
    print(f"{label}: {score}")

RMSE: 2.990998875755298
MAE: 2.2481182813291847
R2: -0.7391221104615937


In [13]:
# Attach the predictions as a new column next to the scaled features and
# the actual price.
scaled_vehicles_df['predicted_price'] = y_pred

# Write the frame to analytics.predicted_car_prices, replacing the table if
# it already exists and omitting the DataFrame index.
scaled_vehicles_df.to_sql(
    name='predicted_car_prices',
    con=engine,
    schema='analytics',
    if_exists='replace',
    index=False,
)

print("Data inserted into the database successfully.")


Data inserted into the database successfully.


# Insights from Live Data Analysis

## Model Performance Metrics

The Random Forest model was evaluated on live data with the following performance metrics:

- **RMSE (Root Mean Squared Error)**: 2.991
- **MAE (Mean Absolute Error)**: 2.248
- **R² Score**: -0.739

## Key Findings

1. **Prediction Accuracy**: The model's negative \(R^2\) score indicates that it performs worse than a simple mean-based model. This suggests significant challenges in capturing the underlying patterns in the data.

2. **Error Analysis**: The RMSE of 2.991 and MAE of 2.248 are measured on the log-transformed price (`log1p(price)`), so they correspond to large multiplicative errors in the actual price. This indicates that the model's predictions deviate significantly from the actual values.

3. **Scaling Discrepancies**: A likely contributor to the poor performance is that the live pipeline fits a new `MinMaxScaler` on the live data, so its scaling parameters differ from those used on the training data. The same raw value is then mapped to a different scaled value than the model saw during training, which distorts its predictions.


4. **Data Variability**: The live data may exhibit variability not present in the training data, highlighting the importance of robust model validation and adaptation to real-world conditions.

## Challenges in Applying the Trained Model to Live Data

1. **Inconsistent Scaling**: The difference in scaling parameters between training and live data can lead to inaccurate predictions. Ensuring consistent scaling across datasets is crucial.

2. **Data Drift**: Changes in data distribution over time can affect model performance, necessitating continuous monitoring and adaptation.

3. **Feature Relevance**: Features that were significant in the training data may not hold the same importance in live data, requiring ongoing feature evaluation.


# Part 10 - Reflection

Reflecting on this project, one of the most challenging aspects was managing the data pipeline. Ensuring that data was clean, consistent, and ready for model training required meticulous attention to detail. Handling missing values, outliers, and ensuring the data was representative of real-world scenarios was a complex task. Additionally, integrating various tools and libraries to streamline the process was initially overwhelming, but it provided a valuable learning experience in understanding how different components of a machine learning pipeline interact.

On the positive side, I thoroughly enjoyed the model development phase. Experimenting with different algorithms and hyperparameters to optimize performance was both engaging and rewarding. It was fascinating to see how small tweaks could significantly impact the model's accuracy and efficiency. This hands-on experience deepened my understanding of machine learning concepts and reinforced the importance of iterative testing and validation.

Through this project, I gained a clearer understanding of the steps required to deploy a model to production. From data preprocessing and model training to validation and deployment, each phase is crucial and interconnected. I learned the importance of scalability and monitoring in a production environment to ensure the model remains effective over time. This project has equipped me with practical skills and insights that are essential for successfully deploying machine learning models in real-world applications.
