In [1]:
import requests
import pandas as pd

## Data

In [2]:
def fetch_makes(model_year, issue_type):
    base_url = "https://api.nhtsa.gov/products/vehicle/"
    makes_url = f"{base_url}makes?modelYear={model_year}&issueType={issue_type}"
    response = requests.get(makes_url)
    makes_data = response.json()

    if 'results' not in makes_data:
        raise ValueError("Failed to fetch makes data.")

    return [make['make'] for make in makes_data['results']]

def fetch_models(model_year, makes_list, issue_type):
    base_url = "https://api.nhtsa.gov/products/vehicle/"
    models = []
    for make in makes_list:
        models_url = f"{base_url}models?modelYear={model_year}&make={make}&issueType={issue_type}"
        response = requests.get(models_url)
        models_data = response.json()
        if 'results' in models_data:
            models.extend([{
                'modelYear': model['modelYear'],
                'make': model['make'],
                'model': model['model']
            } for model in models_data['results']])
    return pd.DataFrame(models)

def fetch_complaints(models_df):
    complaints = []
    for _, row in models_df.iterrows():
        complaints_url = f"https://api.nhtsa.gov/complaints/complaintsByVehicle?make={row['make']}&model={row['model']}&modelYear={row['modelYear']}"
        response = requests.get(complaints_url)
        complaints_data = response.json()
        if 'results' in complaints_data:
            complaints.extend([{
                'make': row['make'],
                'model': row['model'],
                'modelYear': row['modelYear'],
                'odiNumber': complaint['odiNumber'],
                'manufacturer': complaint['manufacturer'],
                'crash': complaint['crash'],
                'fire': complaint['fire'],
                'numberOfInjuries': complaint['numberOfInjuries'],
                'numberOfDeaths': complaint['numberOfDeaths'],
                'dateOfIncident': complaint.get('dateOfIncident'),
                'dateComplaintFiled': complaint.get('dateComplaintFiled'),
                'vin': complaint.get('vin'),
                'components': complaint.get('components'),
                'summary': complaint.get('summary')
            } for complaint in complaints_data['results']])
    return pd.DataFrame(complaints)

def fetch_recalls(models_df):
    recalls = []
    for _, row in models_df.iterrows():
        recalls_url = f"https://api.nhtsa.gov/recalls/recallsByVehicle?make={row['make']}&model={row['model']}&modelYear={row['modelYear']}"
        response = requests.get(recalls_url)
        recalls_data = response.json()
        if 'results' in recalls_data:
            recalls.extend([{
                'make': row['make'],
                'model': row['model'],
                'modelYear': row['modelYear'],
                'NHTSACampaignNumber': recall['NHTSACampaignNumber'],
                'manufacturer': recall['Manufacturer'],
                'component': recall['Component'],
                'summary': recall['Summary'],
                'consequence': recall.get('Consequence'),
                'remedy': recall.get('Remedy'),
                'notes': recall.get('Notes'),
                'reportDate': recall.get('ReportReceivedDate'),
                'affectedVehicles': recall.get('AffectedVehicles')
            } for recall in recalls_data['results']])
    return pd.DataFrame(recalls)

# Main logic to fetch and save data
model_year = 2020

# Complaints data
makes_list_complaints = fetch_makes(model_year, issue_type='c')
models_df_complaints = fetch_models(model_year, makes_list_complaints, issue_type='c')
complaints_df = fetch_complaints(models_df_complaints)
complaints_df.to_parquet("vehicle_complaints.parquet", index=False)
print("Complaints data saved to vehicle_complaints.parquet")

# Recalls data
makes_list_recalls = fetch_makes(model_year, issue_type='r')
models_df_recalls = fetch_models(model_year, makes_list_recalls, issue_type='r')
recalls_df = fetch_recalls(models_df_recalls)
recalls_df.to_parquet("vehicle_recalls.parquet", index=False)
print("Recalls data saved to vehicle_recalls.parquet")

Complaints data saved to vehicle_complaints.parquet
Recalls data saved to vehicle_recalls.parquet


In [3]:
print(complaints_df.info())
complaints_df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44519 entries, 0 to 44518
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   make                44519 non-null  object
 1   model               44519 non-null  object
 2   modelYear           44519 non-null  object
 3   odiNumber           44519 non-null  int64 
 4   manufacturer        44519 non-null  object
 5   crash               44519 non-null  bool  
 6   fire                44519 non-null  bool  
 7   numberOfInjuries    44519 non-null  int64 
 8   numberOfDeaths      44519 non-null  int64 
 9   dateOfIncident      44519 non-null  object
 10  dateComplaintFiled  44519 non-null  object
 11  vin                 44482 non-null  object
 12  components          44519 non-null  object
 13  summary             44519 non-null  object
dtypes: bool(2), int64(3), object(9)
memory usage: 4.2+ MB
None


Unnamed: 0,make,model,modelYear,odiNumber,manufacturer,crash,fire,numberOfInjuries,numberOfDeaths,dateOfIncident,dateComplaintFiled,vin,components,summary
22653,HONDA,PILOT,2020,11579758,Honda (American Honda Motor Co.),False,False,0,0,12/15/2023,03/27/2024,5FNYF6H02LB,FUEL/PROPULSION SYSTEM,We received a recall notification in December ...
29888,LINCOLN,AVIATOR,2020,11538273,Ford Motor Company,False,False,0,0,08/12/2023,08/13/2023,5LM5J7WC4LG,POWER TRAIN,While traveling on the freeway the transmissio...
25014,HYUNDAI,KONA ELECTRIC,2020,11585896,Hyundai Motor America,False,False,0,0,04/09/2024,04/29/2024,KM8K33AG8LU,FORWARD COLLISION AVOIDANCE,I am concerned about the safety of my car and ...
28731,KIA,TELLURIDE,2020,11623273,"Kia America, Inc.",False,False,0,0,03/12/2024,11/02/2024,5XYP3DHC3LG,SUSPENSION,Less than 40k miles and the rear suspension ha...
944,AUDI,Q8,2020,11578769,"Volkswagen Group of America, Inc.",False,False,0,0,03/31/2023,03/21/2024,WA1EVAF16LD,UNKNOWN OR OTHER,The door locking on the Audi Q8 is consistentl...
27320,JEEP,GRAND CHEROKEE TRACKHAWK,2020,11612956,"Chrysler (FCA US, LLC)",False,False,0,0,09/05/2024,09/06/2024,1C4RJFBG8LC,"ELECTRICAL SYSTEM,FORWARD COLLISION AVOIDANCE,...",I am reporting significant safety issues with ...
28558,KIA,SOUL,2020,11475959,"Kia America, Inc.",False,False,0,0,07/25/2022,07/26/2022,KNDJ23AU6L7,UNKNOWN OR OTHER,My car was stolen using a usb cord. The manufa...
2208,CHEVROLET,MALIBU,2020,11427291,"General Motors, LLC",False,False,0,0,06/01/2021,07/31/2021,1G1ZG5ST1LF,"VEHICLE SPEED CONTROL,SERVICE BRAKES,ENGINE","Speed reduced, reduced engine power , hard br..."
34645,RAM,1500 CLASSIC,2020,11315754,"Chrysler (FCA US, LLC)",False,False,0,0,02/29/2020,03/03/2020,1C6SRFJT3LN,STRUCTURE,AFTER TAKING THE TRUCK THROUGH A CAR WASH AND ...
34273,RAM,1500 CLASSIC,2020,11623579,"Chrysler (FCA US, LLC)",True,False,1,0,10/29/2024,11/05/2024,1C6RR6TTXLS,AIR BAGS,The contact owned a 2020 Ram 1500. The contact...


In [4]:
print(recalls_df.info())
recalls_df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2373 entries, 0 to 2372
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   make                 2373 non-null   object
 1   model                2373 non-null   object
 2   modelYear            2373 non-null   object
 3   NHTSACampaignNumber  2373 non-null   object
 4   manufacturer         2373 non-null   object
 5   component            2373 non-null   object
 6   summary              2373 non-null   object
 7   consequence          2373 non-null   object
 8   remedy               2373 non-null   object
 9   notes                2373 non-null   object
 10  reportDate           2373 non-null   object
 11  affectedVehicles     0 non-null      object
dtypes: object(12)
memory usage: 222.6+ KB
None


Unnamed: 0,make,model,modelYear,NHTSACampaignNumber,manufacturer,component,summary,consequence,remedy,notes,reportDate,affectedVehicles
2309,VOLKSWAGEN,ATLAS,2020,22V815000,"Volkswagen Group of America, Inc.",TIRES:PRESSURE MONITORING AND REGULATING SYSTEMS,"Volkswagen Group of America, Inc. (Volkswagen)...",An undetected sudden loss of pressure in all f...,"Dealers will update the TPMS software, free of...",Owners may also contact the National Highway T...,31/10/2022,
1756,NISSAN,MURANO,2020,21V402000,"Nissan North America, Inc.",SUSPENSION:REAR:AXLE:SPINDLE,"Nissan North America, Inc. (Nissan) is recalli...",The steering knuckle or rear axle could deform...,Dealers will inspect and replace the front ste...,Owners may also contact the National Highway T...,27/05/2021,
756,FORD,TRANSIT CONNECT,2020,20V550000,Ford Motor Company,POWER TRAIN:AUTOMATIC TRANSMISSION,Ford Motor Company (Ford) is recalling certain...,Missing or loose bolts on the start stop accum...,"Ford will notify owners, and dealers will repl...",Owners may also contact the National Highway T...,10/09/2020,
1134,HEARTLAND,CYCLONE,2020,20V521000,"Heartland Recreational Vehicles, LLC",EQUIPMENT:RECREATIONAL VEHICLE/TRAILER,"Heartland Recreational Vehicles, LLC (Heartlan...","If the landing legs buckle, the trailer will d...","Heartland will notify owners, and dealers will...",Owners may also contact the National Highway T...,28/08/2020,
905,FOREST RIVER,CHEROKEE,2020,24V003000,"Forest River, Inc.",STRUCTURE:FRAME AND MEMBERS,"Forest River, Inc. (Forest River) is recalling...",A trailer that contacts the road during transi...,Dealers will replace the Ice Hose swing arm dr...,Owners may also contact the National Highway T...,05/01/2024,
803,FORD,EXPLORER,2020,20V788000,Ford Motor Company,ENGINE,Ford Motor Company (Ford) is recalling certain...,"With a loose motor mount, motor vibration may ...","Ford will notify owners, and dealers will remo...",Owners may also contact the National Highway T...,17/12/2020,
661,FIAT,500X,2020,19V909000,Chrysler (FCA US LLC),"SERVICE BRAKES, HYDRAULIC:FOUNDATION COMPONENT...",Chrysler (FCA US LLC) is recalling certain 202...,"A cracked right rear brake caliper can fail, r...","Chrysler will notify owners, and dealers will ...",Owners may also contact the National Highway T...,19/12/2019,
2316,VOLVO,VNL,2020,19V509000,Volvo Trucks North America,ELECTRICAL SYSTEM:WIRING:FUSES AND CIRCUIT BRE...,Volvo Trucks North America (Volvo Trucks) is r...,If the fuse blows the transmission may not shi...,"Volvo Trucks will notify owners, and dealers w...",Owners may also contact the National Highway T...,02/07/2019,
2062,TESLA,MODEL Y,2020,21V388000,"Tesla, Inc.",SEAT BELTS:REAR/OTHER:RETRACTOR,"Tesla, Inc. (Tesla) is recalling certain 2019-...",Improperly attached fasteners may prevent the ...,Tesla Service will inspect and replace both fa...,Owners may also contact the National Highway T...,25/05/2021,
1140,HIGHLAND RIDGE,OPEN RANGE,2020,21V430000,Highland Ridge RV,EQUIPMENT:RECREATIONAL VEHICLE/TRAILER:LPG SYS...,Highland Ridge RV (Highland) is recalling cert...,Increased propane pressure can enlarge the fla...,Dealers will replace the regulator and test th...,Owners may also contact the National Highway T...,04/06/2021,


## Data Quickload

In [5]:
complaints_df = pd.read_parquet('vehicle_complaints.parquet')
recalls_df = pd.read_parquet('vehicle_recalls.parquet')

## Visualizations

In [6]:
import plotly.graph_objects as go
import pandas as pd

# Load the complaints and recalls data
#complaints_df = pd.read_parquet("vehicle_complaints_2024.parquet")
#recalls_df = pd.read_parquet("vehicle_recalls_2024.parquet")

# Aggregate data for complaints
complaints_agg = complaints_df.groupby(['make', 'model']).size().reset_index(name='complaints_count')

# Aggregate data for recalls
recalls_agg = recalls_df.groupby(['make', 'model']).size().reset_index(name='recalls_count')

# Merge the two datasets
merged_data = pd.merge(complaints_agg, recalls_agg, on=['make', 'model'], how='outer').fillna(0)

# Sort by complaints and select top 10
top_data = merged_data.sort_values(by='complaints_count', ascending=False).head(10)

# Create the bar chart
fig = go.Figure()

# Add complaints bar
fig.add_trace(go.Bar(
    x=top_data['make'] + " " + top_data['model'],
    y=top_data['complaints_count'],
    name='Complaints',
    text=top_data['complaints_count'],
    textposition='auto'
))

# Add recalls bar
fig.add_trace(go.Bar(
    x=top_data['make'] + " " + top_data['model'],
    y=top_data['recalls_count'],
    name='Recalls',
    text=top_data['recalls_count'],
    textposition='auto'
))

# Update layout
fig.update_layout(
    title="Top 10 Makes and Models by Complaints with Recalls",
    xaxis_title="Make and Model",
    yaxis_title="Count",
    barmode='group',
    xaxis_tickangle=-45
)

# Show the plot
fig.show()


In [7]:
import pandas as pd
import plotly.graph_objects as go

# Aggregate data for complaints
complaints_agg = complaints_df.groupby(['make', 'model']).size().reset_index(name='complaints_count')

# Aggregate data for recalls
recalls_agg = recalls_df.groupby(['make', 'model']).size().reset_index(name='recalls_count')

# Merge the two datasets, filling missing recall counts with 0
merged_data = pd.merge(complaints_agg, recalls_agg, on=['make', 'model'], how='left').fillna({'recalls_count': 0})

# Create the scatter plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=merged_data['complaints_count'],
    y=merged_data['recalls_count'],
    mode='markers',
    marker=dict(color='blue', opacity=0.7),
    name='Complaints vs Recalls'
))

# Update layout
fig.update_layout(
    title="Number of Complaints vs. Number of Recalls",
    xaxis_title="Number of Complaints",
    yaxis_title="Number of Recalls",
    template="plotly_white"
)

# Show the plot
fig.show()


In [8]:
# Aggregate data for complaints
complaints_agg = complaints_df.groupby(['make', 'model']).size().reset_index(name='complaints_count')

# Aggregate data for recalls and create binary recall indicator
recalls_agg = recalls_df.groupby(['make', 'model']).size().reset_index(name='recalls_count')
recalls_agg['any_recalls'] = (recalls_agg['recalls_count'] > 0).astype(int)

# Merge the two datasets, filling missing recall counts with 0
merged_data = pd.merge(complaints_agg, recalls_agg[['make', 'model', 'any_recalls']], on=['make', 'model'], how='left').fillna({'any_recalls': 0})

# Create the scatter plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=merged_data['complaints_count'],
    y=merged_data['any_recalls'],
    mode='markers',
    marker=dict(color='blue', opacity=0.7),
    name='Complaints vs Any Recalls'
))

# Update layout
fig.update_layout(
    title="Likelihood of Any Recalls vs. Number of Complaints",
    xaxis_title="Number of Complaints",
    yaxis_title="Any Recalls (Binary)",
    template="plotly_white"
)

# Show the plot
fig.show()

## Recall Prediction

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Aggregate data for recalls and create binary recall indicator
recalls_agg = recalls_df.groupby(['make', 'model']).size().reset_index(name='recalls_count')
recalls_agg['any_recalls'] = (recalls_agg['recalls_count'] > 0).astype(int)

# Merge the two datasets, filling missing recall counts with 0
merged_data = pd.merge(complaints_agg, recalls_agg[['make', 'model', 'any_recalls']], on=['make', 'model'], how='left').fillna({'any_recalls': 0})

# Prepare features and target
X = merged_data[['complaints_count']]
y = merged_data['any_recalls']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Baseline
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# XGBoost Model
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate performance
metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
    "AUC-ROC": roc_auc_score
}

print("Logistic Regression Performance:")
for metric_name, metric_func in metrics.items():
    print(f"{metric_name}: {metric_func(y_test, y_pred_log):.4f}")

print("\nXGBoost Performance:")
for metric_name, metric_func in metrics.items():
    print(f"{metric_name}: {metric_func(y_test, y_pred_xgb):.4f}")


Logistic Regression Performance:
Accuracy: 0.5310
Precision: 0.5310
Recall: 1.0000
F1 Score: 0.6937
AUC-ROC: 0.5000

XGBoost Performance:
Accuracy: 0.5241
Precision: 0.5333
Recall: 0.8312
F1 Score: 0.6497
AUC-ROC: 0.5038
