### Imports

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import xgboost as xgb
import regex as re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle
import os
import joblib

### Encoding

In [38]:
# Encoding functions
def one_hot_encode(data, columns):
    """One-hot encodes specified columns in the dataframe."""
    return pd.get_dummies(data, columns=columns, drop_first=True)

def label_encode(data, columns):
    """Label encodes specified columns in the dataframe."""
    for col in columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
    return data

def frequency_encode(data, columns):
    """Frequency encodes specified columns in the dataframe."""
    for col in columns:
        value_counts = Counter(data[col])
        data[col] = data[col].map(lambda x: value_counts[x] / len(data))
    return data

def target_encode(data, target_column, columns):
    """Target encodes specified columns by replacing categories with the mean of the target variable."""
    for col in columns:
        target_mean = data.groupby(col)[target_column].mean()
        data[col] = data[col].map(target_mean)
    return data

### Imputation

In [39]:
# Imputation functions
def impute_mean(data, column):
    data[column].fillna(data[column].mean(), inplace=True)
    return data

def impute_median(data, columns):
    imputer = SimpleImputer(strategy="median")
    data[columns] = imputer.fit_transform(data[columns])
    return data

def impute_mode(data, column):
    data[column].fillna(data[column].mode()[0], inplace=True)
    return data

def impute_knn(data, columns, n_neighbors=5):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    data[columns] = imputer.fit_transform(data[columns])
    return data

def impute_random_forest(data, features, target_column):
    imputer = RandomForestRegressor()
    known_data = data.dropna(subset=[target_column])
    imputer.fit(known_data[features], known_data[target_column])
    missing_idx = data[target_column].isnull()
    data.loc[missing_idx, target_column] = imputer.predict(data.loc[missing_idx, features])
    return data

def impute_regression(data, columns):
    for col in columns:
        features = [c for c in data.columns if c != col]
        model = LinearRegression()
        model.fit(data[features], data[col])
        data.loc[data[col].isnull(), col] = model.predict(data[features].loc[data[col].isnull()])
    return data

### Evaluation

In [40]:
# Regression evaluation metrics
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R^2 Score: {r2}')
    return {'mse': mse, 'mae': mae, 'r2': r2}

# Classification evaluation metrics
def evaluate_classification(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print(f'Confusion Matrix:\n{cm}')
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'confusion_matrix': cm}

### Settings

In [41]:
df = pd.read_csv('usedcarprices.csv')
filename = os.path.splitext(os.path.basename('usedcarprices.csv'))[0]
target_column = 'price'
problem_type = 'regression'.lower()

### Get Info

In [42]:
def dataset_summary(data):
    rows, cols = data.shape
    properties = {}
    print(f'{rows} rows, {cols} columns\n')

    for column in data.columns:
        null_count = int(data[column].isna().sum())
        null_percentage = round((null_count / rows) * 100, 2) if null_count > 0 else 0
        unique = data[column].nunique()
        dtype = data[column].dtype

        properties[column] = {
            'type': dtype,
            'null_count': null_count,
            'null_percentage': null_percentage,
            'unique': unique
        }
        print(f'{column}: {dtype}, {null_count} null ({null_percentage}%), {unique} unique')

    return properties

summary = dataset_summary(df)

188533 rows, 13 columns

id: int64, 0 null (0%), 188533 unique
brand: object, 0 null (0%), 57 unique
model: object, 0 null (0%), 1897 unique
model_year: int64, 0 null (0%), 34 unique
milage: int64, 0 null (0%), 6651 unique
fuel_type: object, 5083 null (2.7%), 7 unique
engine: object, 0 null (0%), 1117 unique
transmission: object, 0 null (0%), 52 unique
ext_col: object, 0 null (0%), 319 unique
int_col: object, 0 null (0%), 156 unique
accident: object, 2452 null (1.3%), 2 unique
clean_title: object, 21419 null (11.36%), 1 unique
price: int64, 0 null (0%), 1569 unique


### Transform

In [43]:
# Transformation function
def transform(data, target_column=None):
    properties = dataset_summary(data)
    new_df = data.copy()

    for column, prop in properties.items():
        # Skip the target column
        if column == target_column:
            le = LabelEncoder()
            new_df[column] = le.fit_transform(new_df[column])
            print(f'Label encoded target column {column}')
            continue

        # Drop columns with high null percentage
        if prop['null_percentage'] > 50:
            new_df.drop(column, axis=1, inplace=True)
            print(f'Dropped {column} due to high null percentage')
            continue

        # Convert numerical types to float
        if prop['type'] == 'int64':
            new_df[column] = new_df[column].astype(float)
            print(f'Converted {column} from int64 to float64')

        # Imputation
        if prop['null_count'] > 0:
            if prop['type'] == 'object':
                new_df = impute_mode(new_df, column)
                print(f'Imputed {column} with mode')
            else:
                new_df = impute_mean(new_df, column)
                print(f'Imputed {column} with mean')

        # Encoding
        if new_df[column].dtype == 'object':
            unique_count = prop['unique']
            if unique_count < 10:
                new_df = one_hot_encode(new_df, [column])
                print(f'One-hot encoded {column}')
            elif 10 <= unique_count <= 50:
                new_df = frequency_encode(new_df, [column])
                print(f'Frequency encoded {column}')
            elif target_column and target_column in new_df.columns:
                new_df = target_encode(new_df, target_column, [column])
                print(f'Target encoded {column}')
            else:
                new_df = frequency_encode(new_df, [column])
                print(f'Frequency encoded {column}')

    return new_df

new_df = transform(df, target_column)
summary = dataset_summary(new_df)

188533 rows, 13 columns

id: int64, 0 null (0%), 188533 unique
brand: object, 0 null (0%), 57 unique
model: object, 0 null (0%), 1897 unique
model_year: int64, 0 null (0%), 34 unique
milage: int64, 0 null (0%), 6651 unique
fuel_type: object, 5083 null (2.7%), 7 unique
engine: object, 0 null (0%), 1117 unique
transmission: object, 0 null (0%), 52 unique
ext_col: object, 0 null (0%), 319 unique
int_col: object, 0 null (0%), 156 unique
accident: object, 2452 null (1.3%), 2 unique
clean_title: object, 21419 null (11.36%), 1 unique
price: int64, 0 null (0%), 1569 unique
Converted id from int64 to float64
Target encoded brand
Target encoded model
Converted model_year from int64 to float64
Converted milage from int64 to float64
Imputed fuel_type with mode
One-hot encoded fuel_type
Target encoded engine
Target encoded transmission
Target encoded ext_col
Target encoded int_col
Imputed accident with mode
One-hot encoded accident
Imputed clean_title with mode
One-hot encoded clean_title
Label enc

### Modeling

In [44]:
def regression_model(data, target_column):
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = evaluate_regression(y_test, y_pred)
    return metrics, model

def classification_model(data, target_column):
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = evaluate_classification(y_test, y_pred)
    return metrics, model

# Example usage:
if problem_type == 'regression':
    print('Regression model evaluation: \n')
    metrics, model = regression_model(new_df, target_column)
    print(metrics)
elif problem_type == 'classification':
    print('Classification model evaluation: \n')
    metrics, model = classification_model(new_df, target_column)
    print(metrics)
elif problem_type == 'timeseries':
    pass

Regression model evaluation: 

Mean Squared Error: 59634.12763038693
Mean Absolute Error: 179.23544487760898
R^2 Score: 0.6676733277726907
{'mse': 59634.12763038693, 'mae': 179.23544487760898, 'r2': 0.6676733277726907}


In [47]:
# Method 1: Exporting as a pickle file
with open(f'{filename} model.pkl', 'wb') as file:
    pickle.dump(model, file)
print(f"Model saved as {filename} model.pkl")

# Method 2: Exporting as a joblib file
joblib.dump(model, f'{filename} model.joblib')
print(f"Model saved as {filename} model.joblib")

Model saved as usedcarprices model.pkl
Model saved as usedcarprices model.joblib


In [48]:
# Load the test data
test_df = pd.read_csv('test.csv')

# Preprocess the test data
test_df_transformed = transform(test_df)

# Make predictions
predictions = model.predict(test_df_transformed)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({'id': test_df['id'], 'Depression': predictions})

# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")

125690 rows, 12 columns

id: int64, 0 null (0%), 125690 unique
brand: object, 0 null (0%), 55 unique
model: object, 0 null (0%), 1891 unique
model_year: int64, 0 null (0%), 36 unique
milage: int64, 0 null (0%), 5700 unique
fuel_type: object, 3383 null (2.69%), 7 unique
engine: object, 0 null (0%), 1117 unique
transmission: object, 0 null (0%), 52 unique
ext_col: object, 0 null (0%), 317 unique
int_col: object, 0 null (0%), 156 unique
accident: object, 1632 null (1.3%), 2 unique
clean_title: object, 14239 null (11.33%), 1 unique
Converted id from int64 to float64
Frequency encoded brand
Frequency encoded model
Converted model_year from int64 to float64
Converted milage from int64 to float64
Imputed fuel_type with mode
One-hot encoded fuel_type
Frequency encoded engine
Frequency encoded transmission
Frequency encoded ext_col
Frequency encoded int_col
Imputed accident with mode
One-hot encoded accident
Imputed clean_title with mode
One-hot encoded clean_title
Predictions saved to predicti