In [97]:
# importing required libraries
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import mplcursors

In [3]:
all_data = pd.read_csv('train.csv')

In [4]:
all_data=all_data.dropna(subset=['year_of_vehicle_manufacture'])

In [5]:
all_data.columns

Index(['id', 'car_variant', 'year_of_vehicle_manufacture',
       'month_of_vehicle_manufacture', 'odometer_reading',
       'Odometer_Reading_Present', 'vehicle_fuel_type', 'registered_color',
       'vehicle_make', 'vehicle_model', 'accidental_vehicle', 'city',
       'car_valuation'],
      dtype='object')

In [6]:
def month_to_number(month):
    if pd.isna(month):
        return 6
    try:
        # Try to convert directly to an integer
        month_number = int(month)
        if 1 <= month_number <= 12:
            return month_number
    except ValueError:
        # If month is a string, convert it to lower case and map to month number
        month = month.lower()
        month_dict = {
            'january': 1, 'february': 2, 'march': 3,
            'april': 4, 'may': 5, 'june': 6,
            'july': 7, 'august': 8, 'september': 9,
            'october': 10, 'november': 11, 'december': 12
        }
        return month_dict.get(month, 6)  # Return 6 if month is not recognized
    return 6

In [7]:
# Apply the conversion to the 'month' column
all_data['month_of_vehicle_manufacture'] = all_data['month_of_vehicle_manufacture'].apply(month_to_number)

In [8]:
# Function to calculate age in months
def calculate_age_in_months(row, current_year, current_month):
    year, month = row['year_of_vehicle_manufacture'], row['month_of_vehicle_manufacture']
    return (current_year - year) * 12 + (current_month - month)

# Get current year and month
current_year = datetime.now().year
current_month = datetime.now().month

# Calculate 'ageInMonths' and add it as a new column
all_data['months_since_manufactured'] = all_data.apply(calculate_age_in_months, axis=1, current_year=current_year, current_month=current_month)

In [9]:
all_data = all_data.drop(columns=['id', 'month_of_vehicle_manufacture', 'year_of_vehicle_manufacture', 'vehicle_make', 'registered_color'], axis=1)

In [10]:
all_data.columns

Index(['car_variant', 'odometer_reading', 'Odometer_Reading_Present',
       'vehicle_fuel_type', 'vehicle_model', 'accidental_vehicle', 'city',
       'car_valuation', 'months_since_manufactured'],
      dtype='object')

In [11]:
def convert_columns_as_category(columns, data):
    for column in columns:
        data[column] = data[column].astype('category')

In [12]:
categorical_columns = ['car_variant', 'vehicle_fuel_type', 'vehicle_model', 'city', 'accidental_vehicle']
convert_columns_as_category(categorical_columns, all_data)

In [13]:
def plot_feature_importance(model, columns):
    feature_importances = model.feature_importances_
    print(feature_importances)
    
    # Create a DataFrame for better visualization
    importance_df = pd.DataFrame({
        'Feature': columns,
        'Importance': feature_importances
    })
    
    # Sort the DataFrame by importance
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    
    # Print the feature importances
    print(importance_df)
    
    # Visualize the feature importances
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
    plt.xlabel('Importance')
    plt.title('Feature Importances')
    plt.gca().invert_yaxis()
    plt.show()
    plt.close()

In [56]:
def evaulate_data(df, model):
    train_data = df.sample(frac = 0.8)
    test_data = df.drop(train_data.index)

    train_inputs = train_data.drop(columns=['car_valuation'], axis=1)
    train_output = train_data['car_valuation']

    test_inputs = test_data.drop(columns=['car_valuation'], axis=1)
    test_output = test_data['car_valuation']

    model.fit(train_inputs, train_output)

    # mean absolute error on train data
    train_prediction = model.predict(train_inputs)
    train_error = mean_absolute_error(train_prediction, train_output)

    # mean absolute error on test data
    test_predicted_output = model.predict(test_inputs)
    test_error = mean_absolute_error(test_predicted_output, test_output)

    # print("test_error= ",test_error," train_error= ", train_error)
    return test_error, train_error
    # plot_feature_importance(model, train_inputs.columns)

In [57]:
def plot_errors(x_axis, test_errors, train_errors):
    # Activate interactive plot in separate window
    %matplotlib qt
    
    # Create a figure and axis
    fig, ax = plt.subplots()
    
    # Plot the data
    ax.plot(x_axis, test_errors, label='Test Errors', marker='o')
    ax.plot(x_axis, train_errors, label='Train Errors', marker='o')
    
    # Add labels and title
    ax.set_xlabel('X Axis')
    ax.set_ylabel('Error')
    ax.set_title('Zoomable Graph with Test and Train Errors')
    
    # Enable interactive cursor for zooming
    mplcursors.cursor(hover=True)
    
    # Add legend
    plt.legend()
    
    # Display the plot
    plt.show()

In [58]:
# condition for more interested vehicle, which are non-accidental and has odometer reading
condition = (all_data['Odometer_Reading_Present'] != 0) & (all_data['accidental_vehicle'].isna())
data_more_liked = all_data[condition]
data_less_liked = all_data[~condition]
data_more_liked = data_more_liked.drop(columns=['Odometer_Reading_Present', 'accidental_vehicle'], axis=1)
data_less_liked = data_less_liked.drop(columns=['Odometer_Reading_Present', 'accidental_vehicle'], axis=1)

print('data_more_liked=', data_more_liked.shape)
print('data_less_liked=', data_less_liked.shape)

max_depths = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
num_estimators = [3, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100]

x_axis = []
test_errors = []
train_errors = []
for depth in max_depths:
    for learning_rate in learning_rates:
        for estimator_count in num_estimators:
            model_for_more_liked_data = XGBRegressor(enable_categorical=True, eval_metric='mae', booster='dart', max_depth=depth, learning_rate=learning_rate, n_estimators=estimator_count, random_state=42)
            test_error, train_error  = evaulate_data(data_more_liked, model_for_more_liked_data)
            if (test_error < 150000) :
                x_axis.append(f"{depth} {learning_rate} {estimator_count}")
                test_errors.append(test_error)
                train_errors.append(train_error)

plot_errors(x_axis, test_errors, train_errors)

data_more_liked= (344, 7)
data_less_liked= (158, 7)


In [161]:
model_for_more_liked_data = XGBRegressor(enable_categorical=True, eval_metric='mae', booster='dart', max_depth=8, learning_rate=0.525, n_estimators=20, random_state=42)
model_for_less_liked_data = XGBRegressor(enable_categorical=True, eval_metric='mae', booster='dart', max_depth=8, learning_rate=0.525, n_estimators=20, random_state=42)

evaulate_data(data_more_liked, model_for_more_liked_data)
evaulate_data(data_less_liked, model_for_less_liked_data)

(np.float64(173779.677734375), np.float64(53.39930555555556))

In [158]:
# return integer month, from string, default is 6
def month_to_number(month):
    if pd.isna(month):
        return 6
    try:
        # Try to convert directly to an integer
        month_number = int(month)
        if 1 <= month_number <= 12:
            return month_number
    except ValueError:
        # If month is a string, convert it to lower case and map to month number
        month = month.lower()
        month_dict = {
            'january': 1, 'february': 2, 'march': 3,
            'april': 4, 'may': 5, 'june': 6,
            'july': 7, 'august': 8, 'september': 9,
            'october': 10, 'november': 11, 'december': 12
        }
        return month_dict.get(month, 6)  # Return 6 if month is not recognized
    return 6

# Function to calculate count of months since manufacting data(year, month)
def calculate_months_since_manufactured(row, current_year, current_month):
    year, month = row['year_of_vehicle_manufacture'], row['month_of_vehicle_manufacture']
    return (current_year - year) * 12 + (current_month - month)
    
# Fuction to mark columns as category type
def convert_columns_as_category(columns, data):
    for column in columns:
        data[column] = data[column].astype('category')

# Convert values to uppercase
def convert_values_to_uppercase(columns, data):
    for column in columns:
        data[column] = data[column].str.strip()
        data[column] = data[column].str.upper()

# Replace text which is matching with a pattern
def replace_text(pattern, data, column):
    data[column] = data[column].str.replace(pattern, '', regex=True)

# Prune data
def prune_data(data):
    # set month number, considering string, int, nan cases 
    data['month_of_vehicle_manufacture'] = data['month_of_vehicle_manufacture'].apply(month_to_number)
    # add months_since_manufactured column
    data['months_since_manufactured'] = data.apply(calculate_months_since_manufactured, axis=1, current_year=datetime.now().year, current_month=datetime.now().month)
    # remove redundant features
    data = data.drop(columns=['month_of_vehicle_manufacture', 'year_of_vehicle_manufacture', 'registered_color'], axis=1)
    # convert values to uppercase
    columns_to_be_upper_cased = ['car_variant', 'vehicle_fuel_type', 'vehicle_make', 'vehicle_model', 'vehicle_make', 'city']
    convert_values_to_uppercase(columns_to_be_upper_cased, data)
    # remove yyyy-yyyy or yyyy from vehicle_model
    pattern = r'\b\d{4}-\d{4}\b|\b\d{4}\b'
    replace_text(pattern, data, 'vehicle_model')
    # mark columns as category type
    categorical_columns = ['car_variant', 'vehicle_fuel_type', 'vehicle_make', 'vehicle_model', 'city']
    convert_columns_as_category(categorical_columns, data)
    return data

# Data segregation basis on likeliness
def separate_on_the_basis_of_likeliness(data):
    # condition for more interested vehicle is non-accidental and has odometer reading
    condition = (data['Odometer_Reading_Present'] != 0) & (data['accidental_vehicle'].isna())
    data_more_liked = data[condition]
    data_less_liked = data[~condition]
    # remove redundant columns
    data_more_liked = data_more_liked.drop(columns=['Odometer_Reading_Present', 'accidental_vehicle'], axis=1)
    data_less_liked = data_less_liked.drop(columns=['Odometer_Reading_Present', 'accidental_vehicle'], axis=1)
    return data_more_liked, data_less_liked

# return ids and predicted outcome using model
def run_model(data, model):
    ids = data['id']
    data = data.drop(columns=['id'], axis=1)
    predicted_output = model.predict(data)
    return ids, predicted_output

# predict outcomes and save in file
def predict_output_separately_and_save(ids_original_order, data_more_liked, model_for_more_liked_data, data_less_liked, model_for_less_liked_data):
    ids_more_liked, prediction_for_more_liked_data = run_model(data_more_liked, model_for_more_liked_data)
    ids_less_liked, prediction_for_less_liked_data = run_model(data_less_liked, model_for_less_liked_data)
    ids = []
    for id_more_liked in ids_more_liked:
        ids.append(id_more_liked)
    for id_less_liked in ids_less_liked:
        ids.append(id_less_liked)
    
    car_valuations = []
    for valuation in prediction_for_more_liked_data:
        car_valuations.append(int(valuation))
    for valuation in prediction_for_less_liked_data:
        car_valuations.append(int(valuation))
    print(len(car_valuations))
    
    df = pd.DataFrame({
        'id': ids,
        'car_valuation': car_valuations
    })
    df.to_csv('car_valuation_data.csv', index=False)

def predict_output(data, model_for_more_liked_data, model_for_less_liked_data):
    ids = data['id']
    data = prune_data(data)
    data.to_csv('pruned_data.csv', index=False)
    data_more_liked, data_less_liked = separate_on_the_basis_of_likeliness(data)
    print(data_more_liked.shape, data_less_liked.shape)
    predict_output_separately_and_save(ids, data_more_liked, model_for_more_liked_data, data_less_liked, model_for_less_liked_data)

In [159]:
test_data = pd.read_csv('test.csv')
predict_output(test_data, model_for_more_liked_data, model_for_less_liked_data)

(48, 8) (22, 8)


ValueError: feature_names mismatch: ['car_variant', 'odometer_reading', 'vehicle_fuel_type', 'vehicle_model', 'city', 'months_since_manufactured'] ['car_variant', 'odometer_reading', 'vehicle_fuel_type', 'vehicle_make', 'vehicle_model', 'city', 'months_since_manufactured']
training data did not have the following fields: vehicle_make

In [49]:
evaulate_data(data_more_liked, model_for_more_liked_data)
evaulate_data(data_less_liked, model_for_less_liked_data)

test_error=  195406.97282608695  train_error=  15087.300227272728
test_error=  268460.466796875  train_error=  13984.654761904761


(np.float64(268460.466796875), np.float64(13984.654761904761))

In [None]:
train_data = all_data.sample(frac = 0.8)
test_data = all_data.drop(train_data.index)

In [None]:
train_inputs = train_data.drop(columns=['car_valuation'], axis=1)
train_output = train_data['car_valuation']

test_inputs = test_data.drop(columns=['car_valuation'], axis=1)
test_output = test_data['car_valuation']

In [None]:
train_inputs.columns

In [None]:
print(train_inputs.shape)
print(train_output.shape)
print(test_inputs.shape)
print(test_output.shape)

In [None]:
model = XGBRegressor(enable_categorical=True, booster='dart', max_depth=100, learning_rate=0.1, n_estimators=50)

model.fit(train_inputs, train_output)

In [None]:
# mean absolute error on test data
test_predicted_output = model.predict(test_inputs)
test_error = mean_absolute_error(test_predicted_output, test_output)

# mean absolute error on train data
train_prediction = model.predict(train_inputs)
train_error = mean_absolute_error(train_prediction, train_output)
print("test_error= ",test_error," train_error= ", train_error)

In [None]:
feature_importances = model.feature_importances_
print(feature_importances)

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'Feature': train_inputs.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print(importance_df)

# Visualize the feature importances
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.gca().invert_yaxis()
plt.show()



In [160]:
pwd

'/Users/vikash.yadav/Documents/data_science_hackathon'