In [7]:
import pandas as pd
import os
# Load the train data
cwd = os.getcwd()
train_data = pd.read_csv(os.path.join(cwd, 'data', 'train.csv'))
# Print the first few rows of the data
print("Data Preview:")
print(train_data.head())
# Get the data description
data_description = f"Data Description:\n"
data_description += f"Number of rows: {train_data.shape[0]}\n"
data_description += f"Number of columns: {train_data.shape[1]}\n"

# Get data types for each column
data_types = "Data Types:\n"
data_types += str(train_data.dtypes) + "\n"
# Check for missing values
missing_values = "Missing Values:\n"
missing_values += str(train_data.isnull().sum()) + "\n"

# Check for unique values in each column
unique_values = "Unique Values in Each Column:\n"
for col in train_data.columns:
    unique_values += f"Column: {col}, Unique Values: {train_data[col].nunique()}\n"

# Check for categorical columns
categorical_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']
categorical_cols_str = "Categorical Columns:\n"
categorical_cols_str += str(categorical_cols) + "\n"

# Check for numerical columns
numerical_cols = [col for col in train_data.columns if train_data[col].dtype != 'object']
numerical_cols_str = "Numerical Columns:\n"
numerical_cols_str += str(numerical_cols) + "\n"

# Check for imbalanced classes (if target variable is categorical)
target_col = 'target'  # assume the target variable is named 'target'
if target_col in train_data.columns and train_data[target_col].dtype == 'object':
    class_distribution = "Class Distribution:\n"
    class_distribution += str(train_data[target_col].value_counts()) + "\n"

# Generate the info.txt file
with open('info.txt', 'w') as f:
    f.write("Machine Learning Problem Information:\n\n")
    f.write("This file contains information about the machine learning problem, including data description, data types, missing values, unique values, categorical and numerical columns, and class distribution (if applicable).\n\n")
    f.write(data_description)
    f.write(data_types)
    f.write(missing_values)
    f.write(unique_values)
    f.write(categorical_cols_str)
    f.write(numerical_cols_str)
    if target_col in train_data.columns and train_data[target_col].dtype == 'object':
        f.write(class_distribution)

print("Info.txt file generated successfully!")


Data Preview:
   id    brand          model  model_year  milage fuel_type  \
0   0     Ford   F-150 Lariat        2018   74349  Gasoline   
1   1      BMW          335 i        2007   80000  Gasoline   
2   2   Jaguar      XF Luxury        2009   91491  Gasoline   
3   3      BMW   X7 xDrive40i        2022    2437    Hybrid   
4   4  Pontiac  Firebird Base        2001  111000  Gasoline   

                                              engine  \
0      375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
1  300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
2       300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel   
3  335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
4      200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col       accident clean_title  \
0                    10-Speed A/T    Blue    Gray  None reported         Yes   
1                     6-Speed M/T   Black   Black  None reported         Yes   
2                     6-Speed 

In [68]:
import pandas as pd
import os
import json

# Load the train data
cwd = os.getcwd()
train_data = pd.read_csv(os.path.join(cwd, 'data', 'train.csv'))
# Print the first few rows of the data
print("Data Preview:")
print(train_data.head())
# Get the data description
data_description = {
    "Machine Learning Problem Information": "This file contains information about the machine learning problem, including data description, data types, missing values, unique values, categorical and numerical columns, and class distribution (if applicable).",
    "Data Description": {
        "number_of_rows": train_data.shape[0],
        "number_of_columns": train_data.shape[1]
    }
}
# Get data types for each column
data_types = {
    "Data Types": "This section describes the data types for each column.",
    "types": {col: str(dtype) for col, dtype in zip(train_data.columns, train_data.dtypes)}
}

missing_values = "Missing Values:\n"
missing_values += str(train_data.isnull().sum()) + "\n"

# Check for missing values
missing_values = {
    "Missing Values": "This section describes the number of missing values in each column.",
    "counts": {col: sum(train_data[col].isnull()) for col in train_data.columns}
}
# Check for unique values in each column
unique_values = {
    "Unique Values in Each Column": "This section describes the number of unique values in each column.",
    "counts": {col: train_data[col].nunique() for col in train_data.columns}
}
# Check for categorical columns
categorical_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']
categorical_cols_info = {
    "Categorical Columns": "This section lists the categorical columns in the dataset.",
    "columns": categorical_cols
}
# Check for numerical columns
numerical_cols = [col for col in train_data.columns if train_data[col].dtype != 'object']
numerical_cols_info = {
    "Numerical Columns": "This section lists the numerical columns in the dataset.",
    "columns": numerical_cols
}
# Check for imbalanced classes (if target variable is categorical)
target_col = 'price'  # assume the target variable is named 'target'
if target_col in train_data.columns and train_data[target_col].dtype == 'object':
    class_distribution = {
        "Class Distribution": "This section describes the class distribution of the target variable.",
        "distribution": {cls: count for cls, count in train_data[target_col].value_counts().items()}
    }
# Generate the info.json file
info_dict = {
    **data_description,
    **data_types,
    **missing_values,
    "Unique Values in Each Column": unique_values,
    "Categorical Columns": categorical_cols_info,
    "Numerical Columns": numerical_cols_info
}

if target_col in train_data.columns and train_data[target_col].dtype == 'object':
    info_dict["Class Distribution"] = class_distribution
with open('info.json', 'w') as f:
    json.dump(info_dict, f, indent=4)

print("Info.json file generated successfully!")    


Data Preview:
   id    brand          model  model_year  milage fuel_type  \
0   0     Ford   F-150 Lariat        2018   74349  Gasoline   
1   1      BMW          335 i        2007   80000  Gasoline   
2   2   Jaguar      XF Luxury        2009   91491  Gasoline   
3   3      BMW   X7 xDrive40i        2022    2437    Hybrid   
4   4  Pontiac  Firebird Base        2001  111000  Gasoline   

                                              engine  \
0      375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
1  300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
2       300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel   
3  335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
4      200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col       accident clean_title  \
0                    10-Speed A/T    Blue    Gray  None reported         Yes   
1                     6-Speed M/T   Black   Black  None reported         Yes   
2                     6-Speed 

In [69]:
import json
import math

def get_evaluation_metric(info_json, target_variable):
    # Load the info.json file
    with open(info_json, 'r') as f:
        info = json.load(f)

    # Get the data types of the columns
    data_types = info['types']

    # Check if the target variable is numerical or categorical
    if data_types[target_variable] == 'int64' or data_types[target_variable] == 'float64':
        # Numerical target variable, suggest evaluation metrics
        evaluation_metrics = ['Mean Squared Error (MSE)', 'Mean Absolute Error (MAE)', 'R-Squared', 'Root Mean Squared Error (RMSE)']
    else:
        # Categorical target variable, suggest evaluation metrics
        evaluation_metrics = ['Accuracy', 'F1-Score', 'Precision', 'Recall']

    # Create a dictionary to store the results
    results = {
        'Evaluation Metrics': evaluation_metrics,
        'Target Variable': target_variable,
        'Data Type': data_types[target_variable]
    }

    # Save the results to a new JSON file
    with open('evaluation_metrics.json', 'w') as f:
        json.dump(results, f, indent=4)

    print("Results saved to evaluation_metrics.json")




In [41]:
# Example usage
get_evaluation_metric('info.json', 'price')

Results saved to evaluation_metrics.json


In [73]:
import json
import pandas as pd
# Load info.json file
with open('info.json') as f:
    info_data = json.load(f)
# Load evaluation_metrics.json file
with open('evaluation_metrics.json') as f:
    eval_metrics_data = json.load(f)
# Extract essential information from info.json
num_rows = info_data['Data Description']['number_of_rows']
num_cols = info_data['Data Description']['number_of_columns']
data_types = info_data['types']
missing_values = info_data['counts']
unique_values = info_data['Unique Values in Each Column']
categorical_cols = info_data['Categorical Columns']['columns']
numerical_cols = info_data['Numerical Columns']['columns']
# Extract essential information from evaluation_metrics.json
target_variable = eval_metrics_data['Target Variable']
target_data_type = eval_metrics_data['Data Type']
evaluation_metrics = eval_metrics_data['Evaluation Metrics']
# Infer problem type based on target variable data type
if target_data_type == 'int64' or target_data_type == 'float64':
    problem_type = 'regression'
else:
    problem_type = 'classification'
# Ask for optional problem type input
optional_problem_type = input("Is this a regression problem or a classification problem? (optional, default is inferred) ")
if optional_problem_type:
    problem_type = optional_problem_type
# Decide on specific metric importance based on evaluation metrics
specific_metric_importance = 'all metrics are equally importance'
if 'Mean Squared Error (MSE)' in evaluation_metrics:
    specific_metric_importance = 'Mean Squared Error (MSE) is the most important metric'
elif 'Mean Absolute Error (MAE)' in evaluation_metrics:
    specific_metric_importance = 'Mean Absolute Error (MAE) is the most important metric'

# Decide on important features based on data types and missing values
important_features = [col for col in data_types if data_types[col] != 'object' and missing_values[col] == 0]
# Decide on missing value strategy based on missing values counts
if max(missing_values.values()) > 0:
    missing_value_strategy = 'imputation with mean/median'
else:
    missing_value_strategy = 'no missing values, no strategy needed'
# Decide on data distribution assumptions based on data types
data_distribution_assumptions = 'normality assumed'
if any(col in categorical_cols for col in data_types):
    data_distribution_assumptions = 'non-normality assumed due to categorical features'
# Decide on desired outcome based on target variable and problem type
desired_outcome = 'predict the target  variable with high accuracy'
if problem_type == 'regression':
    desired_outcome = 'predict the continuous target variable with high accuracy'
else:
    desired_outcome = 'predict the categorical target variable with high accuracy'
# Create full JSON prompt with added comments
prompt = {
    # Problem type inferred based on target variable data type
    "problem_type": problem_type,
    # Metrie decided based on evaluation metrics
    "specific_metric_importance": specific_metric_importance,
    # Important features decided based on data types and missing values
    "important_features": important_features,
    # Missing value strategy decided based on missing values counts
    "missing_value_strategy": missing_value_strategy,
    # Data distribution assumptions decided based on data types
    "data_distribution_assumptions": data_distribution_assumptions,
    # Desired outcome decided based on target variable and problem type
    "desired_outcome": desired_outcome,
    # Information from evaluation_metrics.json
    "target_variable": target_variable,
    "target_data_type": target_data_type,
    "evaluation_metrics": evaluation_metrics,
    # Information from info.json
    "num_rows": num_rows,
    "num_cols": num_cols,
    "data_types": data_types,
    "missing_values": missing_values,
    "unique_values": unique_values,
    "categorical_cols": categorical_cols,
    "numerical_cols": numerical_cols,
    "info": {
        # Data description section
        "Data Description": info_data["Data Description"],
        # Data types section
        "Data Types": info_data["Data Types"],
        # Missing values section
        "Missing Values": info_data["Missing Values"],
        # Unique values section
        "Unique Values in Each Column": info_data["Unique Values in Each Column"],
        # Categorical columns section
        "Categorical Columns": info_data["Categorical Columns"],
        # Numerical columns section
        "Numerical Columns": info_data["Numerical Columns"]
    },
    # Evaluation metrics section
    "evaluation_metrics": eval_metrics_data
}
# Save the JSON prompt to a file
with open('output.json', 'w') as f:
    json.dump(prompt, f, indent=4)
print("Full JSON prompt:")
print(json.dumps(prompt, indent=4))


Full JSON prompt:
{
    "problem_type": "regression",
    "specific_metric_importance": "Mean Squared Error (MSE) is the most important metric",
    "important_features": [
        "id",
        "model_year",
        "milage",
        "price"
    ],
    "missing_value_strategy": "no missing values, no strategy needed",
    "data_distribution_assumptions": "non-normality assumed due to categorical features",
    "desired_outcome": "predict the continuous target variable with high accuracy",
    "target_variable": "price",
    "target_data_type": "int64",
    "evaluation_metrics": {
        "Evaluation Metrics": [
            "Mean Squared Error (MSE)",
            "Mean Absolute Error (MAE)",
            "R-Squared",
            "Root Mean Squared Error (RMSE)"
        ],
        "Target Variable": "price",
        "Data Type": "int64"
    },
    "num_rows": 54273,
    "num_cols": 13,
    "data_types": {
        "id": "int64",
        "brand": "object",
        "model": "object",
      

In [77]:
import json
import pandas as pd
import os

# Load info.json file
with open('info.json') as f:
    info_data = json.load(f)

# Load evaluation_metrics.json file
with open('evaluation_metrics.json') as f:
    eval_metrics_data = json.load(f)

# Function to generate prompt based on provided data and target variable
def generate_prompt(target_variable, target_data_type):
    # Extract essential information from info.json
    num_rows = info_data['Data Description']['number_of_rows']
    num_cols = info_data['Data Description']['number_of_columns']
    data_types = info_data['types']
    missing_values = info_data['counts']
    categorical_cols = [col for col, dtype in data_types.items() if dtype == 'object']
    numerical_cols = [col for col, dtype in data_types.items() if dtype in ['int64', 'float64']]
    
    # Extract essential information from evaluation_metrics.json
    evaluation_metrics = eval_metrics_data['Evaluation Metrics']

    # Infer problem type based on target variable data type
    if target_data_type == 'int64' or target_data_type == 'float64':
        problem_type = 'regression'
    else:
        problem_type = 'classification'

    # Construct the prompt
    prompt = f"""
    You are tasked with recommending the most suitable machine learning methods to solve the following problem:

    ## Problem Description
    We are working on a {problem_type} problem to predict the {target_variable}. The dataset contains various features related to the data. The goal is to build a model that can accurately predict the {target_variable} based on these features.

    ## Dataset Information
    - Number of Rows: {num_rows}
    - Number of Columns: {num_cols}
    - Target Variable: {target_variable} (data type: {target_data_type})
    - Data Types: {data_types}
    - Missing Values: {missing_values}
    - Categorical Columns: {categorical_cols}
    - Numerical Columns: {numerical_cols}

    ## Evaluation Metrics
    The performance of the model will be evaluated using the following metrics:
    - {'\n- '.join(evaluation_metrics)}

    ## Specific Requirements and Constraints
    - Handle missing values appropriately. 
    - The target variable is {target_variable}, which is of type {target_data_type}.
    - Some features are categorical and will need to be encoded properly.
    - There might be non-linear relationships between the features and the target variable.

    ## Additional Information
    - Important features to consider include numerical columns without missing values and categorical columns.
    - Assume normality for numerical features but be cautious of non-normality due to categorical features.

    ## Task
    Recommend the most suitable machine learning methods and preprocessing steps to solve this {problem_type} problem, considering the given dataset, requirements, and evaluation metrics. Provide a detailed explanation for your recommendations, including:
    1. Data preprocessing steps to handle missing values and encode categorical variables.
    2. Feature selection or extraction methods.
    3. Suitable {problem_type} algorithms.
    4. Model evaluation techniques and any potential model improvement strategies.
    """

    return prompt

# Example usage with a specific target variable
target_variable = 'price'
target_data_type = 'int64'
prompt = generate_prompt(target_variable, target_data_type)

print("Full prompt:")
print(prompt)


Full prompt:

    You are tasked with recommending the most suitable machine learning methods to solve the following problem:

    ## Problem Description
    We are working on a regression problem to predict the price. The dataset contains various features related to the data. The goal is to build a model that can accurately predict the price based on these features.

    ## Dataset Information
    - Number of Rows: 54273
    - Number of Columns: 13
    - Target Variable: price (data type: int64)
    - Data Types: {'id': 'int64', 'brand': 'object', 'model': 'object', 'model_year': 'int64', 'milage': 'int64', 'fuel_type': 'object', 'engine': 'object', 'transmission': 'object', 'ext_col': 'object', 'int_col': 'object', 'accident': 'object', 'clean_title': 'object', 'price': 'int64'}
    - Missing Values: {'id': 0, 'brand': 0, 'model': 0, 'model_year': 0, 'milage': 0, 'fuel_type': 0, 'engine': 0, 'transmission': 0, 'ext_col': 0, 'int_col': 0, 'accident': 0, 'clean_title': 0, 'price': 0}
 