## Nadav Mashiach
## Amit Stein
---

## Import Statements for Data Preparation and Elastic Net Modeling

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance


## Prepare the data, clean and parse

In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer

def prepare_data(raw_df, target='Price'):
    """Clean the raw car-listing table and engineer the modelling features.

    The input frame is copied first, so the caller's DataFrame is never modified.

    Steps, in order:
      1. Parse the numeric columns (stripping thousands separators) and fill
         missing values with each column's median.
      2. Fill missing values in text columns with the literal 'Unknown'.
      3. Drop rows whose 'Year' is an IQR outlier.
      4. Strip the manufactor name out of 'model' and build 'Model_Manufactor'.
      5. Parse the date columns and derive 'Season' from 'Cre_date'.
      6. Add 'Age' and 'Age_Hand_interaction'.
      7. Drop the columns that are not used any further.

    Args:
        raw_df: DataFrame containing at least 'capacity_Engine', 'Km', 'Pic_num',
            'Year', 'Hand', 'model', 'manufactor', 'Cre_date', 'Repub_date' and
            the `target` column.
        target: Name of the target column; it is cleaned like the other numeric
            columns.

    Returns:
        A new DataFrame with the cleaned columns plus 'Model_Manufactor',
        'Season', 'Age' and 'Age_Hand_interaction', and without 'Cre_date',
        'Repub_date', 'Description', 'Supply_score' and 'Test'.

    Raises:
        KeyError: If one of the required columns is missing from `raw_df`.
    """
    df = raw_df.copy()

    def clean_numeric(x):
        """Parses one cell: strings lose their commas and become numbers (NaN if unparseable)."""
        if isinstance(x, str):
            return pd.to_numeric(x.replace(',', ''), errors='coerce')
        return x

    def clean_and_impute_numeric_columns(df, numeric_columns):
        """Converts each listed column to numeric and fills gaps with the column median."""
        for col in numeric_columns:
            parsed = pd.to_numeric(df[col].apply(clean_numeric), errors='coerce')
            # The median is taken over all rows, i.e. before any outlier removal
            df[col] = parsed.fillna(parsed.median())
        return df

    def remove_manufactor_from_model(df):
        """Removes the manufactor name from the model column."""
        def strip_manufactor(model, manufactor):
            # Only strings can contain the manufactor name; NaN/None/numbers pass through unchanged
            if isinstance(model, str) and isinstance(manufactor, str):
                return model.replace(manufactor, '').strip()
            return model

        # Plain zip instead of a row-wise DataFrame.apply(axis=1): same result, far less overhead
        df['model'] = [
            strip_manufactor(model, manufactor)
            for model, manufactor in zip(df['model'], df['manufactor'])
        ]
        return df

    def impute_categorical_columns(df):
        """Fills missing values in text (object / string dtype) columns with 'Unknown'."""
        text_columns = [
            name for name, dtype in df.dtypes.items()
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        ]
        if text_columns:
            df[text_columns] = df[text_columns].fillna('Unknown')
        return df

    def convert_date_columns(df, date_columns):
        """Parses the listed columns as day/month/year dates; anything else becomes NaT."""
        for col in date_columns:
            df[col] = pd.to_datetime(df[col], format='%d/%m/%Y', errors='coerce')
        return df

    def remove_outliers(df, column):
        """Keeps only the rows whose `column` value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        in_range = (df[column] >= lower_bound) & (df[column] <= upper_bound)
        # Return an independent frame: later steps assign new columns to the result, and doing
        # that on a filtered slice triggers pandas' SettingWithCopyWarning.
        return df[in_range].copy()

    def create_season_column(df, date_column):
        """Creates a 'Season' column from the month of a date column ('Unknown' for NaT)."""
        df['Season'] = df[date_column].dt.month.map({
            12: 'Winter', 1: 'Winter', 2: 'Winter',
            3: 'Spring', 4: 'Spring', 5: 'Spring',
            6: 'Summer', 7: 'Summer', 8: 'Summer',
            9: 'Fall', 10: 'Fall', 11: 'Fall'
        }).fillna('Unknown')
        return df

    def create_model_manufactor_feature(df):
        """Creates 'Model_Manufactor' as '<model>_<manufactor>'."""
        df['Model_Manufactor'] = df['model'].astype(str) + '_' + df['manufactor'].astype(str)
        return df

    def create_derived_features(df):
        """Creates the derived features 'Age' and 'Age_Hand_interaction'."""
        # Age is relative to the year in which the code runs, so it changes from year to year
        current_year = pd.Timestamp.now().year
        df['Age'] = current_year - df['Year']
        df['Age_Hand_interaction'] = df['Age'] * df['Hand']
        return df

    def drop_unnecessary_columns(df, columns_to_drop):
        """Drops the listed columns; names that are not present are ignored."""
        return df.drop(columns=columns_to_drop, errors='ignore')

    # Define numeric columns
    # NOTE(review): the target is median-imputed together with the features, so rows with a
    # missing target receive a made-up label — confirm this is intended.
    numeric_columns = ['capacity_Engine', 'Km', 'Pic_num', 'Year', 'Hand', target]

    # Clean and impute numeric columns
    df = clean_and_impute_numeric_columns(df, numeric_columns)

    # Impute categorical columns
    df = impute_categorical_columns(df)

    # Remove outliers for 'Year'
    df = remove_outliers(df, 'Year')

    # Remove manufactor from model
    df = remove_manufactor_from_model(df)

    df = create_model_manufactor_feature(df)

    # Convert date columns
    date_columns = ['Cre_date', 'Repub_date']
    df = convert_date_columns(df, date_columns)

    # Create season column
    df = create_season_column(df, 'Cre_date')

    # Create derived features
    df = create_derived_features(df)

    # Drop unnecessary columns
    columns_to_drop = ['Cre_date', 'Repub_date', 'Description', 'Supply_score', 'Test']
    df = drop_unnecessary_columns(df, columns_to_drop)

    return df


#### Key Points:

1. **Data Cleaning and Imputation Functions**
    - **`def clean_numeric`**: This function cleans numeric data by removing thousands separators (commas) from string values and converting them to numeric types; strings that still cannot be parsed become missing values (NaN).
    - **`def clean_and_impute_numeric_columns`**: This function cleans and imputes numeric columns by filling missing values with the median value of each column.
    - **`def impute_categorical_columns`**: This function imputes missing values in categorical columns with a constant value, 'Unknown', ensuring no missing values remain.
    - **`def remove_outliers`**: This function removes outliers from a specified numeric column in a DataFrame using the Interquartile Range (IQR) method. The IQR is the width of the range that holds the middle 50% of the data (between the first quartile Q1 and the third quartile Q3); rows whose value lies more than 1.5 × IQR below Q1 or above Q3 are excluded. In this context, it was used specifically to remove outliers from the 'Year' column.


2. **Date Conversion and Season Column Creation**
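    - **`def convert_date_columns`**: This function parses the date columns into datetime values using the day/month/year format (`%d/%m/%Y`). Values that do not match this format, including missing entries, are converted to `NaT`, so each date column ends up with one consistent datetime type.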
    - **`def create_season_column`**: Recognizing that the original `Cre_date` column was not contributing effectively to the model, this function creates a new 'Season' column based on the month extracted from `Cre_date`.

3. **Derived Feature Creation**
    - **`def create_derived_features`**: Through analysis, we identified key columns that significantly impact the model's performance. This function creates new features based on these important columns, enhancing the model's predictive power.

4. **Dropping Unnecessary Columns**
    - **`def drop_unnecessary_columns`**: Based on the model's performance results and prior knowledge, this function removes columns that are deemed unnecessary. This step is crucial for improving the model's efficiency and reducing its complexity.


## Load the dataframe, and prepare the model

In [4]:
# Load data: read the dataset CSV directly from its raw GitHub URL
# (engine='python' selects pandas' Python parser; the reason is not stated — presumably the
# default parser had trouble with this file, TODO confirm)
raw_url = 'https://raw.githubusercontent.com/nadav52/Matala-2/main/dataset.csv'
df = pd.read_csv(raw_url, engine='python')

# Prepare data: run the cleaning / feature-engineering function defined above
# with its default target column ('Price')
prepared_df = prepare_data(df)

# Show (rows, columns) of the prepared frame as the cell output
prepared_df.shape

(1485, 19)

In [5]:
# Display the first rows of the prepared frame as a visual sanity check
prepared_df.head()

Unnamed: 0,manufactor,Year,model,Hand,Gear,capacity_Engine,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Color,Km,Model_Manufactor,Season,Age,Age_Hand_interaction
0,יונדאי,2015,i35,2,אוטומטית,1600.0,בנזין,פרטית,פרטית,רעננה - כפר סבא,רעננה,51000.0,2.0,כחול כהה מטאלי,144000.0,i35_יונדאי,Summer,9,18
1,ניסאן,2018,מיקרה,1,אוטומטית,1200.0,בנזין,פרטית,פרטית,מושבים בשרון,אבן יהודה,49000.0,0.0,כחול בהיר,69000.0,מיקרה_ניסאן,Spring,6,6
2,סוזוקי,2010,סוויפט,1,אוטומטית,1450.0,בנזין,Unknown,Unknown,רמת,רמת,22500.0,1.0,Unknown,145000.0,סוויפט_סוזוקי,Fall,14,14
3,טויוטה,2016,אוריס,1,טיפטרוניק,1600.0,בנזין,פרטית,פרטית,נס ציונה - רחובות,רחובות,63000.0,5.0,אפור מטאלי,27300.0,אוריס_טויוטה,Spring,8,8
4,קיה,2012,פיקנטו,1,אוטומטית,1248.0,בנזין,Unknown,Unknown,"ראשל""צ והסביבה",ראשון לציון,37000.0,1.0,Unknown,70000.0,פיקנטו_קיה,Summer,12,12



**After preprocessing, the dataset was refined to 1485 rows from an initial 1500, demonstrating improved data quality achieved through effective outlier handling and preparation steps, ensuring retention of valuable data points.**

In [6]:
# Feature groups consumed by the preprocessor: categorical columns and numeric columns.
# NOTE(review): 'Season' is produced by prepare_data but appears in neither list — confirm
# that leaving it out of the model is intentional.
cat_columns = [
    'manufactor', 'model', 'Gear', 'Engine_type', 'Area',
    'City', 'Color', 'Prev_ownership', 'Curr_ownership', 'Model_Manufactor',
]
numeric_columns = [
    'capacity_Engine', 'Km', 'Pic_num', 'Year',
    'Hand', 'Age', 'Age_Hand_interaction',
]

# Target vector and feature matrix (every column except the target)
y = prepared_df['Price']
X = prepared_df.drop(columns='Price')

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### Defining Categorical and Numeric Columns

Defining `cat_columns` and `numeric_columns` before splitting ensures consistent preprocessing for both training and test sets, preserving feature engineering integrity.

#### Train-Test Split

- **`test_size=0.2`**: Uses 20% of the data for testing, balancing between sufficient training data and a reliable test set.
- **`random_state=42`**: Ensures reproducibility, with `42` being a common choice for consistency across runs.


In [7]:
# Preprocessor: standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' lets the encoder accept categories at transform time that it did
# not see while fitting.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_columns),
])

# Learn the scaling / encoding on the training rows only, then reuse it for the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Rebuild readable names for the transformed columns: numeric names first (same order as the
# transformers above), then one "<column>_<category>" name per fitted category
feature_names_cat = [
    f"{col}_{val}"
    for col, fitted_categories in zip(cat_columns, preprocessor.named_transformers_['cat'].categories_)
    for val in fitted_categories
]
feature_names = [*numeric_columns, *feature_names_cat]


#### Defining the Preprocessor

The `preprocessor` is defined to apply different transformations to numeric and categorical columns, ensuring the data is standardized and properly encoded for the model.

#### Fitting and Transforming the Data

- **`X_train_transformed` and `X_test_transformed`**: The preprocessor is fitted on the training data and then applied to both training and test sets, maintaining consistency in preprocessing.

#### Generating Feature Names

- **`feature_names_cat`**: Extracts and formats the feature names for the encoded categorical variables, combining them with the numeric feature names for a complete list.


In [8]:
# Base estimator whose hyperparameters are tuned below
model = ElasticNet(random_state=42)

# Candidate values: overall penalty strength (alpha) and the L1/L2 mix (l1_ratio)
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 1.5],
    l1_ratio=[0.3, 0.5, 0.7, 0.9],
)

# Shuffled, seeded 10-fold splitter used for the cross-validation of the selected model
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over the grid with 10-fold CV, scored by negated MSE, on all CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train_transformed, y_train)

# Re-score the winning configuration on the training set with the shuffled splitter;
# `scores` holds one negated-MSE value per fold
best_model = grid_search.best_estimator_
scores = cross_val_score(
    best_model,
    X_train_transformed,
    y_train,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)


#### Parameter Grid for GridSearchCV

- **`param_grid`**: Specifies the range of hyperparameters (`alpha` and `l1_ratio`) to search for optimal values. Hyperparameters are parameters whose values are set before the learning process begins and control the model's learning process.

#### Performing GridSearchCV

- **`grid_search`**: Uses 10-fold cross-validation to find the best hyperparameters, optimizing for negative mean squared error.

  - **`scoring='neg_mean_squared_error'`**: This scoring method evaluates models by the negated mean squared error. Because scikit-learn always maximizes the score, higher values (closer to zero) indicate better model performance, i.e. a lower MSE.
  - **`n_jobs=-1`**: Utilizes all available CPU cores for parallel processing, speeding up the computation.

#### Evaluating the Model

- **`best_model`**: The best estimator from `GridSearchCV`.
- **`cross_val_score`**: Evaluates the model using 10-fold cross-validation on the training set to ensure robust performance metrics.

  - **`KFold(n_splits=10, shuffle=True, random_state=42)`**: Splits the data into 10 folds for cross-validation, shuffling the data to ensure randomness and setting a seed for reproducibility.
  


This code optimizes an ElasticNet model through GridSearchCV for hyperparameter tuning and K-Fold cross-validation for performance assessment, aiming to balance bias and variance, reduce overfitting, and ensure robust performance on unseen data by systematically exploring combinations of alpha and l1_ratio to find the best model configuration.

In [9]:
# Fit model on training data using best parameters
# NOTE(review): best_model comes from grid_search.best_estimator_, which GridSearchCV normally
# has already refitted on the full training data — this extra fit is presumably redundant; confirm.
best_model.fit(X_train_transformed, y_train)

# Predict on test set and compute the mean squared error against the held-out targets
y_pred = best_model.predict(X_test_transformed)
mse = mean_squared_error(y_test, y_pred)

In [10]:
# Coefficients of the fitted ElasticNet, one per transformed feature
coefficients = best_model.coef_

# Permutation importance on the training set: 10 seeded shuffles per transformed feature,
# summarised by the mean importance of each feature
importance = permutation_importance(
    best_model,
    X_train_transformed,
    y_train,
    n_repeats=10,
    random_state=42,
)
feature_importances = importance.importances_mean

# Pair every feature name with its mean importance and rank from most to least important
importance_dict = {name: score for name, score in zip(feature_names, feature_importances)}
sorted_importances = sorted(
    importance_dict.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

This code calculates permutation importance and extracts model coefficients to identify influential features. Permutation importance measures each feature's impact on model performance by randomly shuffling its values and observing the resulting change in model error. Model coefficients indicate feature weights in the ElasticNet model. Together, these methods provide complementary insights into feature relevance, aiding in model interpretation and potential feature selection.


## Results

In [11]:
# Report the test-set error as RMSE (square root of the MSE computed above)
print(f'The RMSE: {np.sqrt(mse):.3f}')

# Print the five features with the highest mean permutation importance, together with the
# size and direction of their ElasticNet coefficient
print("\nTop 5 most influential features:")
for feature, importance_value in sorted_importances[:5]:
    coefficient_index = feature_names.index(feature)
    coefficient_value = coefficients[coefficient_index]
    # ElasticNet's L1 penalty can shrink a coefficient to exactly zero; such a feature has no
    # direction and must not be reported as "negative".
    if coefficient_value > 0:
        sign = "positive"
    elif coefficient_value < 0:
        sign = "negative"
    else:
        sign = "no"
    print(f"{feature}: {importance_value:.3f}, Coefficient = {coefficient_value:.3f} ({sign} impact)")


The RMSE: 11056.732

Top 5 most influential features:
Age_Hand_interaction: 0.433, Coefficient = 10142.112 (positive impact)
Year: 0.387, Coefficient = 9675.055 (positive impact)
Age: 0.386, Coefficient = -9663.456 (negative impact)
Hand: 0.203, Coefficient = -7016.061 (negative impact)
capacity_Engine: 0.029, Coefficient = 2543.873 (positive impact)


---
The RMSE (Root Mean Squared Error) measures the typical size of the model's prediction error, calculated here as 11056.73 on the test set, in the same units as the price. Lower RMSE values indicate better model accuracy. An RMSE of 11056.73 indicates solid predictive accuracy for estimating car prices.

Top 5 Influential Features:
- **Age_Hand_interaction**: Positive impact (importance: 0.4328, Coefficient: 10142.1121). Indicates higher predicted prices as the product of age and hand increases, partially offsetting the negative `Age` and `Hand` effects below.
- **Year**: Positive impact (importance: 0.3866, Coefficient: 9675.0553). Cars from more recent model years have higher predicted prices.
- **Age**: Negative impact (importance: 0.3857, Coefficient: -9663.4561). Older cars have lower predicted prices; equivalently, newer cars tend to have higher predicted prices.
- **Hand**: Negative impact (importance: 0.2027, Coefficient: -7016.0609). Cars with more previous owners have lower predicted prices.
- **capacity_Engine**: Positive impact (importance: 0.0286, Coefficient: 2543.8732). Larger engine capacity is associated with higher predicted prices.

These insights help understand how each feature influences car price predictions in the model.
