In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load the dataset from the CSV file into a DataFrame
data = 'data.csv'
df = pd.read_csv(data)

# Preview the first five rows
display(df.head(5))

# Summary statistics per column (count, mean, std, min, quartiles, max);
# left as the last expression so it is rendered as the cell's output
df.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307201,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090196,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [3]:
# Remove exact duplicate rows (keeping the first occurrence), modifying df in place
df.drop_duplicates(keep='first', inplace=True)

# Count the missing values in each column
print(df.isna().sum(), "\n")

# Print each column's dtype to spot any non-numeric (categorical) columns
print(df.dtypes)

X1    0
X2    0
X3    0
X4    0
X5    0
X6    0
X7    0
X8    0
Y1    0
Y2    0
dtype: int64 

X1    float64
X2    float64
X3    float64
X4    float64
X5    float64
X6      int64
X7    float64
X8      int64
Y1    float64
Y2    float64
dtype: object


In [4]:
# Separate the feature matrix (X) from the two target columns (Y1 and Y2)
X = df.drop(columns=['Y1', 'Y2'])
Y1 = df['Y1']
Y2 = df['Y2']

# Split into training (60%), validation (20%) and test (20%) sets.
# X, Y1 and Y2 go through the same train_test_split call so that both targets
# are partitioned along exactly the same rows as the features.
# Stage 1: hold out 40% of the rows.
(X_train, X_temp,
 Y1_train, Y1_temp,
 Y2_train, Y2_temp) = train_test_split(X, Y1, Y2, test_size=0.4, random_state=42)

# Stage 2: cut the held-out 40% in half -> 20% validation, 20% test.
(X_val, X_test,
 Y1_val, Y1_test,
 Y2_val, Y2_test) = train_test_split(X_temp, Y1_temp, Y2_temp, test_size=0.5, random_state=42)

In [5]:
# Candidate regression models to compare, all with default hyperparameters.
# The tree-based estimators draw random numbers while fitting, so they are
# given a fixed random_state (the same seed used for the data split) to make
# their scores reproducible across kernel restarts.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

In [6]:
# Fit every candidate on the training split and rank them by validation RMSE.
# Note: best_model holds the winning entry's key in `models` (its name),
# not the fitted estimator object.
best_model, best_rmse = None, float('inf')

for model_name, model in models.items():
    # fit() returns the estimator itself, so training and predicting are chained
    Y1_val_pred = model.fit(X_train, Y1_train).predict(X_val)
    rmse = sqrt(mean_squared_error(Y1_val, Y1_val_pred))

    print(f"{model_name} RMSE: {rmse}")

    # Strict comparison: on a tie the model listed earlier in `models` is kept
    if rmse < best_rmse:
        best_model, best_rmse = model_name, rmse

print(f"\nBest Model: {best_model} with RMSE: {best_rmse}")

Linear Regression RMSE: 3.188472770085113
Ridge Regression RMSE: 3.296544157328567
Lasso Regression RMSE: 4.888970619395164
Decision Tree RMSE: 0.6521044852937755
Random Forest RMSE: 0.5380238211063892

Best Model: Random Forest with RMSE: 0.5380238211063892
