In [1]:
import pandas as pd
import numpy as np
import glob
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
mpl.rcParams['figure.figsize'] = (8,6)
mpl.rcParams['axes.grid'] = False

### Importing Dataset

In [None]:
# first we are importing control group dataset
directory1 = 'C:/Users/Rizwan/Desktop/DDM/empathy/data/control_group'

#loading all the csv files
files_path1 = glob.glob(directory1 + "/*.csv")
all_dfss = []

for file in files_path1:
    df = pd.read_csv(file)
    all_dfss.append(df)

control_group = pd.concat(all_dfss, ignore_index=True)

print(control_group)


In [None]:
control_group.shape

we have total 157050 rows and 71 columns

In [None]:
control_group.info()

In [None]:
control_group.describe()

In [None]:
# creating a copy of control_group dataframe
df1 = control_group.copy()

In [None]:
df1.head()

In [None]:
## converting all the datatypes with the actual data types


# Converting these columns into datetime their unit are milisecond
ml_date_cols = ['Recording timestamp', 'Computer timestamp', 'Recording duration', 'Eyetracker timestamp']

for col in ml_date_cols:
    df1[col] = pd.to_datetime(df1[col], unit='ms')


# Converting these columns into datetime 
date_cols = ['Export date', 'Recording date', 'Recording date UTC', 'Recording start time', 'Recording start time UTC']

for col in date_cols:
    df1[col] = pd.to_datetime(df1[col])


# Converting these columns into float and replace the (,) with the decimal point (.)

split_cols = ['Gaze direction left X','Gaze direction left Y','Gaze direction left Z','Gaze direction right X','Gaze direction right Y','Gaze direction right Z',
           'Pupil diameter left','Pupil diameter right','Eye position left X (DACSmm)','Eye position left Y (DACSmm)','Eye position left Z (DACSmm)',
           'Eye position right X (DACSmm)','Eye position right Y (DACSmm)','Eye position right Z (DACSmm)','Gaze point left X (DACSmm)',
           'Gaze point left Y (DACSmm)','Gaze point right X (DACSmm)','Gaze point right Y (DACSmm)','Gaze point X (MCSnorm)','Gaze point Y (MCSnorm)',
           'Gaze point left X (MCSnorm)','Gaze point left Y (MCSnorm)','Gaze point right X (MCSnorm)','Gaze point right Y (MCSnorm)',
           'Fixation point X (MCSnorm)','Fixation point Y (MCSnorm)'
       ]

for column in split_cols:
    df1[column] =df1[column].str.replace(',', '.').astype(float)
    

In [None]:
df1.head()

In [None]:
df1.info()

all data types are converted to their respective data Because this is the tie series dataset so we need to set the index to Recording timestamp

In [None]:
# Set the index to Recording timestamp because this is the time series dataset
df1.set_index('Recording timestamp', inplace=True)

In [None]:
df1.head()

I did this below part in assignment 1 to remove those columns which are constant  here is the GIT HUB LINK 
https://github.com/rizwannathani/EmpathyWork/blob/02037b141959633e8d823fab2d9c3e414ee9ad58/DDM%20Assignment%201.ipynb

In [None]:
#Removing these columns which has no change
df1 = df1.drop(['Project name', 'Export date','Recording date','Computer timestamp','Recording date','Recording date UTC','Recording start time'
                ,'Recording start time UTC','Timeline name','Event', 'Event value', 'Mouse position X', 'Mouse position Y','Recording Fixation filter name',
                'Recording software version', 'Recording resolution height', 'Eyetracker timestamp', 'Recording resolution width', 'Recording monitor latency','Unnamed: 0'], axis=1)
df1.head()

### Finding the Nan values

In [None]:
# making a list of all the nan values which are present in out dataset
na_features = [features for features in df1.columns if df1[features].isnull().sum()>1]

for features in na_features:
    print(features, '=', np.round(df1[features].isnull().mean(),2)*100, '% Missing Values')

no features are having more than 80% nan values

In [None]:
#we are doing imputation to remove the missing data by backward and forward fill technique
df1.interpolate(method='ffill', inplace=True)
df1.interpolate(method='bfill', inplace=True)

In [None]:
# lets check again how many total missing data we have

miss_count = df1.isna().sum()
total_na = miss_count.sum()
print(f'Total NaN values: {total_na}')


### Finding numerical features

In [None]:
#finding the columns which are having numerical data
numerical_features = [feature for feature in df1.columns if df1[feature].dtypes !='O']

print('Number of Numerical Columns: ', len(numerical_features))

df1[numerical_features].head()

### Finding categorical features

In [None]:
#finding the columns which are having non numeric data
categorical_features=[feature for feature in df1.columns if df1[feature].dtypes=='O']
categorical_features

In [None]:
df1[categorical_features].head()

In [None]:
#finding how many categories we have in a categorical columns
for feature in categorical_features:
    print('The feature is {} and number of categories are {}'.format(feature,len(df1[feature].unique())))

In [None]:
#let find the frequecny of eye type movement we have 
eye_movement_counts = df1['Eye movement type'].value_counts()

plt.pie(eye_movement_counts, labels=None, autopct='%1.1f%%')

plt.legend(eye_movement_counts.index, title='Eye Movement Types', loc='center left', bbox_to_anchor=(1.1, 0.5))

plt.title('Percentage of Eye Movement Types')

plt.show()


In [None]:
#finding the column name
df1.columns

Due to the complexity of the data there are so many rows for each participants respect to each trials (i.e Recording name). We will take a median of the numerical columns to deal with the outliers 

In [None]:
# Exclude the columns ['Recording duration'] from numerical features
numerical_features = [col for col in numerical_features if col not in ['Recording duration']]

# Create a new dataframe to store the median values with participant name and recording name
median_df = pd.DataFrame(columns=['Participant name', 'Recording name'] + numerical_features)

# Group by ['Participant name'] and each ['Recording name']
grouped_df = df1.groupby(['Participant name', 'Recording name'])

# Calculate the median of each numerical column for each group
for name, group in grouped_df:
    medians = [np.median(group[col]) for col in numerical_features]
    medians.insert(0, name[0]) # Participant name
    medians.insert(1, name[1]) # Recording name
    median_df.loc[len(median_df)] = medians


median_df.head(15)


now we have a dataframe contain all the median values of numerical features with respect to the each participant and recording name now we do counts for the categorical features

In [None]:
df = df1.copy()
# defining the categorical features 
categorical_columns = ['Sensor', 'Validity left', 'Validity right', 'Presented Stimulus name', 'Presented Media name','Eye movement type']

# Define the features to group by
group_by_columns = ['Participant name', 'Recording name']

# making a new DataFrame to store the counts
counts_df = pd.DataFrame()

for col in categorical_columns:
    col_counts = df.groupby(group_by_columns)[col].value_counts().unstack(fill_value=0)
    col_counts.columns = [f"{col}_{val}" for val in col_counts.columns]
    counts_df = pd.concat([counts_df, col_counts], axis=1)

counts_df = counts_df.reset_index()

# Merge the counts_df DataFrame with the median_df DataFrame
final_df1 = pd.merge(median_df, counts_df, on=['Participant name', 'Recording name'])


In [None]:
final_df1.head(20)

Now we have our final dataframe which has all the median values of numerical data and the count of categorical data now we need to add a empathy score in this datafram for making of our model

In [None]:
final_df = final_df1.copy()

In [None]:
# Define regular expression to extract numbers
pattern = re.compile(r'\d+')

# Extract numbers from strings and create new column
final_df1['Participant name'] = final_df1['Participant name'].apply(lambda x: int(pattern.search(x).group()))
final_df1 = final_df1.rename(columns={'Participant name': 'Participant no'})


# Output result
final_df1.head()


Now we need to add a empathy score to our final dataframe because we have 2 questions dataset 1A and 1B and both have different empathy score we will take an average of these 2 scores and then add the average score in our final datafarme with respect to each particpant and recording(i.e trail)

In [None]:
questions1A = pd.read_csv('C:/Users/Rizwan/Desktop/DDM/empathy/data/questions/Questionnaire_datasetIA.csv', encoding= 'unicode_escape')
questions1B = pd.read_csv('C:/Users/Rizwan/Desktop/DDM/empathy/data/questions/Questionnaire_datasetIB.csv', encoding= 'unicode_escape')

# calculate the average total score extended from both files
avg_emp_scores = pd.concat([questions1A, questions1B])\
    .groupby(['Participant nr'])['Total Score extended']\
    .mean()\
    .reset_index()\
    .rename(columns={'Total Score extended': 'Avg Empathy score'})

# create a new dataframe score_df with the average total score extended
score_df = pd.merge(questions1A[['Participant nr']], avg_emp_scores, on='Participant nr', how='left')
score_df = score_df.rename(columns={'Participant nr': 'Participant no'})


In [None]:
score_df.head()

Here we have got the average Empathy score with respect to each participant now will going to add this in our final dataframe

In [None]:
final_df1 = final_df1.merge(score_df, on='Participant no', how='left')

In [None]:
final_df1.head(20)

Now we have complete one dataframe which has all the median values and counts of categorical data and also the average of the empathy score will use this dataframe to create our model

In [None]:
#saving this file to csv
final_df1.to_csv("finaldata.csv")

In [None]:
#raeding that file
finaldf1 = pd.read_csv("finaldata.csv")
finaldf1.head()

In [None]:
# removing first index
finaldf1 = finaldf1.drop(finaldf1.columns[0], axis=1)


In [None]:
finaldf1.head()

## Feature Selection

In [None]:
# Separate target variable and features
X = finaldf1.drop(['Avg Empathy score','Participant no','Recording name'], axis=1)
y = finaldf1['Avg Empathy score']

# Select top 15 features based on correlation with avg empathy score variable
selector = SelectKBest(f_regression, k=15)
X_selected = selector.fit_transform(X, y)
mask = selector.get_support() 
selected_feat = X.columns[mask] 

print("Selected Features:", selected_feat)


In [None]:
X = df[selected_feat]

## Applying Regression Models

In [None]:
df = finaldf1.copy()

# Split data into training and testing sets will use selected_feat which we found before
X_train, X_test, y_train, y_test = train_test_split(df[selected_feat], df['Avg Empathy score'], test_size=0.2, random_state=42)

# making multiple regression models using cross-validation
models = {'Linear Regression': LinearRegression(),
          'Ridge Regression': Ridge(),
          'Lasso Regression': Lasso(),
          'Random Forest Regression': RandomForestRegressor()}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f'{name}:')
    print(f'R-squared scores: {scores}')
    print(f'Mean R-squared score: {scores.mean():.3f}')
    print('')

# Output the best model based on cross-validation scores
best_model = max(models, key=lambda x: cross_val_score(models[x], X_train, y_train, cv=5, scoring='r2').mean())
print(f'Best model: {best_model}')


with the Cross validation approach for model evaluation we got Lasso Regression as the best model with Mean R-squared score of 0.689

### Adding some more important features

Because in research paper it was mentioned that the pupil diameter is important so we will add pupil diameter faeture and also the count of fixation occur because fixation occurs the most in the eye movement

In [None]:
df = finaldf1.copy()
selected_feat1 = ['Gaze point Y', 'Gaze point left Y', 'Gaze point right Y',
       'Gaze direction left Y', 'Gaze direction right Y',
       'Eye position left X (DACSmm)', 'Eye position left Y (DACSmm)',
       'Eye position right X (DACSmm)', 'Eye position right Y (DACSmm)',
       'Gaze point left Y (DACSmm)', 'Gaze point right Y (DACSmm)',
       'Gaze point Y (MCSnorm)', 'Gaze point right Y (MCSnorm)',
       'Gaze event duration', 'Fixation point Y','Eye movement type_Fixation','Pupil diameter left', 'Pupil diameter right']


# Split data into training and testing sets will use selected_feat which we found before
X_train, X_test, y_train, y_test = train_test_split(df[selected_feat], df['Avg Empathy score'], test_size=0.2, random_state=42)

# making multiple regression models using cross-validation
models = {'Linear Regression': LinearRegression(),
          'Ridge Regression': Ridge(),
          'Lasso Regression': Lasso(),
          'Random Forest Regression': RandomForestRegressor()}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f'{name}:')
    print(f'R-squared scores: {scores}')
    print(f'Mean R-squared score: {scores.mean():.3f}')
    print('')

# Output the best model based on cross-validation scores
best_model = max(models, key=lambda x: cross_val_score(models[x], X_train, y_train, cv=5, scoring='r2').mean())
print(f'Best model: {best_model}')


After adding ['Eye movement type_Fixation','Pupil diameter left', 'Pupil diameter right'] our model and got the same Mean R-squared score: 0.68 of lasso regression

Now we are start working on Test Group 

### Importing Test Dataset

In [None]:
# first we are importing test group dataset
directory1 = 'C:/Users/Rizwan/Desktop/DDM/empathy/data/test_group'

#loading all the csv files
files_path1 = glob.glob(directory1 + "/*.csv")
all_dfss = []

for file in files_path1:
    df = pd.read_csv(file)
    all_dfss.append(df)

test_group = pd.concat(all_dfss, ignore_index=True)

print(test_group)


In [None]:
test_group.shape

we have total 211474 rows and 71 columns

In [None]:
test_group.info()

In [None]:
test_group.describe()

In [None]:
# creating a copy of control_group dataframe
df2 = test_group.copy()

In [None]:
df2.head()

In [None]:
## converting all the datatypes with the actual data types


# Converting these columns into datetime their unit are milisecond
ml_date_cols = ['Recording timestamp', 'Computer timestamp', 'Recording duration', 'Eyetracker timestamp']

for col in ml_date_cols:
    df2[col] = pd.to_datetime(df2[col], unit='ms')


# Converting these columns into datetime 
date_cols = ['Export date', 'Recording date', 'Recording date UTC', 'Recording start time', 'Recording start time UTC']

for col in date_cols:
    df2[col] = pd.to_datetime(df2[col])


# Converting these columns into float and replace the (,) with the decimal point (.)

split_cols = ['Gaze direction left X','Gaze direction left Y','Gaze direction left Z','Gaze direction right X','Gaze direction right Y','Gaze direction right Z',
           'Pupil diameter left','Pupil diameter right','Eye position left X (DACSmm)','Eye position left Y (DACSmm)','Eye position left Z (DACSmm)',
           'Eye position right X (DACSmm)','Eye position right Y (DACSmm)','Eye position right Z (DACSmm)','Gaze point left X (DACSmm)',
           'Gaze point left Y (DACSmm)','Gaze point right X (DACSmm)','Gaze point right Y (DACSmm)','Gaze point X (MCSnorm)','Gaze point Y (MCSnorm)',
           'Gaze point left X (MCSnorm)','Gaze point left Y (MCSnorm)','Gaze point right X (MCSnorm)','Gaze point right Y (MCSnorm)',
           'Fixation point X (MCSnorm)','Fixation point Y (MCSnorm)'
       ]

for column in split_cols:
    df2[column] =df2[column].str.replace(',', '.').astype(float)
    

In [None]:
df2.head()

In [None]:
df2.info()

all data types are converted to their respective data Because this is the tie series dataset so we need to set the index to Recording timestamp

In [None]:
# Set the index to Recording timestamp because this is the time series dataset
df2.set_index('Recording timestamp', inplace=True)

In [None]:
df2.head()

Removing below columns I did in assigment 1 here is the link

https://github.com/rizwannathani/Data-Science-and-Decision-Making/blob/63026313a95f1bc904268c27947f3c863fec2a84/DDM%20Assignment%201.ipynb

In [None]:
#Removing these columns which has no change
df2 = df2.drop(['Project name', 'Export date','Recording date','Computer timestamp','Recording date','Recording date UTC','Recording start time'
                ,'Recording start time UTC','Timeline name','Event', 'Event value', 'Mouse position X', 'Mouse position Y','Recording Fixation filter name',
                'Recording software version', 'Recording resolution height', 'Eyetracker timestamp', 'Recording resolution width', 'Recording monitor latency','Unnamed: 0'], axis=1)
df2.head()

### Finding the Nan values

In [None]:
# making a list of all the nan values which are present in out dataset
na_features = [features for features in df2.columns if df2[features].isnull().sum()>1]

for features in na_features:
    print(features, '=', np.round(df2[features].isnull().mean(),2)*100, '% Missing Values')

no features are having more than 80% nan values

In [None]:
#we are doing imputation to remove the missing data by backward and forward fill technique
df2.interpolate(method='ffill', inplace=True)
df2.interpolate(method='bfill', inplace=True)

In [None]:
# lets check again how many total missing data we have

miss_count = df2.isna().sum()
total_na = miss_count.sum()
print(f'Total NaN values: {total_na}')


### Finding numerical features

In [None]:
#finding the columns which are having numerical data
numerical_features = [feature for feature in df2.columns if df2[feature].dtypes !='O']

print('Number of Numerical Columns: ', len(numerical_features))

df2[numerical_features].head()

### Finding categorical features

In [None]:
#finding the columns which are having non numeric data
categorical_features=[feature for feature in df2.columns if df2[feature].dtypes=='O']
categorical_features

In [None]:
df2[categorical_features].head()

In [None]:
#finding how many categories we have in a categorical columns
for feature in categorical_features:
    print('The feature is {} and number of categories are {}'.format(feature,len(df2[feature].unique())))

In [None]:
#let find the frequecny of eye type movement we have 
eye_movement_counts = df2['Eye movement type'].value_counts()

plt.pie(eye_movement_counts, labels=None, autopct='%1.1f%%')

plt.legend(eye_movement_counts.index, title='Eye Movement Types', loc='center left', bbox_to_anchor=(1.1, 0.5))

plt.title('Percentage of Eye Movement Types')

plt.show()


Due to the complexity of the data there are so many rows for each participants respect to each trials (i.e Recording name). We will take a median of the numerical columns to deal with the outliers

In [None]:
# Exclude the columns ['Recording duration'] from numerical features
numerical_features = [col for col in numerical_features if col not in ['Recording duration']]

# Create a new dataframe to store the median values with participant name and recording name
median_df2 = pd.DataFrame(columns=['Participant name', 'Recording name'] + numerical_features)

# Group by ['Participant name'] and each ['Recording name'] (i.e each trial)
grouped_df = df2.groupby(['Participant name', 'Recording name'])

# Calculate the median of each numerical column for each group
for name, group in grouped_df:
    medians = [np.median(group[col]) for col in numerical_features]
    medians.insert(0, name[0]) # Participant name
    medians.insert(1, name[1]) # Recording name
    median_df2.loc[len(median_df2)] = medians


median_df2.head(15)


now we have a dataframe contain all the median values of numerical features with respect to the each participant and recording name now we do counts for the categorical features

In [None]:
df = df2.copy()
# defining the categorical features 
categorical_columns = ['Sensor', 'Validity left', 'Validity right', 'Presented Stimulus name', 'Presented Media name','Eye movement type']

# Define the features to group by
group_by_columns = ['Participant name', 'Recording name']

# making a new DataFrame to store the counts
counts_df = pd.DataFrame()

for col in categorical_columns:
    col_counts = df.groupby(group_by_columns)[col].value_counts().unstack(fill_value=0)
    col_counts.columns = [f"{col}_{val}" for val in col_counts.columns]
    counts_df = pd.concat([counts_df, col_counts], axis=1)

counts_df = counts_df.reset_index()

# Merge the counts_df DataFrame with the median_df DataFrame
final_df2 = pd.merge(median_df2, counts_df, on=['Participant name', 'Recording name'])


In [None]:
final_df2.head(20)

Now we have our final dataframe which has all the median values of numerical data and the count of categorical data now we need to add a empathy score in this datafram for making of our model

In [None]:
# Define regular expression to extract numbers
pattern = re.compile(r'\d+')

# Extract numbers from strings and create new column
final_df2['Participant name'] = final_df2['Participant name'].apply(lambda x: int(pattern.search(x).group()))
final_df2 = final_df2.rename(columns={'Participant name': 'Participant no'})


# Output result
final_df2.head()


In [None]:
final_df2.head(21)

Now we need to add a empathy score to our final dataframe because we have 2 questions dataset 1A and 1B and both have different empathy score we will take an average of these 2 scores and then add the average score in our final datafarme with respect to each particpant and recording(i.e trail)

we did this above so we just merge those empathy score to corespond participant

In [None]:
final_df2 = final_df2.merge(score_df, on='Participant no', how='left')

In [None]:
final_df2.head(10)

Now we have complete one dataframe which has all the median values and counts of categorical data and also the average of the empathy score will use this dataframe to create our model

In [None]:
#saving this file to csv
final_df2.to_csv("finaldata2.csv")

In [None]:
#reading that file
final_df2 = pd.read_csv("finaldata2.csv")
final_df2.head()

In [None]:
# removing first index
final_df2 = final_df2.drop(final_df2.columns[0], axis=1)


In [None]:
final_df2.head()

## Feature Selection

In [None]:
# Separate target variable and features
X = final_df2.drop(['Avg Empathy score','Participant no','Recording name'], axis=1)
y = final_df2['Avg Empathy score']

# Select top 15 features based on correlation with target variable
selector = SelectKBest(f_regression, k=15)
X_selected = selector.fit_transform(X, y)
mask = selector.get_support()  
selected_feat2 = X.columns[mask]  

print("Selected Features:", selected_feat2)


We have selected the top 5 features which are in selected_feat2 variable we use these features to make regression models

## Applying Regression Models

In [None]:
df = final_df2.copy()

# Split data into training and testing sets will use selected_feat which we found before
X_train, X_test, y_train, y_test = train_test_split(df[selected_feat2], df['Avg Empathy score'], test_size=0.2, random_state=42)

# making multiple regression models using cross-validation
models = {'Linear Regression': LinearRegression(),
          'Ridge Regression': Ridge(),
          'Lasso Regression': Lasso(),
          'Random Forest Regression': RandomForestRegressor()}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f'{name}:')
    print(f'R-squared scores: {scores}')
    print(f'Mean R-squared score: {scores.mean():.3f}')
    print('')

# Output the best model based on cross-validation scores
best_model = max(models, key=lambda x: cross_val_score(models[x], X_train, y_train, cv=5, scoring='r2').mean())
print(f'Best model: {best_model}')


with the Cross validation approach for model evaluation we got Random Forest Regression as the best model with Mean R-squared score of 0.70

### Adding some more important features

Because in research paper it was mentioned that the pupil diameter is important so we will add pupil diameter faeture and also the count of fixation occur because fixation occurs the most in the eye movement

In [None]:
# using cross validation

df = final_df2.copy()
selected_feature = ['Gaze point X', 'Gaze point right X', 'Gaze point right Y',
       'Gaze direction right X', 'Eye position left X (DACSmm)',
       'Eye position right X (DACSmm)', 'Gaze point right X (DACSmm)',
       'Gaze point right Y (DACSmm)', 'Gaze point X (MCSnorm)',
       'Gaze point right X (MCSnorm)', 'Gaze event duration',
       'Fixation point X', 'Fixation point X (MCSnorm)',
       'Presented Stimulus name_103111957_1133015250388940_5990313860353693579_n (1)',
       'Presented Media name_103111957_1133015250388940_5990313860353693579_n.jpg',
       'Eye movement type_Fixation','Pupil diameter left', 'Pupil diameter right']

# Split data into training and testing sets will use selected_feat which we found before
X_train, X_test, y_train, y_test = train_test_split(df[selected_feature], df['Avg Empathy score'], test_size=0.2, random_state=42)

# making multiple regression models using cross-validation
models = {'Linear Regression': LinearRegression(),
          'Ridge Regression': Ridge(),
          'Lasso Regression': Lasso(),
          'Random Forest Regression': RandomForestRegressor()}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f'{name}:')
    print(f'R-squared scores: {scores}')
    print(f'Mean R-squared score: {scores.mean():.3f}')
    print('')

# Output the best model based on cross-validation scores
best_model = max(models, key=lambda x: cross_val_score(models[x], X_train, y_train, cv=5, scoring='r2').mean())
print(f'Best model: {best_model}')


After adding ['Eye movement type_Fixation','Pupil diameter left', 'Pupil diameter right'] these features we increase the performance of our model and got Mean R-squared score: 0.685 of Linear Regression