# The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes.
Apply feature selection techniques

Attribute Information:
1. mpg: continuous
2. cylinders: multi-valued discrete
3. displacement: continuous
4. horsepower: continuous
5. weight: continuous
6. acceleration: continuous
7. model year: multi-valued discrete
8. origin: multi-valued discrete
9. car name: string (unique for each instance)

In [82]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Q1. Import libraries and Load the Data file into Python DataFrame and view top 2 rows

In [66]:
df=pd.read_csv("car-mpg.csv")
df.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [67]:
df.shape

(398, 10)

# Q2.  Print the datatypes of each column and check for missing values if any. Perform descriptive analysis

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
 9   car_name  398 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB


In [69]:
df.isnull().sum()

mpg         0
cyl         0
disp        0
hp          0
wt          0
acc         0
yr          0
origin      0
car_type    0
car_name    0
dtype: int64

In [70]:
df.isna().sum()

mpg         0
cyl         0
disp        0
hp          0
wt          0
acc         0
yr          0
origin      0
car_type    0
car_name    0
dtype: int64

In [71]:
df.value_counts()

mpg   cyl  disp   hp   wt    acc   yr  origin  car_type  car_name                            
9.0   8    304.0  193  4732  18.5  70  1       0         hi 1200d                                1
27.0  4    151.0  90   2950  17.3  82  1       1         chevrolet camaro                        1
           140.0  86   2790  15.6  82  1       1         ford mustang gl                         1
           112.0  88   2640  18.6  82  1       1         chevrolet cavalier wagon                1
           101.0  83   2202  15.3  76  2       1         renault 12tl                            1
                                                                                                ..
18.6  6    225.0  110  3620  18.7  78  1       0         dodge aspen                             1
18.5  8    360.0  150  3940  13.0  79  1       0         chrysler lebaron town @ country (sw)    1
      6    250.0  98   3525  19.0  77  1       0         ford granada                            1
               

In [72]:
df.describe()

Unnamed: 0,mpg,cyl,disp,wt,acc,yr,origin,car_type
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864,0.530151
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055,0.499718
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0,1.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0,1.0


In [73]:
df_final=df.drop("car_name",axis=1)

In [74]:
df_final.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,1,0
1,15.0,8,350.0,165,3693,11.5,70,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0
3,16.0,8,304.0,150,3433,12.0,70,1,0
4,17.0,8,302.0,140,3449,10.5,70,1,0


In [75]:
df_final[df_final["hp"]=="?"]
df_final.replace('?', pd.NA, inplace=True)
df_final[df_final["hp"]=="?"]
df_final['hp'] = pd.to_numeric(df_final['hp'], errors='coerce')
median_value = df_final['hp'].median()
df_final['hp'].fillna(median_value, inplace=True)

# Q3. Split the dataset into training and test sets and fit the model. Also calculate R square

In [76]:
X=df_final.iloc[:,1:]
y=df_final['mpg']
scaler = StandardScaler()
X_scaled=scaler.fit_transform(X)
print(X.head())
print(y.head())
print(X.info())
print(y.info())
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)



   cyl   disp     hp    wt   acc  yr  origin  car_type
0    8  307.0  130.0  3504  12.0  70       1         0
1    8  350.0  165.0  3693  11.5  70       1         0
2    8  318.0  150.0  3436  11.0  70       1         0
3    8  304.0  150.0  3433  12.0  70       1         0
4    8  302.0  140.0  3449  10.5  70       1         0
0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   cyl       398 non-null    int64  
 1   disp      398 non-null    float64
 2   hp        398 non-null    float64
 3   wt        398 non-null    int64  
 4   acc       398 non-null    float64
 5   yr        398 non-null    int64  
 6   origin    398 non-null    int64  
 7   car_type  398 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 25.0 KB
None
<class 'pandas.core.series.Series'>
RangeInd

In [77]:
model = LinearRegression()
model.fit(X_train, y_train)

In [78]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Q4. Calculate Mean Square Error for both train and test set

In [79]:
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
# Output the results
print("Training Data:")
print(f"Mean Squared Error: {mse_train}")
print(f"R-squared: {r2_train}")

print("\nTesting Data:")
print(f"Mean Squared Error: {mse_test}")
print(f"R-squared: {r2_test}")

Training Data:
Mean Squared Error: 10.427315339825856
R-squared: 0.8336851100311102

Testing Data:
Mean Squared Error: 7.247917060646503
R-squared: 0.8651962719656998


# Q5. Evaluate the model using cross validation

In [87]:
num_folds = 50
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
results = cross_val_score(model, X_scaled, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Q6. Apply Feature selection method to find which features fits well and do the plotting

# Q7. Now build full model with selected features

# Q8. Build full model on all features as comparison