In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the bike specification data from CSV into a DataFrame.
df = pd.read_csv('bike_specs.csv')

In [3]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,Engine_Capacity,Max_Power,Mileage,Starting_Mechanism,Ignition,Gears,Price
0,1380.0 CC,96.60 bhp @ 6500 rpm,15 Km/l,Self Start,Fuel Injection,6 Speed,2778000
1,1745.0 CC,65.00 bhp @ 5500 rpm,15 Km/l,Self Start,Electronic Sequential Port Fuel Injection (ESPFI),6 Speed,3053000
2,1811.0 CC,100.00 bhp @ 3000 rpm,20 Km/l,Self Start,Closed Loop Fuel Injection,6 Speed,3201000
3,1745.0 CC,,15 Km/l,Self Start,Electronic Sequential Port Fuel Injection (ESPFI),6 Speed,3299000
4,1811.0 CC,100.00 bhp @ 8000 rpm,14 Km/l,Self Start,Closed Loop Fuel Injection,6 Speed,3350000


In [4]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 7 columns):
Engine_Capacity       13 non-null object
Max_Power             10 non-null object
Mileage               13 non-null object
Starting_Mechanism    13 non-null object
Ignition              13 non-null object
Gears                 13 non-null object
Price                 13 non-null int64
dtypes: int64(1), object(6)
memory usage: 856.0+ bytes


In [5]:
# Fill missing Max_Power entries with the column's most frequent value.
# value_counts() orders values by descending frequency, so element 0 is the top one.
power_counts = df['Max_Power'].value_counts()
Max_power_list = np.array(power_counts.index.tolist())
most_repeated = Max_power_list[0]
df['Max_Power'] = df['Max_Power'].fillna(most_repeated)

In [6]:
# Strip unit suffixes from the text columns and split Max_Power
# ("<power> bhp @ <speed> rpm") into separate power and engine-speed columns.
#
# The vectorised .str methods propagate NaN instead of raising on missing
# values, and .str.get(1) yields NaN (rather than IndexError) when an entry
# has no '@' part. The next cell drops rows that contain NaN.
df['Engine_Capacity'] = df['Engine_Capacity'].str.replace('CC', '', regex=False)
# Max_Power becomes a list of the pieces around '@'.
df['Max_Power'] = df['Max_Power'].str.split('@')
df['bhp'] = df['Max_Power'].str.get(0).str.replace('bhp', '', regex=False)
# NOTE: 'rmp' (sic) is the existing column name; later cells use this spelling.
df['rmp'] = df['Max_Power'].str.get(1).str.replace('rpm', '', regex=False)
df['Mileage'] = df['Mileage'].str.replace('Km/l', '', regex=False)

In [7]:
# Treat empty / whitespace-only strings as missing, then discard every row
# that has at least one missing value.
df = df.replace(r'^\s*$', np.nan, regex=True).dropna()

In [8]:
# Frequency of each distinct Ignition value.
df['Ignition'].value_counts()

Closed Loop Fuel Injection                           4
Fuel Injection                                       3
Electronic Sequential Port Fuel Injection (ESPFI)    3
Mitsubishi Electronic Fuel Injection System          1
Fuel injection: ø50 mm x 4 with dual injection       1
Name: Ignition, dtype: int64

In [9]:
# Fold rare Ignition categories into the most frequent category.
# A category is "rare" when it occurs fewer than MIN_IGNITION_COUNT times.
MIN_IGNITION_COUNT = 2

# Count once; value_counts() is ordered by descending frequency.
ignition_counts = df['Ignition'].value_counts()
Ignition_list = np.array(ignition_counts.index.tolist())
most_repeated = Ignition_list[0]
least_repeated = Ignition_list[ignition_counts.values < MIN_IGNITION_COUNT]

# Vectorised membership test instead of rebuilding a Python list for every row.
df['Ignition'] = df['Ignition'].mask(df['Ignition'].isin(least_repeated), most_repeated)

In [10]:
# One-hot encode Ignition. drop_first=True omits the first category's
# indicator column, leaving that category as the implicit baseline.
ign_df = pd.get_dummies(df.loc[:, 'Ignition'], drop_first=True)
# Append the indicator columns to the right of the existing frame.
df = pd.concat((df, ign_df), axis='columns')

In [11]:
# Preview the frame after cleaning and one-hot encoding.
df.head()

Unnamed: 0,Engine_Capacity,Max_Power,Mileage,Starting_Mechanism,Ignition,Gears,Price,bhp,rmp,Electronic Sequential Port Fuel Injection (ESPFI),Fuel Injection
0,1380.0,"[96.60 bhp , 6500 rpm]",15,Self Start,Fuel Injection,6 Speed,2778000,96.6,6500,0,1
1,1745.0,"[65.00 bhp , 5500 rpm]",15,Self Start,Electronic Sequential Port Fuel Injection (ESPFI),6 Speed,3053000,65.0,5500,1,0
2,1811.0,"[100.00 bhp , 3000 rpm]",20,Self Start,Closed Loop Fuel Injection,6 Speed,3201000,100.0,3000,0,0
3,1745.0,"[100.00 bhp , 3000 rpm]",15,Self Start,Electronic Sequential Port Fuel Injection (ESPFI),6 Speed,3299000,100.0,3000,1,0
4,1811.0,"[100.00 bhp , 8000 rpm]",14,Self Start,Closed Loop Fuel Injection,6 Speed,3350000,100.0,8000,0,0


In [12]:
# Remove columns that are not passed to the model. Max_Power was split into
# bhp / rmp and Ignition was one-hot encoded in earlier cells.
drop_cols = ['Max_Power', 'Starting_Mechanism', 'Ignition', 'Gears']
df = df.drop(columns=drop_cols)

In [13]:
# Inspect column dtypes to see which columns still need numeric conversion.
df.dtypes

Engine_Capacity                                      object
Mileage                                              object
Price                                                 int64
bhp                                                  object
rmp                                                  object
Electronic Sequential Port Fuel Injection (ESPFI)     uint8
Fuel Injection                                        uint8
dtype: object

In [14]:
# Cast the cleaned text columns to float, one column at a time.
# float() ignores surrounding whitespace left over from the unit stripping.
for numeric_col in ['Engine_Capacity', 'Mileage', 'bhp', 'rmp']:
    df[numeric_col] = df[numeric_col].map(float)

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [16]:
# Target vector: the Price column. Feature matrix: every other column.
y = df['Price'].values
X = df.loc[:, df.columns != 'Price'].values

In [17]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [18]:
# Fit a linear regression on the training split.
# fit() is left as the cell's last expression, so the fitted estimator's repr is displayed.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [19]:
# Evaluate the fitted model on the held-out test split (score() returns R^2 for LinearRegression).
model.score(X_test, y_test)  

0.7326334010388762