In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# Read the car dataset from the CSV next to the notebook and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='./autoR.csv')
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
# Summarize the frame: row count, per-column non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [4]:
# List every column label, one per line.
for column_name in df.columns:
    print(column_name)

mpg
cylinders
displacement
horsepower
weight
acceleration
model year
origin
car name


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [6]:
# Per-column dtypes of the loaded frame.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(398, 9)

In [8]:
# Row index of the frame.
df.index

RangeIndex(start=0, stop=398, step=1)

In [9]:
# Number of missing (NaN) values in each column.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [10]:
# Print the number of missing values in each column, one count per line, in
# column order. Looping over df.columns replaces nine copy-pasted statements
# with hard-coded column names and keeps working if columns are added or renamed.
for column_name in df.columns:
    print(df[column_name].isna().sum())

0
0
0
0
0
0
0
0
0


In [11]:
# Last five rows of the frame.
df.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger
397,31.0,4,119.0,82,2720,19.4,82,1,chevy s-10


In [12]:
# (rows, columns) of the frame.
# NOTE(review): repeats an earlier df.shape cell; no cell in between modifies df.
df.shape

(398, 9)

In [13]:
# Missing values per column (isnull is an alias of isna).
# NOTE(review): duplicates the earlier df.isna().sum() cell.
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [14]:
# Boolean mask marking the rows whose 'car name' is missing.
df['car name'].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
393    False
394    False
395    False
396    False
397    False
Name: car name, Length: 398, dtype: bool

In [15]:
# First two rows of the frame.
df.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320


In [16]:
# df['car name'] = pd.to_numeric(df['car name'])

In [17]:
# Map each distinct car name to an integer label.
# The fitted encoder stays bound to `le` so the mapping can be reused.
le = LabelEncoder()
le.fit(df['car name'])
car_name_encoded = le.transform(df['car name'])

In [18]:
# Attach the label-encoded car names to the frame as a new column (mutates df in place).
# NOTE(review): this column is not part of the feature list used to build X further down.
df['car_name_encoded'] = car_name_encoded

In [19]:
# Show the dtypes again now that car_name_encoded has been added.
print(df.dtypes)

mpg                 float64
cylinders             int64
displacement        float64
horsepower           object
weight                int64
acceleration        float64
model year            int64
origin                int64
car name             object
car_name_encoded      int64
dtype: object


In [20]:
# First three rows, including the new car_name_encoded column.
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_name_encoded
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,49
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,36
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,231


In [21]:
# Assemble the feature matrix X and the target y for the mpg regression.
#
# `horsepower` was loaded with dtype `object` (see the df.info() output), which
# suggests at least one entry is not a plain number, and scikit-learn estimators
# need numeric input. Coerce it to a numeric dtype (entries that cannot be parsed
# become NaN) and drop incomplete rows from X and y together so they stay aligned.
feature_columns = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
features = df[feature_columns].assign(
    horsepower=pd.to_numeric(df['horsepower'], errors='coerce')
)
complete_rows = features.notna().all(axis=1)

X = features.loc[complete_rows]
y = df.loc[complete_rows, 'mpg']

In [32]:
# Hold out 20% of the rows for testing. The fixed random_state makes the shuffle
# reproducible, so the split (and every metric computed from it) is the same on
# each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

In [33]:
# Preview the first rows of the feature matrix.
print(X.head())

   cylinders  displacement horsepower  weight  acceleration  model year  \
0          8         307.0        130    3504          12.0          70   
1          8         350.0        165    3693          11.5          70   
2          8         318.0        150    3436          11.0          70   
3          8         304.0        150    3433          12.0          70   
4          8         302.0        140    3449          10.5          70   

   origin  
0       1  
1       1  
2       1  
3       1  
4       1  


In [34]:
# Preview the target and count its missing values.
# The count uses a single isna() over the whole Series: chaining .isnull() onto
# the boolean mask returned by .isna() tests the mask itself (which never holds
# NaN), so it prints 0 even when the target has missing entries, and calling it
# on head() would only look at the first five rows.
print(y.head())
print(y.isna().sum())

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64
0


In [37]:
# regr = linear_model.LinearRegression()
# regr.fit(X_train, y_train)

# list(zip(X.columns, regr.coef_))

In [39]:
# #Mean Squared error and R-squared on the training set
# preds = regr.predict(X_train)
# mse = np.mean((preds - y_train) ** 2)
# rsq = regr.score(X_train, y_train)

# print('Mean Squared Error: %.4f \n R-squared: %.4f' % (mse,rsq))

In [40]:
# #Test model on held out test set
# #Mean Squared error on the testing set

# preds_  = regr.predict(X_test)
# mse_ = np.mean((preds_ - y_test) ** 2)
# rsq_ = regr.score(X_test, y_test)

# print("Mean Squared Error: %.4f \n R-squared: %.4f" % (mse_, rsq_))

In [41]:
# %pylab inline
# #Predicted vs. errors plot -> demonstrates an issue with this fit (high bias)


# plt.figure(figsize=(16,7))
# plt.scatter(regr.predict(X_train), regr.predict(X_train)-y_train)
# plt.plot([-5,40], [0,0], color = "red")


# #place testing data on the plot as well
# plt.scatter(regr.predict(X_test), regr.predict(X_test)-y_test, color="yellow")

In [None]:
# Cast 'car name' to the object dtype.
# NOTE(review): the dtype listings printed earlier already show 'car name' as
# object, so this looks like a no-op — confirm the intent.
df['car name'] = df['car name'].astype("object")

In [None]:
# Per-column dtypes after the cast above.
df.dtypes