In [10]:
from scripts.info_extractor import DataFrameInfo
from scripts.transformer import DataTransform

import pandas as pd
import numpy as np
import sklearn

### **Import and clean data**

In [11]:
data = pd.read_csv('Data/auto-mpg.csv')
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [12]:
# Let's look at the shape of the dataframe
data.shape

(398, 9)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [14]:
# No clear missing values or nulls/NaNs - they might be hidden as typos/inconsistent values. Follow up

In [15]:
# Let's first create instances of the DataFrameInfo class and the DataTransform class

info_extractor = DataFrameInfo(data)
transformer = DataTransform(data)

In [16]:
info_extractor.count_distinct_values()

Unnamed: 0_level_0,distinct_values_count
column,Unnamed: 1_level_1
mpg,129
cylinders,5
displacement,82
horsepower,94
weight,351
acceleration,95
model year,13
origin,3
car name,305


In [17]:
# Are there any duplicates? If yes, drop them
data.duplicated().value_counts()

# There are no duplicated rows in our dataset

False    398
Name: count, dtype: int64

In [18]:
# We can assume the model year column refers to the years 1970 - 1982. Need to fix datatype
# Origin seems to be a categorical variable that has already been encoded numerically 

info_extractor.show_distinct_values(['cylinders','model year', 'origin'])

Unique values in cylinders: [3 4 5 6 8]
Unique values in model year: [70 71 72 73 74 75 76 77 78 79 80 81 82]
Unique values in origin: [1 2 3]


In [19]:
# Need to convert origin column as categorical feature 
# When doing feature engineering later on: encode it as dummy variables (3-1)

data = transformer.convert_columns(column_list=['origin'], data_type='categorical')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float64 
 1   cylinders     398 non-null    int64   
 2   displacement  398 non-null    float64 
 3   horsepower    398 non-null    object  
 4   weight        398 non-null    int64   
 5   acceleration  398 non-null    float64 
 6   model year    398 non-null    int64   
 7   origin        398 non-null    category
 8   car name      398 non-null    object  
dtypes: category(1), float64(3), int64(3), object(2)
memory usage: 25.5+ KB


In [34]:
# Let's also convert model year to datetime data type 
data['model year'] = pd.to_datetime(data['model year']).dt.year
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float64 
 1   cylinders     398 non-null    int64   
 2   displacement  398 non-null    float64 
 3   horsepower    398 non-null    object  
 4   weight        398 non-null    int64   
 5   acceleration  398 non-null    float64 
 6   model year    398 non-null    int32   
 7   origin        398 non-null    category
 8   car name      398 non-null    object  
dtypes: category(1), float64(3), int32(1), int64(2), object(2)
memory usage: 24.0+ KB


In [37]:
data.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year
count,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,5140.0,24.8,82.0
