# Pandas: grouping & joining

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cars = pd.read_csv("/content/drive/MyDrive/OLD_CURR/UNIT2/DAY2/data/vehicles.csv")
cars_original = cars.copy()

In [5]:
cars.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [6]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Make                     35952 non-null  object 
 1   Model                    35952 non-null  object 
 2   Year                     35952 non-null  int64  
 3   Engine Displacement      35952 non-null  float64
 4   Cylinders                35952 non-null  float64
 5   Transmission             35952 non-null  object 
 6   Drivetrain               35952 non-null  object 
 7   Vehicle Class            35952 non-null  object 
 8   Fuel Type                35952 non-null  object 
 9   Fuel Barrels/Year        35952 non-null  float64
 10  City MPG                 35952 non-null  int64  
 11  Highway MPG              35952 non-null  int64  
 12  Combined MPG             35952 non-null  int64  
 13  CO2 Emission Grams/Mile  35952 non-null  float64
 14  Fuel Cost/Year        

First exploration of the dataset:

- How many observations does it have?
- Look at all the columns: do you understand what they mean?
- Look at the raw data: do you see anything weird?
- Look at the data types: are they the expected ones for the information the column contains?

In [7]:
cars.shape

(35952, 15)

### Cleaning and wrangling data

- Some car brand names refer to the same brand. Replace all brand names that contain the word "Dutton" for simply "Dutton". If you find similar examples, clean their names too. Use `loc` with boolean indexing.

- Convert CO2 Emissions from Grams/Mile to Grams/Km

- Create a binary column that solely indicates if the transmission of a car is automatic or manual. Use `pandas.Series.str.startswith` and .

- Bonus: convert MPG columns to km_per_liter

In [8]:
cars['Make'].value_counts()

Chevrolet                 3643
Ford                      2946
Dodge                     2360
GMC                       2347
Toyota                    1836
                          ... 
Goldacre                     1
Fisker                       1
Lambda Control Systems       1
E. P. Dutton, Inc.           1
Aurora Cars Ltd              1
Name: Make, Length: 127, dtype: int64

In [9]:
list(cars['Make'].unique())

['AM General',
 'ASC Incorporated',
 'Acura',
 'Alfa Romeo',
 'American Motors Corporation',
 'Aston Martin',
 'Audi',
 'Aurora Cars Ltd',
 'Autokraft Limited',
 'BMW',
 'BMW Alpina',
 'Bentley',
 'Bertone',
 'Bill Dovell Motor Car Company',
 'Bitter Gmbh and Co. Kg',
 'Bugatti',
 'Buick',
 'CCC Engineering',
 'CX Automotive',
 'Cadillac',
 'Chevrolet',
 'Chrysler',
 'Consulier Industries Inc',
 'Dabryan Coach Builders Inc',
 'Dacia',
 'Daewoo',
 'Daihatsu',
 'Dodge',
 'E. P. Dutton, Inc.',
 'Eagle',
 'Environmental Rsch and Devp Corp',
 'Evans Automobiles',
 'Excalibur Autos',
 'Federal Coach',
 'Ferrari',
 'Fiat',
 'Fisker',
 'Ford',
 'GMC',
 'General Motors',
 'Genesis',
 'Geo',
 'Goldacre',
 'Grumman Allied Industries',
 'Grumman Olson',
 'Honda',
 'Hummer',
 'Hyundai',
 'Import Foreign Auto Sales Inc',
 'Import Trade Services',
 'Infiniti',
 'Isis Imports Ltd',
 'Isuzu',
 'J.K. Motors',
 'JBA Motorcars, Inc.',
 'Jaguar',
 'Jeep',
 'Kia',
 'Laforza Automobile Inc',
 'Lambda Control

In [10]:
"Dutton" in "E. P. Dutton, Inc."

True

In [11]:
"Dutton" in "Ford"

False

In [None]:
#new_values = []

#for i in cars['Make']:
#    if ( "Dutton" in i ):
#        new_values.append("Dutton")
#    else:
#        new_values.append(i)
#    
#cars['Make'] = new_values

In [None]:
#cars['Make'] = [ "Dutton" if ( "Dutton" in car ) else car for car in cars['Make'] ]

In [None]:
#cars['Make'] = list( map(lambda x: "Dutton" if( "Dutton" in x ) else x , cars['Make']) )

In [None]:
#cars['Make'] = np.where("Dutton" in cars['Make'],"Dutton",cars['Make'] )

Let'show to do the first replacement with:

* for loop
* **map( , )** and a lambda function
* np.where(condition, if condition True,if condition False)

In [12]:
#for make in cars['Make']:
#    if ( "Dutton" in make ):
#        make = "Dutton"

#y = lambda x: "Dutton" if ( "Dutton" in x ) else x
cars['Make'] = list(map(lambda x: "Dutton" if ( "Dutton" in x ) else x, cars['Make']))
#cars['Make'] = np.where("Dutton" in cars['Make'],"Dutton", cars['Make'])

In [13]:
cars['Make'].unique()

array(['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo',
       'American Motors Corporation', 'Aston Martin', 'Audi',
       'Aurora Cars Ltd', 'Autokraft Limited', 'BMW', 'BMW Alpina',
       'Bentley', 'Bertone', 'Bill Dovell Motor Car Company',
       'Bitter Gmbh and Co. Kg', 'Bugatti', 'Buick', 'CCC Engineering',
       'CX Automotive', 'Cadillac', 'Chevrolet', 'Chrysler',
       'Consulier Industries Inc', 'Dabryan Coach Builders Inc', 'Dacia',
       'Daewoo', 'Daihatsu', 'Dodge', 'Dutton', 'Eagle',
       'Environmental Rsch and Devp Corp', 'Evans Automobiles',
       'Excalibur Autos', 'Federal Coach', 'Ferrari', 'Fiat', 'Fisker',
       'Ford', 'GMC', 'General Motors', 'Genesis', 'Geo', 'Goldacre',
       'Grumman Allied Industries', 'Grumman Olson', 'Honda', 'Hummer',
       'Hyundai', 'Import Foreign Auto Sales Inc',
       'Import Trade Services', 'Infiniti', 'Isis Imports Ltd', 'Isuzu',
       'J.K. Motors', 'JBA Motorcars, Inc.', 'Jaguar', 'Jeep', 'Kia',
       '

In [14]:
cars['Make'] = list(map(lambda x: "BMW" if ( "BMW" in x ) else x, cars['Make']))

In [15]:
cars['Make'] = list(map(lambda x: "AMG" if ( "AM" in x ) else x, cars['Make']))

In [16]:
cars['Make'] = list(map(lambda x: "ASC" if ( "ASC " in x ) else x, cars['Make']))

In [17]:
cars['Make'] = list(map(lambda x: "Grumman" if ( "Grumman " in x ) else x, cars['Make']))

In [18]:
cars['Make'] = list(map(lambda x: "PAS, Inc" if ( "PAS " in x ) else x, cars['Make']))

In [19]:
cars['Make'].value_counts()

Chevrolet                           3643
Ford                                2946
Dodge                               2360
GMC                                 2347
Toyota                              1836
                                    ... 
General Motors                         1
Environmental Rsch and Devp Corp       1
London Taxi                            1
Lambda Control Systems                 1
Goldacre                               1
Name: Make, Length: 122, dtype: int64

Converting Grams/Mile to Grams/Km

1 Mile = 1.60934 Km

Grams/Mile * Mile/Km -> Grams/Mile * 1 Mile/1.60934Km

$$ \frac{Grams}{Mile} * \frac{Mile}{Km} $$

$$ \frac{Grams}{Mile} * \frac{1 Mile}{1.60934Km}  $$

In [20]:
list(cars.columns)

['Make',
 'Model',
 'Year',
 'Engine Displacement',
 'Cylinders',
 'Transmission',
 'Drivetrain',
 'Vehicle Class',
 'Fuel Type',
 'Fuel Barrels/Year',
 'City MPG',
 'Highway MPG',
 'Combined MPG',
 'CO2 Emission Grams/Mile',
 'Fuel Cost/Year']

In [21]:
cars['CO2 Emission Grams/Km'] = list(map(lambda x: x / 1.60934  ,cars['CO2 Emission Grams/Mile']))

In [22]:
list(cars.columns)

['Make',
 'Model',
 'Year',
 'Engine Displacement',
 'Cylinders',
 'Transmission',
 'Drivetrain',
 'Vehicle Class',
 'Fuel Type',
 'Fuel Barrels/Year',
 'City MPG',
 'Highway MPG',
 'Combined MPG',
 'CO2 Emission Grams/Mile',
 'Fuel Cost/Year',
 'CO2 Emission Grams/Km']

In [23]:
cars = cars.drop(columns="CO2 Emission Grams/Mile")
#cars.drop(columns="CO2 Emission Grams/Mile", inplace=True)

In [24]:
list(cars.columns)

['Make',
 'Model',
 'Year',
 'Engine Displacement',
 'Cylinders',
 'Transmission',
 'Drivetrain',
 'Vehicle Class',
 'Fuel Type',
 'Fuel Barrels/Year',
 'City MPG',
 'Highway MPG',
 'Combined MPG',
 'Fuel Cost/Year',
 'CO2 Emission Grams/Km']

Replacing the column `Transmission` with either Transmission or Manual

In [25]:
cars['Transmission'].head()

0    Automatic 3-spd
1    Automatic 3-spd
2    Automatic 3-spd
3    Automatic 3-spd
4    Automatic 4-spd
Name: Transmission, dtype: object

In [26]:
cars['Transmission'].unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [27]:
#cars['Transmission'].head()
cars[['Transmission']].head()


Unnamed: 0,Transmission
0,Automatic 3-spd
1,Automatic 3-spd
2,Automatic 3-spd
3,Automatic 3-spd
4,Automatic 4-spd


In [None]:
cars['Transmission'].unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [None]:
#def preprocess(df):
#    df2 = df.copy()
#    df2 = function1(df2, col)
#    df2 = function2(df2, col)
#    ....
#    return df2

#df = preprocess(df)

#def replace_auto(x):
#    if ( "Auto" in x ):
#        return "Automatic"
#    else:
#        return "Manual"

In [28]:
#cars['Transmission'] = list( map(replace_auto,cars['Transmission']) )
cars['Transmission'] = list( map(lambda x: "Automatic" if ("Auto" in x) else "Manual",cars['Transmission']) )

Bonus: convert MPG columns to km_per_liter

MPG = Miles/Gallon -> Km/Liter

1 Mile = 1.60934 Km

1 Gallon = 3.78541 Liters

$$ \frac{Miles}{Gallon} -> \frac{Miles}{Gallon} * \frac{Km}{Miles} * \frac{Gallon}{Liters}$$

$$ \frac{Miles}{Gallon} -> \frac{Miles}{Gallon} * \frac{1.60934Km}{ 1Miles} * \frac{1 Gallon}{3.78541 Liters}$$

* ( 1.60934 / 3.78541 )


In [None]:
list(cars.columns)

['Make',
 'Model',
 'Year',
 'Engine Displacement',
 'Cylinders',
 'Transmission',
 'Drivetrain',
 'Vehicle Class',
 'Fuel Type',
 'Fuel Barrels/Year',
 'City MPG',
 'Highway MPG',
 'Combined MPG',
 'Fuel Cost/Year',
 'CO2 Emission Grams/Km']

In [29]:
cars['City Km/Liter'] = list( map(lambda x: x * ( 1.60934 / 3.78541 ),cars['City MPG']) )

In [30]:
cars.drop(columns="City MPG", inplace=True)

In [31]:
cars['Highway Km/Liter'] = list( map(lambda x: x * ( 1.60934 / 3.78541 ),cars['Highway MPG']) )
cars.drop(columns="Highway MPG", inplace=True)

In [32]:
cars['Combined Km/Liter'] = list( map(lambda x: x * ( 1.60934 / 3.78541 ),cars['Combined MPG']) )
cars.drop(columns="Combined MPG", inplace=True)

### Gathering insights:

- How many car makers are there? How many models? Which car maker has the most cars in the dataset?

- When were these cars made?

- How big is the engine of these cars?

- What's the frequency of different transmissions, drivetrains and fuel types?

- What's the car that consumes the least/most fuel?

How many makes

In [33]:
len(cars['Make'].unique().tolist())

122

In [37]:
cars['Make'].value_counts()

Chevrolet                           3643
Ford                                2946
Dodge                               2360
GMC                                 2347
Toyota                              1836
                                    ... 
General Motors                         1
Environmental Rsch and Devp Corp       1
London Taxi                            1
Lambda Control Systems                 1
Goldacre                               1
Name: Make, Length: 122, dtype: int64

How many models

In [34]:
len(cars['Model'].unique().tolist())

3608

Which car Maker has more cars

In [38]:
make = cars['Make'].value_counts().index[0]
make

'Chevrolet'

In [41]:
cars.groupby('Make').count()

Unnamed: 0_level_0,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,Fuel Cost/Year,CO2 Emission Grams/Km,City Km/Liter,Highway Km/Liter,Combined Km/Liter
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AMG,4,4,4,4,4,4,4,4,4,4,4,4,4,4
ASC,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Acura,302,302,302,302,302,302,302,302,302,302,302,302,302,302
Alfa Romeo,41,41,41,41,41,41,41,41,41,41,41,41,41,41
American Motors Corporation,22,22,22,22,22,22,22,22,22,22,22,22,22,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Volkswagen,1047,1047,1047,1047,1047,1047,1047,1047,1047,1047,1047,1047,1047,1047
Volvo,717,717,717,717,717,717,717,717,717,717,717,717,717,717
Wallace Environmental,32,32,32,32,32,32,32,32,32,32,32,32,32,32
Yugo,8,8,8,8,8,8,8,8,8,8,8,8,8,8


In [42]:
cars.groupby('Make')['Model'].count()

Make
AMG                               4
ASC                               1
Acura                           302
Alfa Romeo                       41
American Motors Corporation      22
                               ... 
Volkswagen                     1047
Volvo                           717
Wallace Environmental            32
Yugo                              8
smart                            20
Name: Model, Length: 122, dtype: int64

In [43]:
cars.groupby('Make').count()['Model']

Make
AMG                               4
ASC                               1
Acura                           302
Alfa Romeo                       41
American Motors Corporation      22
                               ... 
Volkswagen                     1047
Volvo                           717
Wallace Environmental            32
Yugo                              8
smart                            20
Name: Model, Length: 122, dtype: int64

When the cars of the Make which has more cars were made?

In [44]:
cars[ cars['Make'] == "Chevrolet" ][['Make','Model','Year','Engine Displacement']] 

Unnamed: 0,Make,Model,Year,Engine Displacement
4275,Chevrolet,Astro 2WD (cargo),1985,2.5
4276,Chevrolet,Astro 2WD (cargo),1985,4.3
4277,Chevrolet,Astro 2WD (cargo),1985,4.3
4278,Chevrolet,Astro 2WD (cargo),1985,4.3
4279,Chevrolet,Astro 2WD (cargo),1985,2.5
...,...,...,...,...
7913,Chevrolet,Volt,2013,1.4
7914,Chevrolet,Volt,2014,1.4
7915,Chevrolet,Volt,2015,1.4
7916,Chevrolet,Volt,2016,1.5


In [45]:
cars['Transmission'].value_counts()

Automatic    24290
Manual       11662
Name: Transmission, dtype: int64

In [46]:
cars.columns

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'Fuel Cost/Year', 'CO2 Emission Grams/Km',
       'City Km/Liter', 'Highway Km/Liter', 'Combined Km/Liter'],
      dtype='object')

In [47]:
cars['Drivetrain'].value_counts()

Front-Wheel Drive             13044
Rear-Wheel Drive              12726
4-Wheel or All-Wheel Drive     6503
All-Wheel Drive                2039
4-Wheel Drive                  1058
2-Wheel Drive                   423
Part-time 4-Wheel Drive         158
2-Wheel Drive, Front              1
Name: Drivetrain, dtype: int64

In [48]:
cars['Fuel Type'].value_counts()

Regular                        23587
Premium                         9921
Gasoline or E85                 1195
Diesel                           911
Premium or E85                   121
Midgrade                          74
CNG                               60
Gasoline or natural gas           20
Premium and Electricity           20
Premium Gas or Electricity        17
Regular Gas and Electricity       16
Gasoline or propane                8
Regular Gas or Electricity         2
Name: Fuel Type, dtype: int64

CArs which consumes more or less at year.

Fuel Barrels/Year

In [49]:
cars['Fuel Barrels/Year'].max()

47.08714285714285

In [50]:
cars[ cars['Fuel Barrels/Year'] == cars['Fuel Barrels/Year'].max()]

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,Fuel Cost/Year,CO2 Emission Grams/Km,City Km/Liter,Highway Km/Liter,Combined Km/Liter
20894,Lamborghini,Countach,1986,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429,2.976
20895,Lamborghini,Countach,1987,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429,2.976
20896,Lamborghini,Countach,1988,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429,2.976
20897,Lamborghini,Countach,1989,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429,2.976
20898,Lamborghini,Countach,1990,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429,2.976


In [51]:
cars[ cars['Fuel Barrels/Year'] == cars['Fuel Barrels/Year'].min()]

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,Fuel Cost/Year,CO2 Emission Grams/Km,City Km/Liter,Highway Km/Liter,Combined Km/Liter
17395,Honda,Civic Natural Gas,2012,1.8,4.0,Automatic,Front-Wheel Drive,Compact Cars,CNG,0.06,1000,142.104437,11.478857,16.155428,13.179428
17396,Honda,Civic Natural Gas,2013,1.8,4.0,Automatic,Front-Wheel Drive,Compact Cars,CNG,0.06,1000,135.459257,11.478857,16.155428,13.179428
17397,Honda,Civic Natural Gas,2014,1.8,4.0,Automatic,Front-Wheel Drive,Compact Cars,CNG,0.06,1000,135.459257,11.478857,16.155428,13.179428
17398,Honda,Civic Natural Gas,2015,1.8,4.0,Automatic,Front-Wheel Drive,Compact Cars,CNG,0.06,1000,135.459257,11.478857,16.155428,13.179428


Drop the column "Combined MPG"

In [52]:
cars.drop(columns="Combined Km/Liter",inplace=True)

In [None]:
cars.columns

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'Fuel Cost/Year', 'CO2 Emission Grams/Km',
       'City Km/Liter', 'Highway Km/Liter'],
      dtype='object')

In [53]:
# Change column names to these ones:
col_names = ["Brand", "Model", "Year", "Engine_cc", "Cyl", "Trans", "Drivetrain", "Class", "Fuel_type", "Barrels_per_year", "City_MPG", "Highway_MPG", "CO2_grams_per_km", "Fuel_cost_per_year"]

In [61]:
col_names = [ item.replace(" ","_") for item in cars.columns ]
cars.columns = col_names

In [58]:
conversion = {"Make": "Brand", "Model":"Model","Year": "Year", "Engine Displacement": "Engine_cc", 
 "Cylinders":"Cyl", "Transmission":"Trans", "Drivetrain": "Drivetrain", "Vehicle Class":"Class",
 "Fuel Type":"Fuel_Type", "Fuel Barrels/Year": "Barrels_per_year"}

In [59]:
cars.rename(columns=conversion, inplace = True)

In [62]:
cars.columns

Index(['Brand', 'Model', 'Year', 'Engine_cc', 'Cyl', 'Trans', 'Drivetrain',
       'Class', 'Fuel_Type', 'Barrels_per_year', 'Fuel_Cost/Year',
       'CO2_Emission_Grams/Km', 'City_Km/Liter', 'Highway_Km/Liter'],
      dtype='object')

What brand has the most cars?

What brand has the worse CO2 Emissions on average?

Hint: use the function `sort_values()`

In [64]:
cars.sort_values("CO2_Emission_Grams/Km")

Unnamed: 0,Brand,Model,Year,Engine_cc,Cyl,Trans,Drivetrain,Class,Fuel_Type,Barrels_per_year,Fuel_Cost/Year,CO2_Emission_Grams/Km,City_Km/Liter,Highway_Km/Liter
3071,BMW,i3 REX,2016,0.6,2.0,Automatic,Rear-Wheel Drive,Subcompact Cars,Premium Gas or Electricity,1.563190,1050,22.990791,17.430857,15.730285
3069,BMW,i3 REX,2014,0.6,2.0,Automatic,Rear-Wheel Drive,Subcompact Cars,Premium Gas or Electricity,1.563190,1050,24.854909,17.430857,15.730285
3070,BMW,i3 REX,2015,0.6,2.0,Automatic,Rear-Wheel Drive,Subcompact Cars,Premium Gas or Electricity,1.563190,1050,24.854909,17.430857,15.730285
7916,Chevrolet,Volt,2016,1.5,4.0,Automatic,Front-Wheel Drive,Compact Cars,Regular Gas or Electricity,2.006844,800,31.690010,18.281143,17.856000
7917,Chevrolet,Volt,2017,1.5,4.0,Automatic,Front-Wheel Drive,Compact Cars,Regular Gas or Electricity,2.006844,800,31.690010,18.281143,17.856000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20897,Lamborghini,Countach,1989,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429
20894,Lamborghini,Countach,1986,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429
20898,Lamborghini,Countach,1990,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429
20896,Lamborghini,Countach,1988,5.2,12.0,Manual,Rear-Wheel Drive,Two Seaters,Premium,47.087143,5800,788.877073,2.550857,4.251429


In [66]:
cars.groupby("Brand")[["CO2_Emission_Grams/Km"]].mean()

Unnamed: 0_level_0,CO2_Emission_Grams/Km
Brand,Unnamed: 1_level_1
AMG,379.881345
ASC,345.133719
Acura,262.583000
Alfa Romeo,288.287195
American Motors Corporation,314.264744
...,...
Volkswagen,244.038998
Volvo,270.796572
Wallace Environmental,408.857065
Yugo,221.251107


In [67]:
cars.groupby("Brand")[["CO2_Emission_Grams/Km"]].mean().sort_values("CO2_Emission_Grams/Km")

Unnamed: 0_level_0,CO2_Emission_Grams/Km
Brand,Unnamed: 1_level_1
Fisker,105.011992
smart,153.498052
Fiat,189.311494
Daihatsu,192.742404
MINI,194.935105
...,...
Rolls-Royce,475.397772
Dutton,476.419879
Laforza Automobile Inc,502.012683
Bugatti,542.497235


In [68]:
cars.groupby("Brand")[["CO2_Emission_Grams/Km"]].mean().sort_values("CO2_Emission_Grams/Km",ascending=False)

Unnamed: 0_level_0,CO2_Emission_Grams/Km
Brand,Unnamed: 1_level_1
Vector,651.919248
Bugatti,542.497235
Laforza Automobile Inc,502.012683
Dutton,476.419879
Rolls-Royce,475.397772
...,...
MINI,194.935105
Daihatsu,192.742404
Fiat,189.311494
smart,153.498052


Use `pd.cut` or `pd.qcut` to create 4 groups (bins) of cars, by Year. We want to explore how cars have evolved decade by decade.

In [69]:
cars['Year'].describe()

count    35952.00000
mean      2000.71640
std         10.08529
min       1984.00000
25%       1991.00000
50%       2001.00000
75%       2010.00000
max       2017.00000
Name: Year, dtype: float64

In [70]:
cars['Decade'] = pd.cut(cars['Year'],bins=4,labels=['80s','90s','00s','10s'])

In [71]:
cars[['Year','Decade']]

Unnamed: 0,Year,Decade
0,1984,80s
1,1984,80s
2,1985,80s
3,1985,80s
4,1987,80s
...,...,...
35947,2013,10s
35948,2014,10s
35949,2015,10s
35950,2016,10s


In [72]:
cars.loc[:,['Year','Decade']]

Unnamed: 0,Year,Decade
0,1984,80s
1,1984,80s
2,1985,80s
3,1985,80s
4,1987,80s
...,...,...
35947,2013,10s
35948,2014,10s
35949,2015,10s
35950,2016,10s


In [73]:
cars["Year_range"]= pd.cut(cars["Year"], 
                             bins = [1980,1989,1999,2009,2019],
                             labels=["80s", "90s", "00s", "10s"])

cars.loc[:,['Year','Decade','Year_range']]

Unnamed: 0,Year,Decade,Year_range
0,1984,80s,80s
1,1984,80s,80s
2,1985,80s,80s
3,1985,80s,80s
4,1987,80s,80s
...,...,...,...
35947,2013,10s,10s
35948,2014,10s,10s
35949,2015,10s,10s
35950,2016,10s,10s


Did cars consume more gas in the eighties?

In [75]:
cars.columns

Index(['Brand', 'Model', 'Year', 'Engine_cc', 'Cyl', 'Trans', 'Drivetrain',
       'Class', 'Fuel_Type', 'Barrels_per_year', 'Fuel_Cost/Year',
       'CO2_Emission_Grams/Km', 'City_Km/Liter', 'Highway_Km/Liter', 'Decade',
       'Year_range'],
      dtype='object')

In [76]:
cars.groupby(["Year_range"])[['Year_range',"City_Km/Liter"]].mean("City_Km/Liter")  

Unnamed: 0_level_0,City_Km/Liter
Year_range,Unnamed: 1_level_1
80s,7.366539
90s,7.207911
00s,7.15694
10s,8.310167


Which brands are more environment friendly?

In [77]:
cars.groupby(["Decade","Brand"])[["CO2_Emission_Grams/Km"]].mean("CO2_Emission_Grams/Km")

Unnamed: 0_level_0,Unnamed: 1_level_0,CO2_Emission_Grams/Km
Decade,Brand,Unnamed: 2_level_1
80s,AMG,379.881345
80s,ASC,345.133719
80s,Acura,268.497682
80s,Alfa Romeo,286.715163
80s,American Motors Corporation,314.264744
...,...,...
10s,Volkswagen,219.440984
10s,Volvo,250.429309
10s,Wallace Environmental,
10s,Yugo,


Does the drivetrain affect fuel consumption?

In [78]:
# We can also sort by 2 columns 
# (the second column only matters in case there's a tie sorting by the first one)
cars.groupby("Drivetrain")[["Highway_Km/Liter","City_Km/Liter"]].mean().sort_values("City_Km/Liter",ascending=False)

Unnamed: 0_level_0,Highway_Km/Liter,City_Km/Liter
Drivetrain,Unnamed: 1_level_1,Unnamed: 2_level_1
"2-Wheel Drive, Front",14.029714,10.628571
Front-Wheel Drive,12.16621,9.002214
All-Wheel Drive,10.882531,7.785598
4-Wheel Drive,9.668584,7.190861
2-Wheel Drive,8.222444,6.64248
Rear-Wheel Drive,9.023946,6.556574
4-Wheel or All-Wheel Drive,8.34713,6.392049
Part-time 4-Wheel Drive,8.115385,6.215696


Do cars with automatic transmission consume more fuel than cars with manual transmission?

In [79]:
cars.columns

Index(['Brand', 'Model', 'Year', 'Engine_cc', 'Cyl', 'Trans', 'Drivetrain',
       'Class', 'Fuel_Type', 'Barrels_per_year', 'Fuel_Cost/Year',
       'CO2_Emission_Grams/Km', 'City_Km/Liter', 'Highway_Km/Liter', 'Decade',
       'Year_range'],
      dtype='object')

In [80]:
cars.groupby("Trans")[["City_Km/Liter"]].mean().sort_values("City_Km/Liter",ascending=False)

Unnamed: 0_level_0,City_Km/Liter
Trans,Unnamed: 1_level_1
Manual,7.968348
Automatic,7.278292


Use `groupby` and `aggregate` with different aggregation measures for different columns:

In [82]:
cars.groupby("Trans").aggregate({"City_Km/Liter":"mean","Trans":"count"})

Unnamed: 0_level_0,City_Km/Liter,Trans
Trans,Unnamed: 1_level_1,Unnamed: 2_level_1
Automatic,7.278292,24290
Manual,7.968348,11662


In [101]:
cars.groupby("Trans").aggregate({"City_Km/Liter":"mean","City_Km/Liter":"min"})

Unnamed: 0_level_0,City_Km/Liter
Trans,Unnamed: 1_level_1
Automatic,2.976
Manual,2.550857


We want to use "Drivetrain" in a statistical model. Convert the column to numeric using `get_dummies()`.

In [83]:
cars['Drivetrain'].unique()

array(['2-Wheel Drive', 'Rear-Wheel Drive', 'Front-Wheel Drive',
       '4-Wheel or All-Wheel Drive', 'All-Wheel Drive', '4-Wheel Drive',
       'Part-time 4-Wheel Drive', '2-Wheel Drive, Front'], dtype=object)

In [84]:
cars_drive = pd.get_dummies(cars['Drivetrain'])
cars_drive

Unnamed: 0,2-Wheel Drive,"2-Wheel Drive, Front",4-Wheel Drive,4-Wheel or All-Wheel Drive,All-Wheel Drive,Front-Wheel Drive,Part-time 4-Wheel Drive,Rear-Wheel Drive
0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
35947,0,0,0,0,0,0,0,1
35948,0,0,0,0,0,0,0,1
35949,0,0,0,0,0,0,0,1
35950,0,0,0,0,0,0,0,1


Use `pd.concat` to join the dummy columns with the rest of the columns.

In [85]:
pd.concat([cars,cars_drive], axis=0)

Unnamed: 0,Brand,Model,Year,Engine_cc,Cyl,Trans,Drivetrain,Class,Fuel_Type,Barrels_per_year,Fuel_Cost/Year,CO2_Emission_Grams/Km,City_Km/Liter,Highway_Km/Liter,Decade,Year_range,2-Wheel Drive,"2-Wheel Drive, Front",4-Wheel Drive,4-Wheel or All-Wheel Drive,All-Wheel Drive,Front-Wheel Drive,Part-time 4-Wheel Drive,Rear-Wheel Drive
0,AMG,DJ Po Vehicle 2WD,1984.0,2.5,4.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,1950.0,324.831736,7.652571,7.227428,80s,80s,,,,,,,,
1,AMG,FJ8c Post Office,1984.0,4.2,6.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550.0,424.779962,5.526857,5.526857,80s,80s,,,,,,,,
2,AMG,Post Office DJ5 2WD,1985.0,2.5,4.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,2100.0,345.133719,6.802286,7.227428,80s,80s,,,,,,,,
3,AMG,Post Office DJ8 2WD,1985.0,4.2,6.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550.0,424.779962,5.526857,5.526857,80s,80s,,,,,,,,
4,ASC,GNX,1987.0,3.8,6.0,Automatic,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,2550.0,345.133719,5.952000,8.928000,80s,80s,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
35948,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
35949,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
35950,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [86]:
pd.concat([cars,cars_drive], axis=1)

Unnamed: 0,Brand,Model,Year,Engine_cc,Cyl,Trans,Drivetrain,Class,Fuel_Type,Barrels_per_year,Fuel_Cost/Year,CO2_Emission_Grams/Km,City_Km/Liter,Highway_Km/Liter,Decade,Year_range,2-Wheel Drive,"2-Wheel Drive, Front",4-Wheel Drive,4-Wheel or All-Wheel Drive,All-Wheel Drive,Front-Wheel Drive,Part-time 4-Wheel Drive,Rear-Wheel Drive
0,AMG,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,1950,324.831736,7.652571,7.227428,80s,80s,1,0,0,0,0,0,0,0
1,AMG,FJ8c Post Office,1984,4.2,6.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550,424.779962,5.526857,5.526857,80s,80s,1,0,0,0,0,0,0,0
2,AMG,Post Office DJ5 2WD,1985,2.5,4.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,2100,345.133719,6.802286,7.227428,80s,80s,0,0,0,0,0,0,0,1
3,AMG,Post Office DJ8 2WD,1985,4.2,6.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550,424.779962,5.526857,5.526857,80s,80s,0,0,0,0,0,0,0,1
4,ASC,GNX,1987,3.8,6.0,Automatic,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,2550,345.133719,5.952000,8.928000,80s,80s,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428,10s,10s,0,0,0,0,0,0,0,1
35948,smart,fortwo coupe,2014,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,150.993575,14.454857,16.155428,10s,10s,0,0,0,0,0,0,0,1
35949,smart,fortwo coupe,2015,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428,10s,10s,0,0,0,0,0,0,0,1
35950,smart,fortwo coupe,2016,0.9,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,152.857693,14.454857,16.580571,10s,10s,0,0,0,0,0,0,0,1


### Joins

A simple example with a small dataset.

In [87]:
people = pd.read_csv("/content/drive/MyDrive/OLD_CURR/UNIT2/DAY2/data/people.csv")

In [88]:
people.head()

Unnamed: 0,name,age,country
0,Pol,22,ES
1,Javi,20,ES
2,Maria,23,AR
3,Anna,24,FR
4,Anna,24,UK


In [89]:
people.rename(columns={"country":"country_code"}, inplace=True)
people

Unnamed: 0,name,age,country_code
0,Pol,22,ES
1,Javi,20,ES
2,Maria,23,AR
3,Anna,24,FR
4,Anna,24,UK
5,Javi,30,MA
6,Dog,2,XX


In [90]:
countries = pd.read_csv("/content/drive/MyDrive/OLD_CURR/UNIT2/DAY2/data/country_info.csv")
countries

Unnamed: 0,country,country_name,pop
0,ES,Spain,46
1,PT,Portugal,11
2,IT,Italy,61
3,AR,Argentina,43
4,FR,France,63
5,UK,United Kingdom,65
6,MA,Malta,2
7,GE,Germany,82


In [91]:
people.merge(right=countries, 
             how="left", 
             left_on="country_code",
             right_on="country")

Unnamed: 0,name,age,country_code,country,country_name,pop
0,Pol,22,ES,ES,Spain,46.0
1,Javi,20,ES,ES,Spain,46.0
2,Maria,23,AR,AR,Argentina,43.0
3,Anna,24,FR,FR,France,63.0
4,Anna,24,UK,UK,United Kingdom,65.0
5,Javi,30,MA,MA,Malta,2.0
6,Dog,2,XX,,,


#### Joining cars & car_brands

Join the cars dataframe with the car brands dataframe.

In [92]:
car_brands = pd.read_csv("/content/drive/MyDrive/OLD_CURR/UNIT2/DAY2/data/car_brands.csv")
car_brands

Unnamed: 0,brand,revenue,production
0,AM General,1537,1.002916
1,ASC Incorporated,232,1.628105
2,Acura,234,3.394481
3,Alfa Romeo,1174,2.313726
4,American Motors Corporation,1230,1.231024
...,...,...,...
122,Volkswagen,273,1.033316
123,Volvo,1312,0.057454
124,Wallace Environmental,277,5.744609
125,Yugo,508,0.520953


A "left" join and an "inner" join produce the same output, because the brands are the same in both tables:

In [93]:
cars.columns

Index(['Brand', 'Model', 'Year', 'Engine_cc', 'Cyl', 'Trans', 'Drivetrain',
       'Class', 'Fuel_Type', 'Barrels_per_year', 'Fuel_Cost/Year',
       'CO2_Emission_Grams/Km', 'City_Km/Liter', 'Highway_Km/Liter', 'Decade',
       'Year_range'],
      dtype='object')

In [94]:
cars['Brand'].unique()

array(['AMG', 'ASC', 'Acura', 'Alfa Romeo', 'American Motors Corporation',
       'Aston Martin', 'Audi', 'Aurora Cars Ltd', 'Autokraft Limited',
       'BMW', 'Bentley', 'Bertone', 'Bill Dovell Motor Car Company',
       'Bitter Gmbh and Co. Kg', 'Bugatti', 'Buick', 'CCC Engineering',
       'CX Automotive', 'Cadillac', 'Chevrolet', 'Chrysler',
       'Consulier Industries Inc', 'Dabryan Coach Builders Inc', 'Dacia',
       'Daewoo', 'Daihatsu', 'Dodge', 'Dutton', 'Eagle',
       'Environmental Rsch and Devp Corp', 'Evans Automobiles',
       'Excalibur Autos', 'Federal Coach', 'Ferrari', 'Fiat', 'Fisker',
       'Ford', 'GMC', 'General Motors', 'Genesis', 'Geo', 'Goldacre',
       'Grumman', 'Honda', 'Hummer', 'Hyundai',
       'Import Foreign Auto Sales Inc', 'Import Trade Services',
       'Infiniti', 'Isis Imports Ltd', 'Isuzu', 'J.K. Motors',
       'JBA Motorcars, Inc.', 'Jaguar', 'Jeep', 'Kia',
       'Laforza Automobile Inc', 'Lambda Control Systems', 'Lamborghini',
       'La

In [95]:
set(cars.Brand.unique()) == set(car_brands.brand.unique())

False

In [96]:
cars.merge(right=car_brands, how="left", left_on="Brand", right_on="brand")

Unnamed: 0,Brand,Model,Year,Engine_cc,Cyl,Trans,Drivetrain,Class,Fuel_Type,Barrels_per_year,Fuel_Cost/Year,CO2_Emission_Grams/Km,City_Km/Liter,Highway_Km/Liter,Decade,Year_range,brand,revenue,production
0,AMG,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,1950,324.831736,7.652571,7.227428,80s,80s,,,
1,AMG,FJ8c Post Office,1984,4.2,6.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550,424.779962,5.526857,5.526857,80s,80s,,,
2,AMG,Post Office DJ5 2WD,1985,2.5,4.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,2100,345.133719,6.802286,7.227428,80s,80s,,,
3,AMG,Post Office DJ8 2WD,1985,4.2,6.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550,424.779962,5.526857,5.526857,80s,80s,,,
4,ASC,GNX,1987,3.8,6.0,Automatic,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,2550,345.133719,5.952000,8.928000,80s,80s,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428,10s,10s,smart,447.0,2.229253
35948,smart,fortwo coupe,2014,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,150.993575,14.454857,16.155428,10s,10s,smart,447.0,2.229253
35949,smart,fortwo coupe,2015,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428,10s,10s,smart,447.0,2.229253
35950,smart,fortwo coupe,2016,0.9,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,152.857693,14.454857,16.580571,10s,10s,smart,447.0,2.229253


In [97]:
cars.merge(right=car_brands, how="inner", left_on="Brand", right_on="brand")

Unnamed: 0,Brand,Model,Year,Engine_cc,Cyl,Trans,Drivetrain,Class,Fuel_Type,Barrels_per_year,Fuel_Cost/Year,CO2_Emission_Grams/Km,City_Km/Liter,Highway_Km/Liter,Decade,Year_range,brand,revenue,production
0,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,1500,251.006341,8.502857,11.053714,90s,90s,Acura,234,3.394481
1,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,1400,230.089146,9.353143,11.904000,90s,90s,Acura,234,3.394481
2,Acura,2.2CL/3.0CL,1997,3.0,6.0,Automatic,Front-Wheel Drive,Subcompact Cars,Regular,16.480500,1650,276.106976,7.652571,11.053714,90s,90s,Acura,234,3.394481
3,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,1500,251.006341,8.077714,11.478857,90s,90s,Acura,234,3.394481
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,1400,230.089146,8.928000,12.329143,90s,90s,Acura,234,3.394481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35934,smart,fortwo coupe,2013,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428,10s,10s,smart,447,2.229253
35935,smart,fortwo coupe,2014,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,150.993575,14.454857,16.155428,10s,10s,smart,447,2.229253
35936,smart,fortwo coupe,2015,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428,10s,10s,smart,447,2.229253
35937,smart,fortwo coupe,2016,0.9,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,152.857693,14.454857,16.580571,10s,10s,smart,447,2.229253


But if we remove a brand from car_brands, an "inner" join outputs fewer rows:

In [98]:
new_car_brands = car_brands[car_brands['brand'] != "smart"]
new_car_brands

Unnamed: 0,brand,revenue,production
0,AM General,1537,1.002916
1,ASC Incorporated,232,1.628105
2,Acura,234,3.394481
3,Alfa Romeo,1174,2.313726
4,American Motors Corporation,1230,1.231024
...,...,...,...
121,Volga Associated Automobile,1965,0.043377
122,Volkswagen,273,1.033316
123,Volvo,1312,0.057454
124,Wallace Environmental,277,5.744609


In [99]:
cars.merge(right=new_car_brands, how="inner", left_on="Brand", right_on="brand")

Unnamed: 0,Brand,Model,Year,Engine_cc,Cyl,Trans,Drivetrain,Class,Fuel_Type,Barrels_per_year,Fuel_Cost/Year,CO2_Emission_Grams/Km,City_Km/Liter,Highway_Km/Liter,Decade,Year_range,brand,revenue,production
0,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,1500,251.006341,8.502857,11.053714,90s,90s,Acura,234,3.394481
1,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,1400,230.089146,9.353143,11.904000,90s,90s,Acura,234,3.394481
2,Acura,2.2CL/3.0CL,1997,3.0,6.0,Automatic,Front-Wheel Drive,Subcompact Cars,Regular,16.480500,1650,276.106976,7.652571,11.053714,90s,90s,Acura,234,3.394481
3,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,1500,251.006341,8.077714,11.478857,90s,90s,Acura,234,3.394481
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,1400,230.089146,8.928000,12.329143,90s,90s,Acura,234,3.394481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35914,Yugo,GV Plus/GV/Cabrio,1990,1.3,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,13.184400,1350,220.885580,9.778286,11.904000,80s,90s,Yugo,508,0.520953
35915,Yugo,GV/GVX,1987,1.1,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,12.677308,1300,212.389981,10.203428,12.329143,80s,80s,Yugo,508,0.520953
35916,Yugo,GV/GVX,1989,1.1,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,12.677308,1300,212.389981,10.203428,12.329143,80s,80s,Yugo,508,0.520953
35917,Yugo,GV/GVX,1989,1.3,4.0,Manual,Front-Wheel Drive,Subcompact Cars,Regular,13.184400,1350,220.885580,9.778286,11.904000,80s,80s,Yugo,508,0.520953
