In [1]:
import os

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Data Preparation Basics

## Filtering and selecting data

In [2]:
# Create a 1-D array of the consecutive integers 0 through 7 (the stop value 8 is excluded)
np.arange(8)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [3]:
# Seed NumPy's global random generator so the random draws in the cells below are repeatable
np.random.seed(25)

# Generate an array of 36 random floats
np.random.rand(36)

array([0.87012414, 0.58227693, 0.27883894, 0.18591123, 0.41110013,
       0.11737555, 0.68496874, 0.43761106, 0.55622933, 0.36708032,
       0.40236573, 0.1130407 , 0.44703085, 0.58544512, 0.1619851 ,
       0.52071879, 0.32605113, 0.69918624, 0.36639455, 0.83637451,
       0.48134294, 0.5165023 , 0.38304813, 0.9975409 , 0.51424449,
       0.55905327, 0.03444977, 0.71993003, 0.42100355, 0.43693513,
       0.28170075, 0.90027434, 0.66961228, 0.45606875, 0.28980434,
       0.52581896])

In [4]:
# Generate 36 new random floats and reshape the flat array into a 6x6 matrix (6 rows, 6 columns)
np.random.rand(36).reshape(6,6)

array([[0.55924206, 0.74528383, 0.82834625, 0.82369445, 0.07714032,
        0.64486207],
       [0.30925759, 0.52425372, 0.95809234, 0.88320096, 0.29543189,
        0.51237599],
       [0.08870242, 0.64171673, 0.13242055, 0.76648581, 0.07674224,
        0.33104382],
       [0.67985159, 0.50921308, 0.65514614, 0.60212036, 0.71905456,
        0.41521924],
       [0.39654163, 0.82513896, 0.71255175, 0.09793715, 0.84215426,
        0.44082109],
       [0.37398924, 0.91367582, 0.54777802, 0.25193726, 0.02747394,
        0.2062573 ]])

In [5]:
# Generate 36 new random floats and reshape the flat array into a 4x9 matrix (4 rows, 9 columns)
np.random.rand(36).reshape(4,9)

array([[0.59088536, 0.16365153, 0.83692816, 0.77520317, 0.16904146,
        0.76699433, 0.33536585, 0.47239795, 0.21506437],
       [0.912094  , 0.75920765, 0.67656136, 0.02137628, 0.66087433,
        0.09443959, 0.83116257, 0.11274904, 0.56682961],
       [0.17462609, 0.79060663, 0.0336828 , 0.79597118, 0.68943727,
        0.49184566, 0.08855377, 0.93755026, 0.08436223],
       [0.46939367, 0.80561265, 0.08564645, 0.24437952, 0.89280559,
        0.47861108, 0.19040068, 0.25304423, 0.76233925]])

In [6]:
# Build a Series holding the integers 0 through 7, labelling the values 'row1' ... 'row8'
series_obj = pd.Series(
    data=np.arange(8),
    index=[f'row{n}' for n in range(1, 9)],
)
series_obj

row1    0
row2    1
row3    2
row4    3
row5    4
row6    5
row7    6
row8    7
dtype: int32

In [7]:
# Retrieving Series data using its labelled index
# Returns the value stored under the 'row5' label
series_obj['row5']

4

In [8]:
# Retrieving Series data by integer position
# .iloc always selects by position, whatever the index labels are. Plain series_obj[[1, 5, 7]]
# on a label-indexed Series treats the integers as positions only through a deprecated
# fallback (it emits a FutureWarning), so .iloc is used instead.
# Retrieving data at integer positions 1, 5, 7
series_obj.iloc[[1, 5, 7]]

  series_obj[[1, 5, 7]]


row2    1
row6    5
row8    7
dtype: int32

In [9]:
# Build a 6x6 DataFrame of random floats; 'index=' supplies the row labels
# and 'columns=' supplies the column labels
DF_obj = pd.DataFrame(
    data=np.random.rand(36).reshape(6, 6),
    index=[f'row{r}' for r in range(1, 7)],
    columns=[f'col{c}' for c in range(1, 7)],
)
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.95854,0.762318,0.320007,0.791581,0.489521,0.296351
row2,0.601961,0.221951,0.386073,0.006598,0.358175,0.196714
row3,0.409622,0.109562,0.085703,0.747596,0.18301,0.015087
row4,0.913751,0.925941,0.58287,0.31262,0.998268,0.150014
row5,0.994481,0.124964,0.351055,0.531058,0.323604,0.806417
row6,0.960319,0.539474,0.499325,0.801124,0.797099,0.119866


In [10]:
# Rebuild DF_obj as a 4x9 DataFrame of random floats (4 labelled rows, 9 labelled columns)
DF_obj = pd.DataFrame(
    data=np.random.rand(36).reshape(4, 9),
    index=[f'row{r}' for r in range(1, 5)],
    columns=[f'col{c}' for c in range(1, 10)],
)
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9
row1,0.245986,0.626783,0.37725,0.063457,0.373199,0.074116,0.044442,0.293012,0.754543
row2,0.953247,0.588749,0.98794,0.198074,0.042969,0.383704,0.780941,0.763117,0.864242
row3,0.53724,0.526677,0.815957,0.129121,0.72028,0.066831,0.015341,0.546204,0.092189
row4,0.171704,0.333604,0.116719,0.562587,0.410931,0.285842,0.398147,0.427212,0.885826


In [11]:
# Rebuild DF_obj as a 9x4 DataFrame of random floats (9 labelled rows, 4 labelled columns)
DF_obj = pd.DataFrame(
    data=np.random.rand(36).reshape(9, 4),
    index=[f'row{r}' for r in range(1, 10)],
    columns=[f'col{c}' for c in range(1, 5)],
)
DF_obj

Unnamed: 0,col1,col2,col3,col4
row1,0.437051,0.61706,0.055321,0.928544
row2,0.757259,0.975881,0.251659,0.968149
row3,0.139969,0.711813,0.069177,0.605039
row4,0.059525,0.362887,0.258474,0.026127
row5,0.73258,0.451459,0.86319,0.529924
row6,0.848966,0.319436,0.319598,0.260322
row7,0.424988,0.892367,0.479706,0.348571
row8,0.682153,0.162011,0.948836,0.470992
row9,0.819737,0.070855,0.106414,0.507698


In [12]:
# Retrieving a single value from the DataFrame with .loc[row_label, column_label]
DF_obj.loc['row5', 'col2']

0.45145854848872025

In [13]:
# Retrieving several rows and columns at once by passing lists of labels to .loc
DF_obj.loc[['row4', 'row5', 'row6'], ['col2', 'col3']]

Unnamed: 0,col2,col3
row4,0.362887,0.258474
row5,0.451459,0.86319
row6,0.319436,0.319598


In [14]:
# Data slicing in a Series by label
# A label slice includes its end label, so 'row7' is part of the result
series_obj['row3' : 'row7']

row3    2
row4    3
row5    4
row6    5
row7    6
dtype: int32

In [15]:
# Data slicing in a DataFrame
# A label slice inside plain [] selects rows; both 'row4' and 'row7' are included
DF_obj['row4' : 'row7']

Unnamed: 0,col1,col2,col3,col4
row4,0.059525,0.362887,0.258474,0.026127
row5,0.73258,0.451459,0.86319,0.529924
row6,0.848966,0.319436,0.319598,0.260322
row7,0.424988,0.892367,0.479706,0.348571


In [16]:
# Retrieving data using .loc with label slices on both axes:
# rows 'row4' through 'row7' and columns 'col2' through 'col3'
DF_obj.loc['row4':'row7', 'col2':'col3']

Unnamed: 0,col2,col3
row4,0.362887,0.258474
row5,0.451459,0.86319
row6,0.319436,0.319598
row7,0.892367,0.479706


In [17]:
# Comparing a Series with a scalar value
# Returns a Boolean Series: True where the value in series_obj is less than 4, False otherwise
series_obj < 4

row1     True
row2     True
row3     True
row4     True
row5    False
row6    False
row7    False
row8    False
dtype: bool

In [18]:
# Comparing a DataFrame with a scalar value
# Returns a Boolean DataFrame: True where the value in DF_obj is less than .2, False otherwise
DF_obj < .2

Unnamed: 0,col1,col2,col3,col4
row1,False,False,True,False
row2,False,False,False,False
row3,True,False,True,False
row4,True,False,False,True
row5,False,False,False,False
row6,False,False,False,False
row7,False,False,False,False
row8,False,True,False,False
row9,False,True,True,False


In [19]:
# Filtering a Series with a scalar comparison
# The Boolean mask keeps only the records of series_obj that are less than 5
series_obj[series_obj < 5]

row1    0
row2    1
row3    2
row4    3
row5    4
dtype: int32

In [20]:
# Filtering a DataFrame with a scalar comparison
# The result keeps the shape of DF_obj: values less than .2 are kept, every other cell becomes NaN
DF_obj[DF_obj < .2]

Unnamed: 0,col1,col2,col3,col4
row1,,,0.055321,
row2,,,,
row3,0.139969,,0.069177,
row4,0.059525,,,0.026127
row5,,,,
row6,,,,
row7,,,,
row8,,0.162011,,
row9,,0.070855,0.106414,


In [21]:
# Setting a value in a Series with a scalar: the entry labelled 'row5' is overwritten with 10
series_obj['row5'] = 10
series_obj

row1     0
row2     1
row3     2
row4     3
row5    10
row6     5
row7     6
row8     7
dtype: int32

In [22]:
# Rebuild DF_obj as a fresh 6x6 DataFrame of random floats for the value-setting examples
DF_obj = pd.DataFrame(
    data=np.random.rand(36).reshape(6, 6),
    index=[f'row{r}' for r in range(1, 7)],
    columns=[f'col{c}' for c in range(1, 7)],
)
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.431639,0.393481,0.463856,0.488138,0.317365,0.53702
row2,0.760302,0.866141,0.798727,0.028147,0.525692,0.6706
row3,0.375215,0.075203,0.669065,0.59033,0.73463,0.366946
row4,0.00753,0.2893,0.402361,0.628927,0.450503,0.453651
row5,0.0201,0.962513,0.686381,0.461716,0.256673,0.421346
row6,0.419191,0.551406,0.658048,0.278344,0.246718,0.648144


In [23]:
# Setting a single DataFrame cell with a scalar: the value at row 'row2', column 'col3' becomes 15
DF_obj.loc['row2', 'col3'] = 15
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.431639,0.393481,0.463856,0.488138,0.317365,0.53702
row2,0.760302,0.866141,15.0,0.028147,0.525692,0.6706
row3,0.375215,0.075203,0.669065,0.59033,0.73463,0.366946
row4,0.00753,0.2893,0.402361,0.628927,0.450503,0.453651
row5,0.0201,0.962513,0.686381,0.461716,0.256673,0.421346
row6,0.419191,0.551406,0.658048,0.278344,0.246718,0.648144


In [24]:
# Setting a block of DataFrame cells with a scalar using label slices:
# every cell in rows 'row4'-'row6' and columns 'col5'-'col6' becomes 25
DF_obj.loc['row4': 'row6', 'col5': 'col6'] = 25
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.431639,0.393481,0.463856,0.488138,0.317365,0.53702
row2,0.760302,0.866141,15.0,0.028147,0.525692,0.6706
row3,0.375215,0.075203,0.669065,0.59033,0.73463,0.366946
row4,0.00753,0.2893,0.402361,0.628927,25.0,25.0
row5,0.0201,0.962513,0.686381,0.461716,25.0,25.0
row6,0.419191,0.551406,0.658048,0.278344,25.0,25.0


## Treating missing values

In [25]:
# np.nan is the NaN value from the numpy library; it is used here as the missing-value marker
missing = np.nan
# Build a Series of strings in which the entries at positions 2, 5 and 7 are missing
series_obj = pd.Series(
    data=['row1', 'row2', missing, 'row4', 'row5', missing, 'row7', missing],
)
series_obj

0    row1
1    row2
2     NaN
3    row4
4    row5
5     NaN
6    row7
7     NaN
dtype: object

In [26]:
# isnull() returns a Boolean Series: True at every index whose value is missing, False elsewhere
series_obj.isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [27]:
# Rebuild DF_obj as a fresh 6x6 DataFrame of random floats for the missing-value examples
DF_obj = pd.DataFrame(
    data=np.random.rand(36).reshape(6, 6),
    index=[f'row{r}' for r in range(1, 7)],
    columns=[f'col{c}' for c in range(1, 7)],
)
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786
row2,0.060813,0.965365,0.647643,0.678877,0.355455,0.164828
row3,0.040824,0.546855,0.996076,0.512859,0.646614,0.576211
row4,0.941954,0.582703,0.484153,0.408028,0.136195,0.158547
row5,0.82983,0.177429,0.243227,0.205815,0.799864,0.464478
row6,0.02143,0.274033,0.127639,0.597566,0.419808,0.264839


In [28]:
# Introducing missing values into the DataFrame
# Rows 'row4'-'row6' of 'col2' and rows 'row2'-'row5' of 'col5' are overwritten with NaN
DF_obj.loc['row4':'row6', 'col2'] = missing
DF_obj.loc['row2':'row5', 'col5'] = missing
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786
row2,0.060813,0.965365,0.647643,0.678877,,0.164828
row3,0.040824,0.546855,0.996076,0.512859,,0.576211
row4,0.941954,,0.484153,0.408028,,0.158547
row5,0.82983,,0.243227,0.205815,,0.464478
row6,0.02143,,0.127639,0.597566,0.419808,0.264839


In [29]:
# isnull() on a DataFrame returns a Boolean DataFrame: True for every missing cell, False otherwise
DF_obj.isnull()

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,False,False,False,False,False,False
row2,False,False,False,False,True,False
row3,False,False,False,False,True,False
row4,False,True,False,False,True,False
row5,False,True,False,False,True,False
row6,False,True,False,False,False,False


In [30]:
# Using fillna() to replace the missing values in a Series with a chosen value
# fillna() returns the filled result, so assign it to the same or a different Series name
filledSeries = series_obj.fillna('rowNA')
filledSeries

0     row1
1     row2
2    rowNA
3     row4
4     row5
5    rowNA
6     row7
7    rowNA
dtype: object

In [31]:
# Using fillna() to replace every missing value in a DataFrame with one chosen value (0 here)
# fillna() returns the filled result, so assign it to the same or a different DataFrame name
filledDF = DF_obj.fillna(0)
filledDF

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786
row2,0.060813,0.965365,0.647643,0.678877,0.0,0.164828
row3,0.040824,0.546855,0.996076,0.512859,0.0,0.576211
row4,0.941954,0.0,0.484153,0.408028,0.0,0.158547
row5,0.82983,0.0,0.243227,0.205815,0.0,0.464478
row6,0.02143,0.0,0.127639,0.597566,0.419808,0.264839


In [32]:
# Filling missing values per column by passing a dictionary {key: value} to fillna()
# Each key is a column label; NaN in that column is replaced with the paired value
# fillna() returns the filled result, so assign it to the same or a different DataFrame name
filledDF = DF_obj.fillna({'col2': 0.3, 'col5': 1.45})
filledDF

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786
row2,0.060813,0.965365,0.647643,0.678877,1.45,0.164828
row3,0.040824,0.546855,0.996076,0.512859,1.45,0.576211
row4,0.941954,0.3,0.484153,0.408028,1.45,0.158547
row5,0.82983,0.3,0.243227,0.205815,1.45,0.464478
row6,0.02143,0.3,0.127639,0.597566,0.419808,0.264839


In [33]:
# ffill() fills each missing value forward with the last non-null value above it in the same column.
# It replaces fillna(method='ffill'): the 'method' argument of fillna() is deprecated
# and emits a FutureWarning.
# ffill() returns the filled result, so assign it to the same or a different DataFrame name
filledDF = DF_obj.ffill()
filledDF

  filledDF = DF_obj.fillna(method='ffill')


Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786
row2,0.060813,0.965365,0.647643,0.678877,0.738945,0.164828
row3,0.040824,0.546855,0.996076,0.512859,0.738945,0.576211
row4,0.941954,0.546855,0.484153,0.408028,0.738945,0.158547
row5,0.82983,0.546855,0.243227,0.205815,0.738945,0.464478
row6,0.02143,0.546855,0.127639,0.597566,0.419808,0.264839


In [34]:
# Display DF_obj again: the fill calls above returned new objects, so its NaN values are still present
DF_obj

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786
row2,0.060813,0.965365,0.647643,0.678877,,0.164828
row3,0.040824,0.546855,0.996076,0.512859,,0.576211
row4,0.941954,,0.484153,0.408028,,0.158547
row5,0.82983,,0.243227,0.205815,,0.464478
row6,0.02143,,0.127639,0.597566,0.419808,0.264839


In [35]:
# isnull() marks the missing cells and sum() adds the marks up column by column,
# giving the number of null values in each column
DF_obj.isnull().sum()

col1    0
col2    3
col3    0
col4    0
col5    4
col6    0
dtype: int64

In [36]:
# dropna() returns a result without any row that contains a null value; DF_obj itself is not modified
DF_obj.dropna()

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,0.033211,0.880092,0.401461,0.822217,0.738945,0.748786


In [37]:
# dropna(axis=1) returns a result without any column that contains a null value
DF_obj.dropna(axis=1)

Unnamed: 0,col1,col3,col4,col6
row1,0.033211,0.401461,0.822217,0.748786
row2,0.060813,0.647643,0.678877,0.164828
row3,0.040824,0.996076,0.512859,0.576211
row4,0.941954,0.484153,0.408028,0.158547
row5,0.82983,0.243227,0.205815,0.464478
row6,0.02143,0.127639,0.597566,0.264839


## Removing duplicates

In [38]:
# Build a DataFrame from a dictionary: each key is a column label, each list holds that column's values.
# Several rows are deliberately identical so they can be detected as duplicates below.
DF_obj = pd.DataFrame(
    data={
        'column 1': [1, 1, 2, 2, 3, 3, 3],
        'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
        'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'C'],
    }
)
DF_obj

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
1,1,a,A
2,2,b,B
3,2,b,B
4,3,c,C
5,3,c,C
6,3,c,C


In [39]:
# duplicated() checks each row of the DataFrame and returns True when the row repeats
# a row found earlier in the DataFrame, False otherwise
DF_obj.duplicated()

0    False
1     True
2    False
3     True
4    False
5     True
6     True
dtype: bool

In [40]:
# Drop the duplicate rows from the DataFrame, keeping the first occurrence of each distinct row
DF_obj.drop_duplicates()

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
2,2,b,B
4,3,c,C


In [41]:
# Rebuild the DataFrame with one changed value: row 5 now holds 'D' in 'column 3',
# so that row is no longer a full duplicate of row 4
DF_obj = pd.DataFrame(
    data={
        'column 1': [1, 1, 2, 2, 3, 3, 3],
        'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
        'column 3': ['A', 'A', 'B', 'B', 'C', 'D', 'C'],
    }
)
DF_obj

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
1,1,a,A
2,2,b,B
3,2,b,B
4,3,c,C
5,3,c,D
6,3,c,C


In [42]:
# Drop rows whose 'column 3' value was already seen in an earlier row;
# only that column is compared when deciding what counts as a duplicate
DF_obj.drop_duplicates(subset=['column 3'])

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
2,2,b,B
4,3,c,C
5,3,c,D


## Concatenating and transforming data

### Concatenating data

In [43]:
# Build a 6x6 DataFrame holding the integers 0 through 35, with default integer row and column labels
DF_obj1 = pd.DataFrame(data=np.arange(36).reshape((6, 6)))
DF_obj1

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [44]:
# Build a 3x4 DataFrame holding the integers 0 through 11, with default integer row and column labels
DF_obj2 = pd.DataFrame(data=np.arange(12).reshape((3, 4)))
DF_obj2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [45]:
# Joining 2 DataFrames side by side (horizontally) with pd.concat(axis=1); rows are matched on their index values
# Index values 3-5 exist only in DF_obj1, so the columns coming from DF_obj2 hold NaN in those rows
pd.concat([DF_obj1, DF_obj2], axis=1)

Unnamed: 0,0,1,2,3,4,5,0.1,1.1,2.1,3.1
0,0,1,2,3,4,5,0.0,1.0,2.0,3.0
1,6,7,8,9,10,11,4.0,5.0,6.0,7.0
2,12,13,14,15,16,17,8.0,9.0,10.0,11.0
3,18,19,20,21,22,23,,,,
4,24,25,26,27,28,29,,,,
5,30,31,32,33,34,35,,,,


In [46]:
# Stacking 2 DataFrames on top of each other (vertically) with pd.concat(); columns are matched on their labels
# Columns 4 and 5 exist only in DF_obj1, so the rows coming from DF_obj2 hold NaN in those columns
pd.concat([DF_obj1, DF_obj2])

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4.0,5.0
1,6,7,8,9,10.0,11.0
2,12,13,14,15,16.0,17.0
3,18,19,20,21,22.0,23.0
4,24,25,26,27,28.0,29.0
5,30,31,32,33,34.0,35.0
0,0,1,2,3,,
1,4,5,6,7,,
2,8,9,10,11,,


### Transforming data
#### Dropping data

In [47]:
# Dropping the rows whose index labels are 1 and 3; drop() returns the result and leaves DF_obj1 unchanged
DF_obj1.drop([1, 3])

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
2,12,13,14,15,16,17
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [48]:
# Dropping the columns labelled 2 and 5; axis=1 makes drop() look the labels up among the columns
DF_obj1.drop([2, 5], axis=1)

Unnamed: 0,0,1,3,4
0,0,1,3,4
1,6,7,9,10
2,12,13,15,16
3,18,19,21,22
4,24,25,27,28
5,30,31,33,34


#### Adding data

In [49]:
# Build a Series of the 6 integers 0 through 5 and name it 'added_series' at construction time
series_obj = pd.Series(data=np.arange(6), name="added_series")
series_obj

0    0
1    1
2    2
3    3
4    4
5    5
Name: added_series, dtype: int32

In [50]:
# Joining the DataFrame DF_obj1 with the Series series_obj using the DataFrame's join() method.
# The rows are matched on their index, and the Series' name ('added_series') becomes the new column label.
# The resulting DataFrame is stored as added_DF.
added_DF = DF_obj1.join(series_obj)
added_DF

Unnamed: 0,0,1,2,3,4,5,added_series
0,0,1,2,3,4,5,0
1,6,7,8,9,10,11,1
2,12,13,14,15,16,17,2
3,18,19,20,21,22,23,3
4,24,25,26,27,28,29,4
5,30,31,32,33,34,35,5


In [51]:
# Appending a DataFrame to itself
# DataFrame.append() was deprecated and has since been removed, so pd.concat() is used instead.
# Old form, no longer available:
#     added_Datatable = added_DF.append(added_DF, ignore_index=False)
# Passing 'ignore_index=False' leaves the original index values in place after appending
# Storing the appended DataFrame as 'added_Datatable'

added_Datatable = pd.concat([added_DF, added_DF], ignore_index=False)
added_Datatable

Unnamed: 0,0,1,2,3,4,5,added_series
0,0,1,2,3,4,5,0
1,6,7,8,9,10,11,1
2,12,13,14,15,16,17,2
3,18,19,20,21,22,23,3
4,24,25,26,27,28,29,4
5,30,31,32,33,34,35,5
0,0,1,2,3,4,5,0
1,6,7,8,9,10,11,1
2,12,13,14,15,16,17,2
3,18,19,20,21,22,23,3


In [52]:
# Passing 'ignore_index=True' discards the original index values and renumbers the rows from 0
# Storing the appended DataFrame as 'added_Datatable'
# Old form, no longer available:
#     added_Datatable = added_DF.append(added_DF, ignore_index=True)

added_Datatable = pd.concat([added_DF, added_DF], ignore_index=True)
added_Datatable

Unnamed: 0,0,1,2,3,4,5,added_series
0,0,1,2,3,4,5,0
1,6,7,8,9,10,11,1
2,12,13,14,15,16,17,2
3,18,19,20,21,22,23,3
4,24,25,26,27,28,29,4
5,30,31,32,33,34,35,5
6,0,1,2,3,4,5,0
7,6,7,8,9,10,11,1
8,12,13,14,15,16,17,2
9,18,19,20,21,22,23,3


#### Sorting data

In [53]:
# To sort the values in a DataFrame, call sort_values() with two arguments:
#   by        - the label of the column to sort on (here the column labelled 2)
#   ascending - True for ascending order, False for descending order
DF_sorted = DF_obj1.sort_values(by=2, ascending=False)
DF_sorted

Unnamed: 0,0,1,2,3,4,5
5,30,31,32,33,34,35
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


## Grouping and data aggregation

In [54]:
# Location of the external dataset: the file is expected under the current working
# directory at Data/mtcars.csv ('os' is imported in the notebook's import cell)
path = os.getcwd()
address = "/Data/mtcars.csv"

# Reading the CSV file at that location with pd.read_csv
cars = pd.read_csv(path + address)

# Assigning column names to DataFrame 'cars' (replaces whatever header names were read from the file)
cars.columns = ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']

# Showing the first 5 records of the DataFrame using the head() method
cars.head()

Unnamed: 0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [55]:
# Grouping the 'cars' DataFrame by the values of its 'cyl' column
cars_groups = cars.groupby('cyl')

# Printing the grouped DataFrame: each iteration yields a group key and the rows
# belonging to that group. cars_groups.get_group(key) returns the same rows for a given key.
for key, items in cars_groups:
    print(items)

         car_names   mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  \
2       Datsun 710  22.8    4  108.0   93  3.85  2.320  18.61   1   1     4   
7        Merc 240D  24.4    4  146.7   62  3.69  3.190  20.00   1   0     4   
8         Merc 230  22.8    4  140.8   95  3.92  3.150  22.90   1   0     4   
17        Fiat 128  32.4    4   78.7   66  4.08  2.200  19.47   1   1     4   
18     Honda Civic  30.4    4   75.7   52  4.93  1.615  18.52   1   1     4   
19  Toyota Corolla  33.9    4   71.1   65  4.22  1.835  19.90   1   1     4   
20   Toyota Corona  21.5    4  120.1   97  3.70  2.465  20.01   1   0     3   
25       Fiat X1-9  27.3    4   79.0   66  4.08  1.935  18.90   1   1     4   
26   Porsche 914-2  26.0    4  120.3   91  4.43  2.140  16.70   0   1     5   
27    Lotus Europa  30.4    4   95.1  113  3.77  1.513  16.90   1   1     5   
31      Volvo 142E  21.4    4  121.0  109  4.11  2.780  18.60   1   1     4   

    carb  
2      1  
7      2  
8      2  
17     

In [56]:
# Mean of every listed variable, computed separately for each 'cyl' group
# Only the listed columns are averaged; car_names, which holds text, is not in the selection
cars_groups[['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']].mean()

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4,26.663636,4.0,105.136364,82.636364,4.070909,2.285727,19.137273,0.909091,0.727273,4.090909,1.545455
6,19.742857,6.0,183.314286,122.285714,3.585714,3.117143,17.977143,0.571429,0.428571,3.857143,3.428571
8,15.1,8.0,353.1,209.214286,3.229286,3.999214,16.772143,0.0,0.142857,3.285714,3.5


In [57]:
# Creating a subset of the cars dataset that keeps only the car_names, mpg, disp and wt columns
cars_subset = cars[['car_names', 'mpg', 'disp', 'wt']]
cars_subset

Unnamed: 0,car_names,mpg,disp,wt
0,Mazda RX4,21.0,160.0,2.62
1,Mazda RX4 Wag,21.0,160.0,2.875
2,Datsun 710,22.8,108.0,2.32
3,Hornet 4 Drive,21.4,258.0,3.215
4,Hornet Sportabout,18.7,360.0,3.44
5,Valiant,18.1,225.0,3.46
6,Duster 360,14.3,360.0,3.57
7,Merc 240D,24.4,146.7,3.19
8,Merc 230,22.8,140.8,3.15
9,Merc 280,19.2,167.6,3.44
