### Shuffling Dataset 

In [1]:
import os
import pandas as pd
import numpy as np

# Fetch the auto-mpg CSV; cells containing 'NA' or '?' are parsed as missing values.
df = pd.read_csv(
    filepath_or_buffer="https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=["NA", "?"],
)

In [4]:
# Fixed seed so the shuffle is reproducible; comment this line out to get a
# different row order on every run.
np.random.seed(42)

# Reorder the rows by a random permutation of the index, then replace the
# scrambled index with a fresh 0..n-1 index (drop=True discards the old labels).
df = (
    df.reindex(np.random.permutation(df.index))
    .reset_index(drop=True)
)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,34.4,4,98.0,65.0,2045,16.2,81,1,ford escort 4w
1,29.0,4,68.0,49.0,1867,19.5,73,2,fiat 128
2,28.4,4,151.0,90.0,2670,16.0,79,1,buick skylark limited
3,29.5,4,97.0,71.0,1825,12.2,76,2,volkswagen rabbit
4,30.0,4,135.0,84.0,2385,12.9,81,1,plymouth reliant
...,...,...,...,...,...,...,...,...,...
393,25.0,4,113.0,95.0,2228,14.0,71,3,toyota corona
394,34.1,4,91.0,68.0,1985,16.0,81,3,mazda glc 4
395,31.6,4,120.0,74.0,2635,18.3,81,3,mazda 626
396,27.4,4,121.0,80.0,2670,15.0,79,1,amc spirit dl


### Sorting a dataset 

In [5]:
import os
import pandas as pd

# Reload the auto-mpg CSV in its original row order; 'NA' and '?' become missing values.
df = pd.read_csv(
    filepath_or_buffer="https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=["NA", "?"],
)

In [6]:
# Order the rows alphabetically by the 'name' column (ascending is the default).
# The original index labels are kept, so they appear out of order afterwards.
df = df.sort_values("name")

In [8]:
# Show the first ten rows of the name-sorted frame.
df.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
96,13.0,8,360.0,175.0,3821,11.0,73,1,amc ambassador brougham
9,15.0,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl
66,17.0,8,304.0,150.0,3672,11.5,72,1,amc ambassador sst
315,24.3,4,151.0,90.0,3003,20.1,80,1,amc concord
257,19.4,6,232.0,90.0,3210,17.2,78,1,amc concord
261,18.1,6,258.0,120.0,3410,15.1,78,1,amc concord d/l
374,23.0,4,151.0,,3035,20.5,82,1,amc concord dl
283,20.2,6,232.0,90.0,3265,18.2,79,1,amc concord dl 6
107,18.0,6,232.0,100.0,2789,15.0,73,1,amc gremlin
33,19.0,6,232.0,100.0,2634,13.0,71,1,amc gremlin


### Grouping a dataset
 Grouping is used to summarize data. Because of this summarization, the row count will either stay the same or, more likely, shrink after a grouping is applied.

In [10]:
import os
import pandas as pd

# Reload the auto-mpg CSV; 'NA' and '?' cells are parsed as missing values.
df = pd.read_csv(
    filepath_or_buffer="https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=["NA", "?"],
)

# Preview the first five rows before grouping.
display(df.head())

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [12]:
# Distinct cylinder counts, in order of first appearance.
# .tolist() converts the NumPy array returned by unique() into native Python
# ints; list() would keep NumPy scalar elements, which display as
# np.int64(...) under NumPy >= 2.
df['cylinders'].unique().tolist()

[8, 4, 6, 3, 5]

In [14]:
# Mean mpg for each distinct cylinder count; the result is a Series whose
# index is the 'cylinders' value.
gr = (
    df.groupby("cylinders")["mpg"]
    .mean()
)
gr

cylinders
3    20.550000
4    29.286765
5    27.366667
6    19.985714
8    14.963107
Name: mpg, dtype: float64

In [15]:
# Convert the grouped means into a plain dict: {cylinders value -> mean mpg}.
dic = gr.to_dict()
dic

{3: 20.55,
 4: 29.28676470588236,
 5: 27.366666666666664,
 6: 19.985714285714284,
 8: 14.963106796116508}

In [16]:
# Look up the mean mpg for the 6-cylinder group.
dic[6]

19.985714285714284

In [20]:
# Number of non-null mpg values in each cylinders group.
df.groupby("cylinders")["mpg"].count()

cylinders
3      4
4    204
5      3
6     84
8    103
Name: mpg, dtype: int64

In [19]:
# Same per-cylinders count of non-null mpg values, as a plain dict:
# {cylinders value -> count}.
(
    df.groupby("cylinders")["mpg"]
    .count()
    .to_dict()
)

{3: 4, 4: 204, 5: 3, 6: 84, 8: 103}