In [1]:
# Sometimes the dataset may contain adjacent groups of rows that share common properties.
# In some scenarios, such as time series datasets, this is normal. But in other scenarios, this may 
# throw off the model training process. In those cases, we may wish to shuffle the rows in the
# dataset before we segment the data for the purpose of training, testing, and validation.  


import pandas as pd
import numpy as np


# Build a small demo DataFrame of basic employee information.

# Column labels, in the same order as the fields of each record below
header = ['Name', 'Age', 'City', 'State']

# One tuple per employee: (name, age, city, state)
employee = [
    ('Steve', 45, 'Richmond', 'VA'),
    ('Susan', 48, 'San Diego', 'CA'),
    ('Bill', 32, 'Herndon', 'VA'),
    ('Mary', 28, 'Boston', 'MA'),
    ('Robert', 28, 'New York', 'NY'),
    ('Caroline', 41, 'Dallas', 'TX'),
]

# Create the DataFrame; rows receive the default index 0..5
df = pd.DataFrame(data=employee, columns=header)
df

Unnamed: 0,Name,Age,City,State
0,Steve,45,Richmond,VA
1,Susan,48,San Diego,CA
2,Bill,32,Herndon,VA
3,Mary,28,Boston,MA
4,Robert,28,New York,NY
5,Caroline,41,Dallas,TX


In [2]:
# Shuffle the rows into a random order.
# A fixed seed makes the shuffle repeatable: a fresh kernel run (Restart & Run All)
# produces the same row order each time. Change SHUFFLE_SEED for a different order.
# Note: re-running only this cell permutes the already-shuffled frame again.
# Each row keeps its original index label, so the index values will be out of
# sequence. If that's OK, skip the next cell.

SHUFFLE_SEED = 42

# frac=1 draws every row exactly once (sampling without replacement),
# which is a full random permutation of the rows
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df

Unnamed: 0,Name,Age,City,State
2,Bill,32,Herndon,VA
1,Susan,48,San Diego,CA
4,Robert,28,New York,NY
3,Mary,28,Boston,MA
0,Steve,45,Richmond,VA
5,Caroline,41,Dallas,TX


In [3]:
# Renumber the rows so the index runs sequentially from 0 to (number of rows - 1).
# drop=True discards the old, out-of-sequence labels instead of keeping them
# as an extra column.

df = (
    df
    .reset_index(drop=True)
)
df

Unnamed: 0,Name,Age,City,State
0,Bill,32,Herndon,VA
1,Susan,48,San Diego,CA
2,Robert,28,New York,NY
3,Mary,28,Boston,MA
4,Steve,45,Richmond,VA
5,Caroline,41,Dallas,TX


In [4]:
# Thank you
# Mike Bitar