# Chapter 7 - How to prepare the data

In [0]:
import pandas as pd

## How to work with datetime columns

In [0]:
fires = pd.read_pickle('fires_cleaned.pkl')

In [0]:
fires.head()

In [0]:
fires.info()

In [0]:
fires['fire_month'] = fires.discovery_date.dt.month

In [0]:
fires.fire_month.head(5)

In [0]:
# Days between discovery and containment (comparable to SQL's DATEDIFF(DAY, ...)).
# The subtraction yields timedeltas; .dt.days keeps their whole-day component.
fires['days_burning'] = (fires.contain_date - fires.discovery_date).dt.days
# NOTE: an hours variant cannot use `.dt.hours` (the timedelta accessor has no such
# attribute); it would be `.dt.total_seconds() / 3600`.

fires.head()



In [0]:
fires.days_burning.describe()

In [0]:
fires.days_burning.hist(bins=20)

In [0]:
def statsoncol(indf, incol):
    """Return summary statistics for a single column of a DataFrame.

    Parameters
    ----------
    indf : pandas.DataFrame
        Frame that holds the column.
    incol : hashable
        Label of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        Result of ``describe()`` on a one-column frame, so the output keeps
        the column label as its header.
    """
    one_column = indf.loc[:, [incol]]
    return one_column.describe()

In [0]:
statsoncol (fires,"discovery_date")


In [0]:
fires.head()

## How to work with string and numeric columns

In [0]:
fires.fire_name.head(10)

In [0]:
fires['fire_name'] = fires.fire_name.str.title()
fires.head()

In [0]:
# Build a label of the form "The <fire_name> Fire (<fire_year>)" for every row.
fires['full_name'] = (
    'The ' + fires.fire_name + ' Fire '
    + '(' + fires.fire_year.astype(str) + ')'
)
fires.head()

In [0]:

statsoncol (fires,'acres_burned')


In [0]:
# Acres burned per day of burning.
# Only the two columns involved decide whether a row gets a value: a frame-wide
# dropna() would also blank out rows whose *unrelated* columns are missing.
# Zero-day fires are masked to NaN so the division never yields inf (or 0/0).
fires['acres_per_day'] = (
    fires.acres_burned / fires.days_burning.where(fires.days_burning != 0)
)
statsoncol (fires,'acres_per_day')

In [0]:
fires.acres_per_day.hist(bins=1000)

In [0]:
fires.acres_per_day.quantile(0.99)

In [0]:
#len(fires.query("acres_per_day >1454.5"))
#2150
len(fires.query("acres_per_day > 0"))
# 42991

In [0]:
import numpy as np

In [0]:
fires['adjusted_acres_per_day'] = np.log10(fires.acres_per_day)

In [0]:
fires.adjusted_acres_per_day.hist(bins=1000)

In [0]:
import seaborn as sns
ax = sns.boxplot(data=fires, x = 'acres_per_day')


In [0]:
fires.acres_per_day.head(100)

In [0]:
# Replace missing acres_per_day values with the column mean.
# The filled Series is assigned back to the column: fillna(inplace=True) called on
# `fires.acres_per_day` works on an intermediate Series, which newer pandas versions
# warn about and which leaves `fires` unchanged when Copy-on-Write is enabled.
fires['acres_per_day'] = fires.acres_per_day.fillna(fires.acres_per_day.mean())
fires[['fire_name','full_name','acres_burned','days_burning','acres_per_day']].head()

## How to add summary columns

In [0]:
fires[['state','days_burning']].head()

In [0]:
fires.groupby('state')['days_burning'].mean()

In [0]:
fires['mean_days'] = fires.groupby('state')['days_burning'].transform(func='mean')
fires[['state','days_burning','mean_days']].head()

## How to apply functions to rows or columns

In [0]:
workData = pd.read_pickle('workData.pkl')
workData.head(3)

In [0]:
workData[["wrkstat","hrs1"]].apply('mean')

In [0]:
import numpy as np
workData[['sex','hrs1']].apply(np.mean)

In [0]:
workData['avg_rating'] = workData[
    ['wkcontct','talkspvs','effctsup']].apply(np.mean, axis=1)
workData.head(3)

## How to apply user-defined functions

In [0]:
def convert_sex(row):
    """Translate the numeric ``sex`` code of a row into a text label.

    Parameters
    ----------
    row : object exposing a ``sex`` attribute
        For example one DataFrame row handed over by ``apply(..., axis=1)``.

    Returns
    -------
    str
        'male' when the code equals 1, 'female' when it equals 2,
        'non-binary' for every other value.
    """
    code = row.sex
    if code == 1:
        return 'male'
    if code == 2:
        return 'female'
    return 'non-binary'
    
# Row-wise apply: convert_sex receives each row and its return value replaces the code.
# NOTE: `sex` is overwritten, so running this cell a second time maps every
# (already converted) value to 'non-binary', because the labels equal neither 1 nor 2.
workData['sex'] = workData.apply(convert_sex, axis=1)
workData.head()

In [0]:
statsoncol (workData,'sex')

In [0]:
# get the data
gameData = pd.read_pickle('shot_cleaned.pkl')[['game_id','game_date']]
print('before :'+str(len(gameData)))
# Keep the first occurrence of each duplicated row. The result is reassigned
# rather than using inplace=True, so the column selection taken from the
# pickled frame is never mutated in place.
gameData = gameData.drop_duplicates(keep='first')
print('after :'+str(len(gameData)))
gameData

In [0]:
gameData.info()

In [0]:
def get_season(row):
    """Return the season label ``'<start year>-<end year>'`` for a row.

    Parameters
    ----------
    row : object exposing a ``game_date`` attribute
        ``game_date`` must provide ``month`` and ``year``.

    Returns
    -------
    str
        Dates in months 7-12 belong to the season starting that year;
        dates in months 1-6 belong to the season that started the year before.
    """
    played_on = row.game_date
    start_year = played_on.year if played_on.month > 6 else played_on.year - 1
    return f'{start_year}-{start_year + 1}'

gameData['season'] = gameData.apply(get_season, axis=1)
with pd.option_context('display.max_rows', 6, 'display.max_columns', None):
    display(gameData)

## How lambda expressions work with DataFrames

In [0]:
df = pd.DataFrame([[0,1,2],[3,4,5]], columns=['col1','col2','col3'])
df

In [0]:
df.apply(lambda x: x.sum() * 2, axis=0)

In [0]:
df.iloc[:,[0,2]].apply(lambda x: x.sum() * 2, axis=0)

In [0]:
df.apply(lambda x: x.sum() * 2, axis=1)

In [0]:
df.iloc[:,[0,2]].apply(lambda x: x.sum() * 2, axis=1)

## How to apply lambda expressions

In [0]:
statsoncol(workData,'wrkstat')

In [0]:
workData.wrkstat.value_counts()


In [0]:
workData.wrkstat.value_counts()

In [0]:
# Recode wrkstat: rows equal to 1.0 become 'full-time'; every other value
# (including missing ones, which never compare equal to 1.0) becomes 'part-time'.
# NOTE: the column is overwritten, so re-running this cell turns every row into
# 'part-time' because the stored strings no longer equal 1.0.
workData['wrkstat'] = workData.apply(
    lambda row: 'full-time' if row.wrkstat == 1.0 else 'part-time', axis=1)
workData.head()

In [0]:
carsData = pd.read_csv('cars.csv')
carsData.info()

In [0]:

# Brand = first whitespace-separated word of CarName, computed one row at a time.
carsData['Brand'] = carsData.apply(lambda x: x.CarName.split()[0], axis=1)
carsData[['CarName','Brand']].head()

## How to set or remove an index

In [0]:
fires_by_month = pd.read_pickle('fires_by_month.pkl')

In [0]:
fires_by_month.info()

In [0]:
fires_by_month.set_index('state', inplace=True)
fires_by_month.head(3)

In [0]:
fires_by_month = pd.read_pickle('fires_by_month.pkl')

In [0]:
fires_by_month.set_index(['state','fire_year','fire_month'], inplace=True)
fires_by_month.head(3)

In [0]:
fires_no_index = fires_by_month.reset_index()
fires_no_index.head(3)

## How to unstack indexed data

In [0]:
# top5_states must exist before it can be inspected; load it here so the
# notebook runs top to bottom on a fresh kernel without a NameError.
top5_states = pd.read_pickle('top_states.pkl')
top5_states.info()

In [0]:
# get indexed dataset
top5_states = pd.read_pickle('top_states.pkl')
top5_states.head(3)

In [0]:
len(top5_states)

In [0]:
# unstack the state level
top_wide = top5_states[['days_burning','fire_count']].unstack(level='state')
# top_wide = top5_states[['days_burning','fire_count']].unstack(level=0)
top_wide.head(3)

In [0]:
top_wide = top5_states.unstack(level='state')
top_wide.head(3)

In [0]:
top_wide = top5_states.fire_count.unstack(level='state')
top_wide.head(3)

## How to join DataFrames

In [0]:
# get the shots DataFrame
allShotData = pd.read_pickle('shot_cleaned.pkl')
allShotData

In [0]:
shots = allShotData.drop(columns=['period','minutes_remaining',
                                  'seconds_remaining','loc_x','loc_y','home_team',
                                  'game_date','shot_attempted_flag','shot_made_flag',
                                  'action_type','visiting_team'])
shots1 = shots.head(2)
shots2 = shots.query('game_id == "0020900030"').head(1)
shots3 = shots.query('game_id == "0020900069"').head(1)
shots = pd.concat([shots1,shots2,shots3], ignore_index=True)
shots.set_index('game_id', inplace = True)
shots

In [0]:
# get the points_by_game DataFrame
points_by_game = pd.read_pickle('pointsScoredGame.pkl')
points_by_game = points_by_game.query('game_id == "0020900015" or game_id == "0020900030" or game_id == "0020900082"')
points_by_game

In [0]:
shots_joined = shots.join(points_by_game, how='inner')
shots_joined

In [0]:
# Show the documentation for DataFrame.join. help() is plain Python, unlike the
# IPython-only trailing `?`, so the cell also runs outside an IPython kernel.
help(points_by_game.join)

In [0]:
points_by_game

In [0]:
# Independent copy of points_by_game with an extra player_name column.
# `deep` takes a boolean: the string 'true' only worked because any non-empty
# string is truthy (the string 'false' would have requested a deep copy as well).
points_by_game2 = points_by_game.copy(deep=True)
points_by_game2['player_name'] = 'Steph Curry'
points_by_game2

In [0]:
shots_joined = shots.join(points_by_game2, lsuffix='_1', rsuffix='_2', 
                          how='left')
shots_joined

In [0]:
shots_joined_outer = shots.join(points_by_game2, lsuffix='_1', 
                                rsuffix='_2', how='outer')
shots_joined_outer

## Merge

In [0]:
shots2 = shots.reset_index()
shots2

In [0]:
points_by_game2 = points_by_game.reset_index()
points_by_game2

In [0]:
# how='cross' pairs every row of shots2 with every row of points_by_game2
# (Cartesian product); no key column is matched.
# NOTE(review): both frames were just turned into plain-column frames with
# reset_index() — if a key-based merge was intended, it needs an `on=` column
# and a non-cross `how`; confirm.
shots_merged = shots2.merge(points_by_game2,  how='cross')
shots_merged

## Concat

In [0]:
fires.head(3)

In [0]:
# get the data
top5_fires = fires.sort_values('acres_burned', ascending=False).head(5)
top5_fires = top5_fires.reset_index(drop=True)
top5_fires

In [0]:
fires_1 = top5_fires.iloc[:3]
fires_1

In [0]:
# Rows 3 onward of the top-5 frame, renumbered from 0 and without the
# fire_month / days_burning columns.
fires_2 = (
    top5_fires.iloc[3:]
    .reset_index(drop=True)
    .drop(columns=['fire_month','days_burning'])
)
fires_2

In [0]:
fires_concat = pd.concat([fires_1,fires_2], ignore_index=True)
fires_concat.head(10)

## What the SettingWithCopyWarning is warning you about

In [0]:
df = shots.copy(deep=True)

In [0]:
df.head(3)

In [0]:
dfSlice = df.loc['0020900015',:]
dfSlice.loc[:,'player_name'] = 'Curry'

In [0]:
dfSlice

In [0]:
df.head(3)

In [0]:
dfSlice.head(3)

## When the SettingWithCopyWarning is given

### Generates the warning but no corruption

In [0]:
df = shots.copy(deep=True)

In [0]:
dfSlice = df.query('game_id == "0020900015"')
dfSlice.loc[:,'player_name'] = 'Curry'
df.head(2)

In [0]:
dfSlice.head(2)

#### How to use the copy() method to stop the warning message

In [0]:
dfFixed = df.query('game_id == "0020900015"').copy()
dfFixed.loc[:,'player_name'] = 'Curry'

### Generates the warning and corrupts the data

In [0]:
df = shots.copy(deep=True)

In [0]:
dfSlice = df.loc['0020900015',:]
dfSlice.loc[:,'player_name'] = 'Curry'
df.head(2)

In [0]:
dfSlice.head(2)

#### How to fix this code

In [0]:
df = shots.copy(deep=True)

In [0]:
dfFixed = df.loc['0020900015',:].copy()
dfFixed.loc[:,'player_name'] = 'Curry'
df.head(2)

In [0]:
dfFixed.head(2)

## When the SettingWithCopyWarning isn’t given

In [0]:
df = shots.copy(deep=True)

In [0]:
# Plain assignment does not copy: dfSlice and df are two names for the same
# DataFrame object, so the column assignment below is visible through both.
dfSlice = df
dfSlice.loc[:,'player_name'] = 'Curry'
df.head(2)

In [0]:
dfSlice.head(2)

### How to fix this code

In [0]:
df = shots.copy(deep=True)

In [0]:
# copy() gives dfFixed its own data, so the assignment below leaves df untouched.
dfFixed = df.copy()
dfFixed.loc[:,'player_name'] = 'Curry'
df.head(2)

In [0]:
dfFixed.head(2)