# Chapter 7: Data wrangling


## Dataframes


In [2]:
import pandas as pd

# raw measurements: one list per variable, nine participants in each
age = [17, 19, 21, 37, 18, 19, 47, 18, 19]
score = [12, 10, 11, 15, 16, 14, 25, 21, 29]
rt = [3.552, 1.624, 6.431, 7.132, 2.925, 4.662, 3.634, 3.635, 5.234]
group = ["test"] * 5 + ["control"] * 4

# bundle the lists into a dataframe; each keyword becomes a column name
df = pd.DataFrame(dict(age=age, score=score, rt=rt, group=group))

# a bare name on the last line of a cell displays the dataframe
df

Unnamed: 0,age,score,rt,group
0,17,12,3.552,test
1,19,10,1.624,test
2,21,11,6.431,test
3,37,15,7.132,test
4,18,16,2.925,test
5,19,14,4.662,control
6,47,25,3.634,control
7,18,21,3.635,control
8,19,29,5.234,control


In [3]:
# pull out a stored variable: indexing the dataframe with a column name
# returns that single column (the output footer shows its name and dtype)
score_data = df['score']
score_data

0    12
1    10
2    11
3    15
4    16
5    14
6    25
7    21
8    29
Name: score, dtype: int64

In [4]:
# only pull out data from the first 4 participants
# .iloc slices by position, so [:4] always means "the first four rows",
# no matter how the index happens to be labelled
score_data = df['score'].iloc[:4]
score_data

0    12
1    10
2    11
3    15
Name: score, dtype: int64

In [5]:
# extracting data from a row instead of a column
# NOTE: score_data is reused here, but it now holds every variable for one
# participant (age, score, rt, group), not just scores
score_data = df.loc[2] # .loc selects by index label; label 2 is the third row of df
score_data

age         21
score       11
rt       6.431
group     test
Name: 2, dtype: object

In [9]:
# get raw data out of a pandas series by converting it to a plain Python list
# NOTE(review): in the printed output the row's numbers appear wrapped as
# np.int64/np.float64 while the column's are plain ints -- presumably because
# the mixed-type row has dtype object (as shown when df.loc[2] was displayed);
# confirm before relying on the element types
my_row = list(df.loc[2])
my_column = list(df['score'])
print(my_row)
print(my_column)

[np.int64(21), np.int64(11), np.float64(6.431), 'test']
[12, 10, 11, 15, 16, 14, 25, 21, 29]


In [16]:
# smart tips
# NOTE: a cell only displays its last expression, so run these one at a time
# (or wrap each in display(...)) to see every result

# see column variables
list(df)

# check how many rows and columns we have; shape is (rows, columns)
df.shape

# see first data in df
df.head(10) # default is 5, otherwise specify

# see last data in df
df.tail(10) # default is 5, otherwise specify

# get all data out of df and into a list (one inner list per row)
df.values.tolist()

[[17, 12, 3.552, 'test'],
 [19, 10, 1.624, 'test'],
 [21, 11, 6.431, 'test'],
 [37, 15, 7.132, 'test'],
 [18, 16, 2.925, 'test'],
 [19, 14, 4.662, 'control'],
 [47, 25, 3.634, 'control'],
 [18, 21, 3.635, 'control'],
 [19, 29, 5.234, 'control']]

## Tabulating and cross-tabulating data
### Cross-tabulation = building a frequency table that counts how often each combination of values occurs

In [17]:
# build a small dataframe recording who said what
data = {
    'speaker': ["upsy-daisy"] * 4 + ["tombliboo"] * 2 + ["makka-pakka"] * 4,
    'utterance': ["pip", "pip", "onk", "onk", "ee", "oo", "pip", "pip", "onk", "onk"],
}
df = pd.DataFrame(data, columns=['speaker', 'utterance'])

# frequency table: number of rows per speaker, in a single column labelled "count"
pd.crosstab(index=df["speaker"], columns="count")

col_0,count
speaker,Unnamed: 1_level_1
makka-pakka,4
tombliboo,2
upsy-daisy,4


In [18]:
# cross-tabulate speakers against utterances instead;
# margins=True adds the "All" row and column holding the totals
pd.crosstab(
    index=df["speaker"],
    columns=df["utterance"],
    margins=True,
)

utterance,ee,onk,oo,pip,All
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
makka-pakka,0,2,0,2,4
tombliboo,1,0,1,0,2
upsy-daisy,0,2,0,2,4
All,1,4,1,4,10


## Transforming or recoding a variable
## Useful mathematical functions
## Slicing and dicing
## Extracting a subset of a dataframe
## Sorting, flipping, or merging datasets
## Reshaping a dataset