In [2]:
# to scrape data from wikipedia, we need install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does

import pandas as pd

Note: you may need to restart the kernel to use updated packages.


## Scraping data from the web
Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets)

In [3]:
# scrape table data from websites

rawdata = pd.read_html("https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets")
df = rawdata[1]
df

Unnamed: 0,Character,Actor/Muppet performer,Description,Unnamed: 3
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...,
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn...",
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br...",
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri...",
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...,Writer Christopher Finch called Anything Muppe...
...,...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i...",
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ...",
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally...",
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...,


In [None]:
list(df)

## Removing an unwanted column
Below are several ways to get rid of the final column (there are more ways to do this!) If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on. Probably the easiest method is to use `pop` (method 3)

In [4]:

# method 1
df = df.drop(['Unnamed: 3'], axis = 1)
df 

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [5]:

# method 2
df = df.loc[:,list(df)[0:3]] 
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [6]:
# method 3
df.pop('Unnamed: 3')
df

KeyError: 'Unnamed: 3'

In [7]:
# take a column from a dataframe and assign it to a list variable
a = list(df['Character'])

In [8]:
# find the first four items in the list
a[:4]

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford']

In [9]:
# find the last three items in the list
a[-3:]

['The Two-Headed Monster', 'Wes', 'Zoe']

In [10]:
# find items in the middle of the list
a[5:11]

['AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]']

In [11]:
# select the first 20 items from a list
b = a[:20]
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Buster']

In [12]:
# remove the last item in a list
b.pop()
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno']

In [None]:
# inpsect the list to make sure the last item was removed


In [13]:
# remove a specific item from the list
b.pop(7)
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno']

In [14]:
# stick a Kermit on the end of the list
b.append('Kermit')
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [None]:
# inspect the list to make sure Kermit was added


In [17]:
# insert an item into a list at a particular position
b.insert(5,'Miss Piggy')
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'Miss Piggy',
 'Miss Piggy',
 'Miss Piggy',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [19]:
# replace an item in a list
b[5] = 'Kermit'
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'Kermit',
 'Miss Piggy',
 'Miss Piggy',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [21]:
# make a new list which adds "is a cute monster" to each item in the list
d = [x + ' is a cute monster' for x in b]
d

['Abby Cadabby is a cute monster',
 'Alice Snuffleupagus is a cute monster',
 'Alistair Cookie is a cute monster',
 'The Amazing Mumford is a cute monster',
 'Anything Muppets is a cute monster',
 'Kermit is a cute monster',
 'Miss Piggy is a cute monster',
 'Miss Piggy is a cute monster',
 'AM Monsters is a cute monster',
 'Aristotle is a cute monster',
 'Baby Bear is a cute monster',
 'Barkley is a cute monster',
 'Beautiful Day Monster[broken anchor] is a cute monster',
 'Bennett Snerf is a cute monster',
 'Benny is a cute monster',
 'Bert is a cute monster',
 'Betty Lou is a cute monster',
 'Biff is a cute monster',
 'Big Bird is a cute monster',
 'Bip Bippadotta is a cute monster',
 'Bruno is a cute monster',
 'Kermit is a cute monster']

In [None]:
# reset list b to original first 20 items from list a


In [None]:
# make a list of your favorite monsters, and then make a new list which only includes the monsters 
# from list b that are also in your favorites list


In [None]:
# make a new list which includes the monsters from list b that are not in your favorites list




## More fun with lists

In [None]:
# add 10 to each number in d


In [None]:
# divide each number in d by 2


## Dataframe manipulation

In [None]:
# download the student sleep data
df = pd.read_csv("https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv")
df

In [None]:
# find the number of rows and columns in the dataframe


In [None]:
# make a new dataframe df1 which only includes the first 4 rows of the original dataframe


In [None]:
# make another new dataframe df2 which only includes rows 5 through the end of the original dataframe


In [None]:
# make a third dataframe df3 with df2 on top of df1 (hint: use pd.concat)


In [None]:
# overwrite df3 with df1 and df2 back in their original order


In [None]:
# make a new column called "average" which is the mean of the other columns for each row
df3['average'] = df3.mean(axis = 1)
df3

In [None]:
# remove the average column from the dataframe
df3.pop('average')
df3

In [None]:
# make a list of the means of each column in the dataframe
column_means = list(df3.mean())
column_means

In [None]:
# make a list of the means of each row in the dataframe
colnames = list(df3)
colnames

In [None]:
list(zip(colnames, column_means))

In [None]:
student_means = dict(zip(colnames, column_means))
student_means

In [None]:
print('Student 4\'s average:', round(student_means['Student 4'], 3))

In [None]:
# make a dataframe the mean hours of sleep for each student
df_means = pd.DataFrame(zip(colnames, column_means))
df_means.columns = ['Students', 'Sleep Hours']
df_means

In [None]:
# transpose the dataframe
df_transposed = df3.transpose()
df_transposed

In [None]:
colnames = list(df_transposed)
colnames

In [None]:
newcols = ['Day ' + str(x+1) for x in colnames]
newcols

In [None]:
df_transposed.columns = newcols
df_transposed

In [None]:
df_transposed.index.name = 'student'
df_transposed

In [None]:
df_transposed.reset_index(inplace = True)
df_transposed

In [None]:
df_long = pd.melt(df_transposed, id_vars = 'student')
df_long