In [1]:
import pandas as pd
import numpy as np
import re
import time
import bs4
import matplotlib.pyplot as plt
import seaborn as sns
import requests

In [2]:
# Column labels for the pipe-separated user file; they are supplied here
# via `names=` rather than read from the file.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.user',
    names=u_cols,
    sep='|',
)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
# Column labels for the tab-separated ratings file, supplied via `names=`.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
    names=r_cols,
    sep='\t',
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Labels for the first five columns of the pipe-separated item file.
m_cols = [
    'movie_id',
    'title',
    'release_date',
    'video_release_date',
    'imdb_url',
]

# Only the first five columns are loaded (usecols=range(5)); the file is
# decoded as latin-1.
movies = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
    names=m_cols,
    usecols=range(5),
    sep='|',
    encoding='latin-1',
)

movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [5]:
# Show the column dtypes, a blank line, then summary statistics of the
# numeric columns.
print(movies.dtypes, movies.describe(), sep='\n\n')

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object

          movie_id  video_release_date
count  1682.000000                 0.0
mean    841.500000                 NaN
std     485.695893                 NaN
min       1.000000                 NaN
25%     421.250000                 NaN
50%     841.500000                 NaN
75%    1261.750000                 NaN
max    1682.000000                 NaN


In [6]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
# Select a single column (returns a Series) and preview it.
users.loc[:, 'occupation'].head()

0    technician
1         other
2        writer
3    technician
4         other
Name: occupation, dtype: object

In [8]:
# Select several columns at once by passing a list of labels; the result
# is a DataFrame with the columns in the order listed.
columns_you_want = ['occupation', 'sex']
users.loc[:, columns_you_want].head()

Unnamed: 0,occupation,sex
0,technician,M
1,other,F
2,writer,M
3,technician,M
4,other,F


In [9]:
# Positional row access: print the row at position 3 (the fourth row).
print(users.iloc[3, :])

user_id                4
age                   24
sex                    M
occupation    technician
zip_code           43537
Name: 3, dtype: object


In [10]:
# Boolean-mask filter: keep only users strictly older than 25.
oldUsers = users.loc[users['age'].gt(25)]
oldUsers.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
1,2,53,F,other,94043
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201


In [11]:
# Male users aged exactly 40, filtered in two steps:
# first by sex, then by age on the already-filtered frame.
usersM = users.loc[users['sex'].eq('M')]
users40M = usersM.loc[usersM['age'].eq(40)]
users40M.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
18,19,40,M,librarian,2138
82,83,40,M,other,44133
115,116,40,M,healthcare,97232
199,200,40,M,programmer,93402
283,284,40,M,executive,92629


In [12]:
# Female programmers: combine both conditions into a single mask with `&`,
# then print their mean age.
usersF = users.loc[users['sex'].eq('F') & users['occupation'].eq('programmer')]
print(usersF['age'].mean())

32.166666666666664


In [13]:
# Split-apply-combine: number of ratings submitted by each user.
# First, the five user_ids with the most ratings, via value_counts.
print(ratings['user_id'].value_counts().head())

# Then the same counts through an explicit groupby: split movie_id by
# user_id and count the entries in each group.
grouped_data = ratings.groupby('user_id')['movie_id']
rating_per_user = grouped_data.count()
rating_per_user.head()

405    737
655    685
13     636
450    540
276    518
Name: user_id, dtype: int64


user_id
1    272
2     62
3     54
4     24
5    175
Name: movie_id, dtype: int64

In [14]:
# Mean rating of every movie: group the rating column by movie_id and
# average within each group.
group_movies = ratings.groupby('movie_id')['rating']
average_rating = group_movies.mean()
average_rating.head()

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64

In [15]:
# Titles of the movies whose mean rating equals the overall maximum.
movie_max_rating = average_rating.max()
movie_ids = average_rating.loc[average_rating.eq(movie_max_rating)].index
movies.loc[movies['movie_id'].isin(movie_ids), 'title']

813                         Great Day in Harlem, A (1994)
1121                       They Made Me a Criminal (1939)
1188                                   Prefontaine (1997)
1200           Marlene Dietrich: Shadow and Light (1996) 
1292                                      Star Kid (1997)
1466                 Saint of Fort Washington, The (1993)
1499                            Santa with Muscles (1996)
1535                                 Aiqing wansui (1994)
1598                        Someone Else's America (1995)
1652    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

In [16]:
# Number of ratings behind each movie, shown only for the movies whose
# mean rating equals the maximum.
how_many_ratings = group_movies.count()
print(how_many_ratings.loc[average_rating.eq(movie_max_rating)])

movie_id
814     1
1122    1
1189    3
1201    1
1293    3
1467    2
1500    2
1536    1
1599    1
1653    1
Name: rating, dtype: int64


In [17]:
# Mean rating given by every user: group the rating column by user_id
# and average within each group.
group_users = ratings.groupby('user_id')['rating']
aver_rate_per_user = group_users.mean()
aver_rate_per_user.head()

user_id
1    3.610294
2    3.709677
3    2.796296
4    4.333333
5    2.874286
Name: rating, dtype: float64

In [18]:
# For each occupation, flag whether men strictly outnumber women among
# its users (True = more 'M' than 'F' entries in that occupation group).
group_occupations = users['sex'].groupby(users['occupation'])
occupation_by_sex = group_occupations.apply(
    # Use the vectorized Series.sum() on the boolean masks instead of the
    # Python builtin sum(), which iterates the Series element by element.
    lambda sexes: (sexes == 'M').sum() > (sexes == 'F').sum()
)
occupation_by_sex.head()

occupation
administrator    True
artist           True
doctor           True
educator         True
engineer         True
Name: sex, dtype: bool

## Scraping

In [19]:
# Fixed search URL for job postings containing "data scientist".
url = 'http://www.indeed.com/jobs?q=data+scientist&l='
# Fetch the page. Without a timeout, requests waits indefinitely on an
# unresponsive server, which would hang the notebook.
source = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx responses instead of parsing an error page.
source.raise_for_status()
# Parse the HTML with Python's built-in parser.
bs_tree = bs4.BeautifulSoup(source.text, "html.parser")

In [20]:

# See how many job postings the search found.
search_count_tag = bs_tree.find(id = 'searchCount')
if search_count_tag is None:
    # find() returns None when no element matches; fail with a clear
    # message rather than an AttributeError on `.contents`.
    raise ValueError("No element with id 'searchCount' found in the page.")
job_count_string = search_count_tag.contents[0]
# The total is taken as the second-to-last whitespace-separated token.
job_count_string = job_count_string.split()[-2]
print("Search yielded %s hits." % (job_count_string))

# Note that job_count_string is still a string, not an integer, and the
# "," separator prevents us from casting it to int directly; keep only
# the digit characters and convert those.
job_count = int(''.join(ch for ch in job_count_string if ch.isdigit()))

print(job_count)

Search yielded 25,093 hits.
25093


In [21]:
# The website is only listing 10 results per page,
# so we need to scrape them page after page.
print(job_count)
num_pages = int(np.ceil(job_count/10))

base_url = 'http://www.indeed.com'
job_links = []
for i in range(5): #do range(num_pages) if you want them all
    # Progress output: pages remaining, printed on every 10th page.
    if i%10==0:
        print(num_pages-i)
    # The `start` query parameter is advanced by 10 for each page.
    url = base_url + '/jobs?q=data+scientist&start=' + str(i*10)
    # Use a timeout and check the HTTP status, matching the initial fetch,
    # so a stalled or failed request does not go unnoticed.
    html_page = requests.get(url, timeout=10)
    html_page.raise_for_status()
    bs_tree = bs4.BeautifulSoup(html_page.text, 'html.parser')
    job_link_area = bs_tree.find(id = 'resultsCol')
    if job_link_area is None:
        # find() returns None when no element matches; fail clearly here
        # rather than with an AttributeError on the next line.
        raise ValueError("No element with id 'resultsCol' found at " + url)
    # Keep the <div>s whose class tokens, joined together, spell "rowresult".
    job_postings = [jp for jp in job_link_area.find_all("div")
                    if jp.get('class') is not None
                    and ''.join(jp.get('class')) == "rowresult"]
    # Skip postings without a 'data-jk' attribute: .get() returns None for
    # them, which cannot be concatenated into a link.
    job_ids = [jk for jk in (jp.get('data-jk') for jp in job_postings)
               if jk is not None]

    # Build the link for each posting. The loop variable is named job_id
    # so the builtin id() is not shadowed in the notebook namespace.
    for job_id in job_ids:
        job_links.append(base_url + '/rc/clk?jk=' + job_id)

    # Pause briefly between page requests.
    time.sleep(.5)

print("We found a lot of jobs: ", len(job_links))

25093
2510
We found a lot of jobs:  45
