In [3]:
import pandas as pd

# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 5 columns):
movie_id              1682 non-null int64
title                 1682 non-null object
release_date          1681 non-null object
video_release_date    0 non-null float64
imdb_url              1679 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 65.8+ KB


In [5]:
movies.dtypes

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object

In [6]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [7]:
print(users[['age', 'zip_code']].head())
print('\n')

# can also store in a variable to use later
columns_you_want = ['occupation', 'sex'] 
print(users[columns_you_want].head())

   age zip_code
0   24    85711
1   53    94043
2   23    32067
3   24    43537
4   33    15213


   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F


In [8]:
# users older than 25
print(users[users.age > 25].head(3))
print('\n')

# users aged 40 AND male
print(users[(users.age == 40) & (users.sex == 'M')].head(3))
print('\n')

# users younger than 30 OR female
print(users[(users.sex == 'F') | (users.age < 30)].head(3))           

   user_id  age sex occupation zip_code
1        2   53   F      other    94043
4        5   33   F      other    15213
5        6   42   M  executive    98101


     user_id  age sex  occupation zip_code
18        19   40   M   librarian    02138
82        83   40   M       other    44133
115      116   40   M  healthcare    97232


   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067


In [89]:
import pandas as pd
movielens = pd.merge(movies, ratings)

In [10]:
stats = movielens[['movie_id', 'title', 'rating']]
stats.head()

Unnamed: 0,movie_id,title,rating
0,1,Toy Story (1995),4
1,1,Toy Story (1995),5
2,1,Toy Story (1995),4
3,1,Toy Story (1995),4
4,1,Toy Story (1995),3


In [67]:
by_title = stats.groupby('title')
by_id = stats.groupby('movie_id')

by_id_mean = by_id['rating'].mean()
print by_id_mean.head()
print by_id.head()

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64
       movie_id                                              title  rating
0             1                                   Toy Story (1995)       4
1             1                                   Toy Story (1995)       5
2             1                                   Toy Story (1995)       4
3             1                                   Toy Story (1995)       4
4             1                                   Toy Story (1995)       3
452           2                                   GoldenEye (1995)       3
453           2                                   GoldenEye (1995)       2
454           2                                   GoldenEye (1995)       4
455           2                                   GoldenEye (1995)       3
456           2                                   GoldenEye (1995)       4
583           3                                  Four Rooms (1995) 

In [12]:
print(by_title.count().head()) # NOT NULL records within each column
print('\n')
print(by_title.size().tail())

                           movie_id  rating
title                                      
'Til There Was You (1997)         9       9
1-900 (1994)                      5       5
101 Dalmatians (1996)           109     109
12 Angry Men (1957)             125     125
187 (1997)                       41      41


title
Young Guns II (1990)                     44
Young Poisoner's Handbook, The (1995)    41
Zeus and Roxanne (1997)                   6
unknown                                   9
Á köldum klaka (Cold Fever) (1994)        1
dtype: int64


In [13]:
print(by_title.sum()[20:25]) # total salaries of each department
print('\n')
print(by_title.mean()[20:25]) # average salary of each department
print('\n')
print(by_title.median()[20:25])

                                                    movie_id  rating
title                                                               
Addams Family Values (1993)                            33582     245
Addicted to Love (1997)                                28890     171
Addiction, The (1995)                                   8437      24
Adventures of Pinocchio, The (1996)                    41340     119
Adventures of Priscilla, Queen of the Desert, T...     42402     399


                                                    movie_id    rating
title                                                                 
Addams Family Values (1993)                            386.0  2.816092
Addicted to Love (1997)                                535.0  3.166667
Addiction, The (1995)                                  767.0  2.181818
Adventures of Pinocchio, The (1996)                   1060.0  3.051282
Adventures of Priscilla, Queen of the Desert, T...     382.0  3.594595


                

In [87]:
ratings_count = by_title.size()
ratings_count_250 = len(ratings_count.index[ratings_count>250])
print ratings_count_250

by_title_mean = by_title.mean()
by_title_mean.movie_id = by_title_mean.movie_id.astype(int)
by_title_mean_sorted = by_title_mean.sort_values('rating', ascending=True)
by_title_mean_sorted_250 = by_title_mean_sorted[:ratings_count_250]   

68


In [88]:
df = pd.DataFrame(by_title_mean_sorted_250, columns=['movie_id', 'rating'])
df.head()

Unnamed: 0_level_0,movie_id,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Eye of Vichy, The (Oeil de Vichy, L') (1993)",1562,1.0
Butterfly Kiss (1995),1621,1.0
Daens (1992),1565,1.0
JLG/JLG - autoportrait de décembre (1994),1366,1.0
Touki Bouki (Journey of the Hyena) (1973),1571,1.0


In [85]:
print by_title

<pandas.core.groupby.DataFrameGroupBy object at 0x1156d1450>


In [69]:
data = {'Open': [1, 2, 3],
       'Close': [4, 5, 6]}
df = pd.DataFrame(data)
print df

   Close  Open
0      4     1
1      5     2
2      6     3


In [70]:
df.apply(np.mean, axis=1)

0    2.5
1    3.5
2    4.5
dtype: float64

In [79]:
df.apply(np.mean, axis=0)

Close    5.0
Open     2.0
dtype: float64

In [83]:
df.apply(lambda y: (y[0] - y[1]), axis=0).head(3)

Close   -1
Open    -1
dtype: int64

In [78]:
#data = {by_title['movie_id'], by_title['title']}
#movie_df = pd.DataFrame(data)
by_title

<pandas.core.groupby.DataFrameGroupBy object at 0x1156d1450>

In [94]:
movielens = pd.merge(movies, ratings)
movielens_info = movies[['movie_id', 'title']]
movielens_rating = ratings[['movie_id', 'rating']].sort_values('movie_id', ascending=False)

In [95]:
print movielens_info.head()
print movielens_rating.head()

   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)
       movie_id  rating
25741         1       2
93639         1       4
55726         1       5
49529         1       4
89079         1       4
