#### we will work on our MovieLens dataset using the "Pandas" package.
# Pandas makes working with tabular data very easy, as we will see below.

import pandas as pd

# load movies.csv from disk into a Pandas DataFrame named movies_df
movies_csv_path = 'movies.csv'
movies_df = pd.read_csv(movies_csv_path)

In [1]:
# peek at the top of the data frame; five rows is what head() shows by default
movies_df.head(n=5)

NameError: name 'movies_df' is not defined

In [None]:
# the shape attribute of a data frame is a (rows, columns) pair;
# unpack it into named pieces and show the pair again

row_count, column_count = movies_df.shape
(row_count, column_count)

In [None]:
# to access a single cell by position, give iloc the (row, column) pair in one step.
# The chained form iloc[0][k] first builds a row Series labelled by column name and
# then looks up the integer k on it, which recent pandas versions deprecate.
print(movies_df.iloc[0, 0])
print(movies_df.iloc[0, 1])
print(movies_df.iloc[0, 2])

#### to access individual cells of the first row by (row, column) position
# iloc[0, k] reads the cell directly; the chained iloc[0][k] relies on an integer
# lookup against column-name labels, which recent pandas versions deprecate.
print(movies_df.iloc[0, 0])
print(movies_df.iloc[0, 1])
print(movies_df.iloc[0, 2])

In [None]:
# number of movies in this data = number of rows, the first entry of shape
movies_df.shape[0]

In [None]:
# next goal: the genre distribution of our movies.
# start by looking at the first five rows again to see how genres are stored

movies_df.head(5)

In [None]:
# genres arrive as one pipe-separated string per movie, which is awkward for counting

# break each string apart on '|' and keep the resulting list in a new 'genres_arr' column
genre_lists = movies_df['genres'].str.split('|')
movies_df['genres_arr'] = genre_lists

# show the first rows again, now including the new column
movies_df.head(5)

In [None]:
# what if we only wanted to work on Animation movies
filter = movies_df.apply(lambda row: 'Animation' in row['genres_arr'], axis='columns')
filter.head()

In [None]:
animation_df = movies_df[filter]
animation_df.head()
print(len(animation_df.index))

In [None]:
# now lets count the frequency of each genre across all our movies

# imported here so this cell carries everything it needs
from collections import Counter
from itertools import chain

# flatten the per-movie genre lists into one stream and tally it in a single pass;
# dict() keeps genre_counter a plain dict, with genres in the order first seen
genre_counter = dict(Counter(chain.from_iterable(movies_df['genres_arr'])))

print(genre_counter)

In [None]:
# now let's plot the genres distribution as a bar chart and visualize it

# introducing the matplotlib package which lets us plot charts, graphs in Python
%matplotlib inline
import matplotlib.pyplot as plt

# now lets plot this genre distribution as a pie chart
plt.pie(genre_counter.values(), labels=genre_counter.keys())
plt.title('Genre distribution')
plt.show()

In [None]:
# we can also plot a bar chart (with grid lines and slanted x axis labels for better readability)
genre_names = list(genre_counter.keys())
genre_totals = list(genre_counter.values())
# one integer slot per genre along the x axis
bar_positions = list(range(len(genre_names)))

plt.bar(bar_positions, genre_totals)
plt.xticks(bar_positions, genre_names, rotation=80)
# draw the grid lines the heading promises
plt.grid()
plt.title('Genre distribution as bar chart')
# show() renders the figure; plot() with no arguments only adds an empty line plot
plt.show()

In [None]:
# Lets find masala movies to watch, as we all love masala :)
# Real masala movies has Action and Romance and Comedy and Thriller :)
masala_genres = ['Romance','Comedy', 'Action', 'Thriller']
# build the required-genre set once instead of once per row
required_genres = frozenset(masala_genres)
# row predicate: True when the row's genre list contains every required genre
masala_lambda = lambda x: required_genres.issubset(x['genres_arr'])
masala_movies = movies_df[movies_df.apply(masala_lambda, axis='columns')]
masala_movies

In [None]:
# next up: the ratings attribute, which we will describe with a 5 point summary

# load ratings.csv from disk into a Pandas DataFrame called ratings_df
ratings_csv_path = 'ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(ratings_csv_path)
ratings_df.head(5)

In [None]:
# every distinct rating value users can give to movies, in ascending order:
# sort the column first, then drop the repeats
unique_ratings = ratings_df['rating'].sort_values().unique()
print(unique_ratings)

In [None]:
# how many ratings were given, and by how many different users

# every row of ratings_df is one rating
rating_count = ratings_df.shape[0]
# dropna=False counts a missing userId as a value too, like len(unique()) does
user_count = ratings_df['userId'].nunique(dropna=False)

print('#ratings %d' % rating_count)
print('#users %d' % user_count)

In [None]:
# introducing the numpy package which is used to find various stats out of data
import numpy as np

# 5 point summary of ratings: pull the column once, then report each statistic
rating_values = ratings_df['rating']
# both quartiles come out of a single percentile call
lower_quartile, upper_quartile = np.percentile(rating_values, [25, 75])

print('min: %.1f' % np.min(rating_values))
print('25 percentile: %.1f' % lower_quartile)
print('median: %.1f' % np.median(rating_values))
print('75 percentile: %.1f' % upper_quartile)
print('max: %.1f' % np.max(rating_values))

In [None]:
# now lets see how many times each rating value has been given

rating_frequencies = ratings_df['rating'].value_counts()
rating_frequencies

In [None]:
# now lets plot a histogram of movie ratings to get an overall picture
rating_ticks = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
# one bin per tick, centred on it, so every bar sits directly over its rating value
bin_edges = [tick - 0.25 for tick in rating_ticks] + [rating_ticks[-1] + 0.25]

plt.hist(ratings_df.rating, bins=bin_edges)
plt.xticks(rating_ticks)
plt.xlabel('Rating')
# each histogram count is one row of ratings_df, i.e. one rating, not one movie
plt.ylabel('# of ratings')
plt.grid()
plt.show()

In [None]:
# next goal: the highest rated movies (movies with highest average rating).
# remind ourselves of what the ratings frame looks like first
ratings_df.head(5)


In [None]:
# keep only the two columns needed to average ratings per movie
ratings_view = ratings_df[['movieId', 'rating']]
# mean rating for each movieId, then highest mean first, top ten rows
mean_rating_by_movie = ratings_view.groupby(['movieId'], as_index=False).mean()
best_first = mean_rating_by_movie.sort_values(by='rating', ascending=False)
best_first.head(10)

#### but which movies are these? We want their names as well

# join ratings_df with movies_df on movieId so that every rating row also carries
# the movie's columns, which lets us see the actual titles of the top 10 movies
merged_df = ratings_df.merge(movies_df, on='movieId')
merged_df.head(5)

In [None]:
# this time keep the title next to movieId and rating so the result is readable
ratings_view = merged_df[['movieId', 'rating', 'title']]
# mean rating per (movieId, title) pair, highest mean first, top ten rows
mean_rating_by_title = ratings_view.groupby(['movieId', 'title'], as_index=False).mean()
mean_rating_by_title.sort_values(by='rating', ascending=False).head(10)

In [None]:
# these are surely not the top rated movies we would expect, so something is off

# check how many ratings one of these movies received, taking movieId 163949 as the example:
# flag its rows, then count the flags
sample_movie_id = 163949
is_sample_movie = merged_df['movieId'] == sample_movie_id
int(is_sample_movie.sum())

#### now lets only consider movies which have at least 100 ratings and see how the top 10 movies change
# after count(), the 'rating' column holds the number of ratings in each (movieId, title) group
temp_df = ratings_view.groupby(['movieId', 'title'], as_index=False).count()
# ">= 100" is what "at least 100" means; comparing the whole column at once
# also avoids calling a Python function for every row
well_rated_df = temp_df[temp_df['rating'] >= 100]

well_rated_df.head()

In [None]:
# inner join on movieId: only ratings of movies listed in well_rated_df survive
final_df = merged_df.merge(well_rated_df, how='inner', on=['movieId'])
final_df.head(5)

In [None]:
# the columns selected here carry the _x suffix in final_df
ratings_view = final_df[['movieId', 'rating_x', 'title_x']]
# mean rating per movie, then order from best to worst and show the top ten
mean_by_movie = ratings_view.groupby(['movieId', 'title_x'], as_index=False).mean()
x = mean_by_movie.sort_values(by='rating_x', ascending=False)
x.head(10)