In [None]:
import numpy as np # linear algebra

import pandas as pd
# Raise the display limits so up to 500 columns/rows are rendered in outputs
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

import os

# 1. Reading the Data

In [None]:
# Load the IMDB movie data into the working dataframe
df = pd.read_csv(filepath_or_buffer="../input/IMDB-Movie-Data.csv")

# 2. Data Snapshot

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Preview the first fifty rows
df.head(n=50)

In [None]:
# Preview the final five rows
df.tail(n=5)

In [None]:
# Preview the final fifty rows
df.tail(n=50)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

In [None]:
# Maximum value of a column. A single column is a pandas Series, so use its own
# vectorised .max(): it skips missing values, whereas the builtin max() iterates
# element by element in Python and gives order-dependent results when NaN is present.
df['Rating'].max()

In [None]:
# Number of rows in the dataframe (first entry of its shape)
df.shape[0]

In [None]:
# Shape of the dataframe as a (number of rows, number of columns) tuple
df.shape

## 3. Handling Columns

### a. Getting Column Names in a list

In [None]:
# df.columns is a pandas Index; .tolist() converts it to the plain Python list
# that this section's heading promises.
columnnames = df.columns.tolist()
print(columnnames)

### b. Specifying user-defined Column Names

In [None]:
# Replace every column label at once; the list is positional, so it must name
# all columns in their existing order.
df.columns = [
    'Rank',
    'Title',
    'Genre',
    'Description',
    'Director',
    'Actors',
    'Year',
    'Runtime_Minutes',
    'Rating',
    'Votes',
    'Revenue_Millions',
    'Metascore',
]

### c. Another way: renaming selected columns with `rename`

In [None]:
# Rename only the listed columns. Reassigning the result (instead of inplace=True)
# keeps the step chainable and avoids mutating an object other names may still hold.
df = df.rename(columns={'Revenue_Millions': 'Rev_M', 'Runtime_Minutes': 'Runtime_min'})

In [None]:
# Check the renamed columns on the first five rows
df.head(n=5)

### d. Subsetting specific columns

In [None]:
# Keep only the columns used in the rest of the notebook. New columns are added to
# this frame later, so take an explicit copy; otherwise those assignments write to
# a slice of the original frame and trigger SettingWithCopyWarning.
df = df[['Rank', 'Title', 'Genre', 'Year','Runtime_min', 'Rating', 'Votes', 'Rev_M', 'Metascore']].copy()

In [None]:
# Check the reduced column set on the first five rows
df.head(n=5)

### e. Seeing column types:

In [None]:
# Data type of each column
df.dtypes

## 4. Apply and Lambda

#### a. Creating a Column

You can create a new column in many ways.
If you want a column that is a sum or difference of columns, you can pretty much use simple basic arithmetic. Here I get the average rating based on IMDB and Normalized Metascore.

In [None]:
# Average of the IMDB rating and the Metascore divided by 10
df['AvgRating'] = df['Rating'].add(df['Metascore'].div(10)).div(2)

But sometimes we may need to build complex logic around the creation of new columns.
To give you a convoluted example, let's say that we want to build a custom movie score based on a variety of factors.

Say that if the movie is of the thriller genre, I want to add 1 to the IMDB rating, subject to the condition that the rating remains less than or equal to 10. And if the movie is a comedy, I want to subtract 1 from the rating.

How do we do that?
Whenever I get a hold of such complex problems, I use apply/lambda. Let me first show you how I will do this.

In [None]:
def custom_rating(genre,rating):
    """Adjust a rating according to the genre string.

    A genre containing 'Thriller' gains one point, capped at 10. Otherwise a
    genre containing 'Comedy' loses one point, floored at 0. Any other genre
    leaves the rating unchanged. 'Thriller' is checked first, so it wins when
    both appear.
    """
    # Guard clauses: the first matching genre decides the result
    if 'Thriller' in genre:
        return min(10, rating + 1)
    if 'Comedy' in genre:
        return max(0, rating - 1)
    return rating
        
# Row-wise apply (axis=1): each row supplies its Genre and Rating to custom_rating
df['CustomRating'] = df.apply(
    lambda row: custom_rating(row['Genre'], row['Rating']),
    axis=1,
)

The general structure is:
- You define a function that will take the column values you want to play with to come up with your logic. Here the only two columns we end up using are genre and rating.
- You use an apply function with lambda along the row with axis=1. The general syntax is:

```df.apply(lambda x: func(x['col1'],x['col2']),axis=1)```

You should be able to create pretty much any logic using apply/lambda since you just have to worry about the custom function.

#### b. Filtering a dataframe

Pandas makes filtering and subsetting dataframes pretty easy. You can filter and subset dataframes using the normal comparison operators combined with the `&`, `|` and `~` operators.

In [None]:
# Single condition: keep only the movies whose rating exceeds 8
df_gt_8 = df.loc[df['Rating'].gt(8)]

df_gt_8.head(n=5)

In [None]:
# Multiple conditions: AND - movies rated above 8 that also have more than 100000 votes

And_df = df.loc[df['Rating'].gt(8) & df['Votes'].gt(100000)]

And_df.head(n=5)

In [None]:
# Multiple conditions: OR - dataframe with all movies rated greater than 8 or having a metascore more than 80
# (the comment previously said 90; it now states the threshold the code actually applies)

Or_df = df[(df['Rating']>8) | (df['Metascore']>80)]
Or_df.head()


In [None]:
# Multiple conditions: NOT - exclude all movies rated greater than 8 or having a metascore more than 80
# (the comment previously said 90; it now states the threshold the code actually applies)

Not_df = df[~((df['Rating']>8) | (df['Metascore']>80))]
Not_df.head()

Pretty simple stuff. 

But sometimes we may need to do complex filtering operations.

And sometimes we need to do some operations which we won't be able to do using just the above format.

For instance: let us say we want to filter those rows where the number of words in the movie title is greater than or equal to 4.
How would you do it? 

Trying the below will give you an error: a pandas Series has no `split` method of its own (string methods live under the `.str` accessor), so you cannot call `split` directly on a column.

In [None]:
# Deliberately failing example: a Series has no .split() method, so the expression
# raises AttributeError. The error is caught and printed so the demonstration stays
# visible without stopping a "Restart & Run All".
try:
    new_df = df[len(df['Title'].split(" "))>=4]
except AttributeError as err:
    print(type(err).__name__ + ": " + str(err))


One way is to first create a column which contains no of words in the title using apply and then filter on that column.

In [None]:
# Add a helper column holding the number of space-separated words in each title
df['num_words_title'] = df['Title'].apply(lambda title: len(title.split(" ")))
# Plain filter on the helper column: titles with at least four words
new_df = df.loc[df['num_words_title'] >= 4]
new_df.head(n=5)

And that is a perfectly fine way as long as you don't have to create a lot of columns. But, I prefer this:

In [None]:
# The apply yields one boolean per title, which is used directly as the row mask
new_df = df[df['Title'].apply(lambda title: len(title.split(" ")) >= 4)]
new_df.head(n=5)

What I did here is that my apply function returns a boolean which can be used to filter.

Now once you understand that you just have to create a column of booleans to filter, you can use any function/logic in your apply statement to get however complex a logic you want to build.

Let us see another example. I will try to do something a little complex to just show the structure.

We want to find the movies whose revenue is less than the average revenue for their particular year.

In [None]:
# Average revenue per release year as a {year: mean revenue} lookup.
# The string alias 'mean' replaces np.mean: recent pandas versions warn when a
# NumPy callable is passed to agg and will change how it is applied.
year_revenue_dict = df.groupby(['Year']).agg({'Rev_M':'mean'}).to_dict()['Rev_M']

def bool_provider(revenue, year):
    """Return True when `revenue` is below the average revenue recorded for `year`."""
    return revenue<year_revenue_dict[year]

# Row-wise boolean mask: keep movies that earned less than their year's average
new_df = df[df.apply(lambda x : bool_provider(x['Rev_M'],x['Year']),axis=1)]

new_df.head()

We have a function here which we can use to write any logic. 
That provides a lot of power for advanced filtering as long as we can play with simple variables.

####  c. Change Column Types

I even use apply to change the column types since I don't want to remember the syntax for changing column type and also since it lets me do much more complex things. 
The normal syntax to change column type is astype in Pandas. So if I had a column named price in my data in an str format. I could do this:

```df['Price'] = df['Price'].astype('int')```

But sometimes it won't work as expected. 
You might get the error: ValueError: invalid literal for long() with base 10: '13,000'. That is you cannot cast a string with "," to an int. To do that we first have to get rid of the comma. 
After facing this problem time and again, I have stopped using astype altogether now and just use apply to change column types.

```df['Price'] = df.apply(lambda x: int(x['Price'].replace(',', '')),axis=1)```

#### d. And lastly there is progress_apply

progress_apply is a single function that comes with tqdm package. 

And this has saved me a lot of time.

Sometimes when you have got a lot of rows in your data, or you end up writing a pretty complex apply function, you will see that apply might take a lot of time.

I have seen apply taking hours when working with Spacy. In such cases, you might like to see the progress bar with apply. 

You can use tqdm for that.

After the initial imports at the top of your notebook, just replace apply with progress_apply and everything remains the same.

In [None]:
from tqdm import tqdm_notebook  # name kept importable; deprecated, so no longer called here
from tqdm.auto import tqdm  # picks the notebook widget or the console bar automatically

# Register progress_apply on pandas objects. Calling the classmethod avoids
# creating the stray empty progress bar that tqdm_notebook().pandas() produced.
tqdm.pandas()

# new_df is a filtered subset of df. Compute the rating on new_df's own rows and
# attach it with assign(): this gives the same values as before without evaluating
# rows that are then discarded, and without writing into a slice of df
# (SettingWithCopyWarning).
new_df = new_df.assign(
    rating_custom=new_df.progress_apply(lambda x: custom_rating(x['Genre'],x['Rating']),axis=1)
)

In [None]:
# First five rows of the filtered frame, including the new rating_custom column
new_df.head(n=5)

In [None]:
# First five rows of the full frame for comparison
df.head(n=5)

## Groupby

In [None]:
# Find out the sum of votes and revenue by year
import numpy as np
df.groupby(['Year']).aggregate({'Votes':np.sum, 'Rev_M':np.sum}).reset_index()

In [None]:
# Multiple column groupby

In [None]:
# Sum of votes and revenue for every (Year, Genre) combination; the 'sum' string
# aliases replace np.sum, which recent pandas versions warn about in aggregate.
df.groupby(['Year','Genre']).aggregate({'Votes':'sum', 'Rev_M':'sum'}).reset_index().head()

# concat and Merge

In [None]:
# One frame per release year, to be stacked back together with concat
movies_2006 = df.loc[df['Year'].eq(2006)]
movies_2007 = df.loc[df['Year'].eq(2007)]

In [None]:
# Stack the two yearly frames on top of each other (row-wise concatenation)
movies_06_07 = pd.concat(objs=[movies_2006, movies_2007])

In [None]:
#merge

In [None]:
# Two narrow frames sharing the Title column, used to demonstrate merge
rating_dataframe = df.loc[:, ['Title', 'Rating']]
votes_dataframe = df.loc[:, ['Title', 'Votes']]

In [None]:
# First five Title/Rating rows
rating_dataframe.head(n=5)

In [None]:
# First five Title/Votes rows
votes_dataframe.head(n=5)

In [None]:
# Left join on Title: every rating row is kept and matched with its votes.
# NOTE(review): if Title is not unique, matching rows multiply - confirm against the data.
rating_vote_df = rating_dataframe.merge(votes_dataframe, on='Title', how='left')
rating_vote_df.head(n=5)

# melt

In [None]:
# Look at the wide frame before reshaping it
df.head(n=5)

In [None]:
# Collect every individual genre found in the comma-separated Genre strings
genre_set = set()
for genres in df['Genre'].unique():
    genre_set.update(genres.split(","))

# One 0/1 indicator column per genre. Membership is tested against the split list,
# not the raw string: a substring test would also flag e.g. 'Music' for a movie
# whose genre is 'Musical'. The genre is bound as a default argument so the lambda
# does not depend on the loop variable.
for genre in genre_set:
    df[genre] = df['Genre'].apply(lambda x, g=genre: 1 if g in x.split(",") else 0)

# Sort the indicator columns so the column order is the same on every run
# (set iteration order is not stable across interpreter sessions).
working_df = df[['Title','Rating', 'Votes',
       'Rev_M']+sorted(genre_set)]

working_df.head()

In [None]:
# Wide -> long: one row per (movie, genre) pair, with the 0/1 indicator in 'Flag'
reshaped_df = working_df.melt(
    id_vars=['Title','Rating','Votes','Rev_M'],
    value_vars=list(genre_set),
    var_name='Genre',
    value_name='Flag',
)

reshaped_df.head(n=5)

In [None]:
# Keep only the (movie, genre) pairs whose indicator is set
reshaped_df = reshaped_df.loc[reshaped_df['Flag'].eq(1)]
reshaped_df.head(n=5)

In [None]:
# Long -> wide again: one column per genre, summing the flags for each movie
re_reshaped_df = pd.pivot_table(
    reshaped_df,
    index=['Title','Rating','Votes','Rev_M'],
    columns='Genre',
    values='Flag',
    aggfunc='sum',
).reset_index()

re_reshaped_df.head(n=5)

In [None]:
# Genres a movie does not belong to come out of the pivot as NaN; store them as 0
re_reshaped_df = re_reshaped_df.fillna(value=0)

re_reshaped_df.head(n=5)