# Classification of Movies 

The goal is to classify each movie as animated or not animated on the basis of its crew job titles.
This task is the stated inspiration for the dataset (see the dataset's description).

First, the data is preprocessed and prepared. Then NLP features and classification models are used to accomplish the task.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

### Loading the Dataset as pandas.DataFrame

In [None]:
# Load the two TMDB CSV files into DataFrames: the credits table and the
# movie-metadata table. Both paths are relative to the notebook's working directory.
df_credits = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df_movies = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

In [None]:
# Print the column names, non-null counts and dtypes of the credits table.
df_credits.info()

In [None]:
# Print the column names, non-null counts and dtypes of the movies table.
df_movies.info()

In [None]:
# Rename 'movie_id' to 'id' so the credits table shares its key column name
# with the movies table for the merge performed later.
df_credits = df_credits.rename(columns={'movie_id': 'id'})

The columns 'title' and 'original_title' are treated as equivalent,
so the 'original_title' column is dropped.

In [None]:
# Remove the 'original_title' column from the movies table.
df_movies = df_movies.drop(columns=['original_title'])

### Merging the Datasets

Merging the 2 datasets on the columns 'id' and 'title' as primary key 

In [None]:
# Join credits and movies on the ('id', 'title') pair; the default join type
# keeps only the rows present in both tables.
df_merged = df_credits.merge(df_movies, on=['id', 'title'])

### Handling the Json Columns

Applying the literal_eval function of ast on all the json columns

In [None]:
from ast import literal_eval

# Columns whose cells hold a string representation of a list of dicts.
json_cols = [
    'cast',
    'crew',
    'genres',
    'keywords',
    'production_companies',
    'production_countries',
    'spoken_languages',
]

# Turn each string cell into the Python object it spells out.
for col in json_cols:
    parsed_column = df_merged[col].map(literal_eval)
    df_merged[col] = parsed_column

### Extracting the features from Json Columns

1. Genres list (from Genres column)
2. Jobs (from Crew column)
3. Fraction of cast members credited as voice artists, as a value between 0 and 1 (from cast column)

#### Helper Functions for the same

In [None]:
def get_genre(x):
    """Return the genre names contained in a parsed 'genres' cell.

    Args:
        x: A list of dicts, each with a 'name' key. Any other value
            (e.g. None or NaN) is treated as "no genres".

    Returns:
        A list with the 'name' value of every entry, in the original order.
        An empty list is returned when ``x`` is not a list, instead of
        failing on an unassigned local variable.
    """
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x]

def get_jobs(x):
    """Return the job titles contained in a parsed 'crew' cell.

    Args:
        x: A list of dicts, each with a 'job' key. Any other value
            (e.g. None or NaN) is treated as "no crew".

    Returns:
        A list with the 'job' value of every entry, in the original order.
        An empty list is returned when ``x`` is not a list, instead of
        failing on an unassigned local variable.
    """
    if not isinstance(x, list):
        return []
    return [member['job'] for member in x]

def get_characternames(x):
    """Return the share of cast entries whose character name contains '(voice)'.

    Args:
        x: A list of dicts, each with a 'character' key.

    Returns:
        The number of entries whose 'character' contains '(voice)' divided by
        the total number of entries, 0 for an empty list, and None when ``x``
        is not a list.
    """
    if not isinstance(x, list):
        return None

    cast_size = len(x)
    if cast_size == 0:
        return 0

    voiced = sum(1 for member in x if '(voice)' in member['character'])
    return voiced / cast_size
        
def get_labels(x):
    """Derive the binary target from a movie's genre names.

    Args:
        x: A sized container of genre names.

    Returns:
        NaN when ``x`` is empty, 1 when it contains 'Animation', 0 otherwise.
    """
    if len(x) == 0:
        # No genre information: the label is unknown.
        return np.nan
    return 1 if 'Animation' in x else 0

def get_costume_labels(x):
    """Return 1 when 'Costume Design' is contained in ``x``, otherwise 0.

    Args:
        x: A container supporting ``in``. For a list this is an exact element
            match; for a string it is a substring test.
    """
    has_costume_design = 'Costume Design' in x
    return int(has_costume_design)
    
def get_genre_cd(x):
    """Flag whether a parsed 'crew' cell lacks a 'Lighting' department entry.

    Args:
        x: A list of dicts, each with a 'department' key. Any other value
            (e.g. None or NaN) is treated as a crew with no departments.

    Returns:
        0 when at least one entry has department 'Lighting', otherwise 1.
        Non-list input yields 1 instead of failing on an unassigned local
        variable.
    """
    if not isinstance(x, list):
        return 1
    departments = [member['department'] for member in x]
    return 0 if 'Lighting' in departments else 1

In [None]:
# Replace the parsed genre dicts with plain genre names, then derive the
# remaining feature columns from the parsed crew/cast cells.
genre_names = df_merged['genres'].map(get_genre)
df_merged['genres'] = genre_names
df_merged['crew_jobs'] = df_merged['crew'].map(get_jobs)
df_merged['percent_of_voice_artists'] = df_merged['cast'].map(get_characternames)
# The target label is computed from the genre names assigned above.
df_merged['labels'] = df_merged['genres'].map(get_labels)
df_merged['costume'] = df_merged['crew_jobs'].map(get_costume_labels)
df_merged['lighting_dept'] = df_merged['crew'].map(get_genre_cd)

Rounding off the voice-artist fraction to 3 decimal places

In [None]:
# Round the whole column in one vectorised call. The previous per-row chained
# assignment (df[col][x] = ...) raises SettingWithCopyWarning and does not
# write back to the frame when pandas Copy-on-Write is enabled.
df_merged['percent_of_voice_artists'] = df_merged['percent_of_voice_artists'].round(3)

Dropping the movies whose label is missing (NaN), i.e. movies with an empty genre list

There are 28 such movies

In [None]:
# Count the rows whose label is missing.
df_merged.labels.isna().sum()

In [None]:
# Index labels of the rows whose label is neither 0 nor 1 (this includes NaN).
has_valid_label = df_merged.labels.isin([0, 1])
idxsc = df_merged.index[~has_valid_label]
# Remove those rows and renumber the remaining ones from 0.
df_merged = df_merged.drop(idxsc).reset_index(drop=True)

In [None]:
# Show the number of missing values in each column of the merged table.
df_merged.isna().sum()

In [None]:
# Size of each class in the merged table.
merged_labels = df_merged['labels']
AnimatedMoviesCount = (merged_labels == 1).sum()
NotAnimatedMoviesCount = (merged_labels == 0).sum()

print("Number of Animated Movies are: ", AnimatedMoviesCount)
print("Number of Not Animated Movies are: ", NotAnimatedMoviesCount)

In [None]:
# Frequency of each value in the 'costume' flag column.
df_merged.costume.value_counts()

In [None]:
# Frequency of each value in the 'lighting_dept' flag column.
df_merged.lighting_dept.value_counts()

In [None]:
# Row positions (not index labels) of the animated movies.
c = np.where(df_merged.labels==1)[0]
# Select by position with .iloc so the result does not rely on the index
# labels coinciding with row positions, and sum in a single vectorised call.
sum_budget = df_merged.budget.iloc[c].sum()
# Guard the empty selection instead of dividing by zero.
avg_budget = sum_budget / len(c) if len(c) else float('nan')
print("Average Budget of Animated Movie: ",str(avg_budget))

### Taking into account only those movies having more than 7 crew members

This filter is meant to improve the quality of the training data.
Multiple thresholds were tested, but 7 yielded the best result.

In [None]:
# Row positions of the movies that list more than 7 crew jobs. enumerate()
# yields positions directly, which is what .iloc below expects.
idx = [pos for pos, jobs in enumerate(df_merged.crew_jobs) if len(jobs) > 7]
print("Number of Movies with more than 7 crew members: ",str(len(idx)))

# Take an explicit copy so that assigning columns on `df` later neither warns
# about writing to a slice of df_merged nor depends on view/copy semantics.
df = df_merged.iloc[idx, :].copy()

In [None]:
# Size of each class after the crew-size filter.
filtered_labels = df['labels']
AnimatedMoviesCount2 = (filtered_labels == 1).sum()
NotAnimatedMoviesCount2 = (filtered_labels == 0).sum()

print("Number of Animated Movies are: ", AnimatedMoviesCount2)
print("Number of Not Animated Movies are: ", NotAnimatedMoviesCount2)

Converting 'crew_jobs' from a list of job titles to a single lowercase, comma-separated string via the join function

In [None]:
def join_strings(x):
    """Concatenate the strings in ``x`` into one string separated by ', '."""
    separator = ", "
    return separator.join(x)

def str_lower(x):
    """Return the result of ``x.lower()``, i.e. ``x`` converted to lowercase."""
    lowered = x.lower()
    return lowered

# Collapse each list of job titles into one comma-separated string, then
# lowercase it, so the column holds plain text.
joined_jobs = df['crew_jobs'].map(join_strings)
df['crew_jobs'] = joined_jobs.map(str_lower)

In [None]:
# Class distribution of the filtered data.
df['labels'].value_counts()

### Model to classify Movie 

Classifying a movie as animated or not based on the crew job titles (using the data prepared above)

In [None]:
# Model input: the crew-job text of each movie. Target: its 0/1 label.
X1, Y1 = df['crew_jobs'], df['labels']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified. If the classes are as imbalanced
# as the notebook text suggests, consider stratify=Y1 — confirm before changing,
# since it alters every reported score.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=0.20, random_state=53)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt 

def score_output(y_test, y_pred):
    """Print evaluation metrics and draw the confusion matrix as a heatmap.

    Prints the confusion matrix, the classification report and the accuracy
    for the given predictions, then plots the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        None.
    """
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    report = metrics.classification_report(y_test, y_pred)
    print(conf_matrix)
    print(report)
    accuracy = accuracy_score(y_test, y_pred)
    print('The Accuracy on The Test Set is: %s' % accuracy)

    # Tick labels for both axes.
    # NOTE(review): assumes the matrix rows/columns are ordered 0 then 1 — confirm.
    class_names = ['Non-Animated', 'Animated']

    ax = plt.subplot()
    # annot=True writes the value into each cell; fmt='d' formats it as an integer.
    sns.heatmap(conf_matrix, annot=True, ax=ax, fmt='d', cmap='inferno')

    # Axis labels, title and ticks.
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

In [None]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'; it is used below to obtain lemmas.
nlp = spacy.load('en_core_web_sm')

In [None]:
from spacy.lang.en import STOP_WORDS
# Run spaCy over all of its stop words at once and keep the distinct lemmas.
stop_words_str = " ".join(STOP_WORDS)
stop_words_lemma = {token.lemma_ for token in nlp(stop_words_str)}

# Extra words to be filtered out together with the stop words.
additional_words = ['editor', 'director', 'producer', 'writer', 'assistant', 'sound']
stop_words_lemma.update(additional_words)

In [None]:
def lemmatizer(text):
    """Return the lemma of every token spaCy produces for ``text``.

    Args:
        text: The string to process with the module-level ``nlp`` pipeline.

    Returns:
        A list of lemma strings, one per token, in document order.
    """
    doc = nlp(text)
    return [token.lemma_ for token in doc]

**Without Stop Words**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# TF-IDF over single words, with no stop-word filtering.
bow = TfidfVectorizer(ngram_range=(1, 1))

# Vectorise the crew-job text, then classify it with an SVC using default settings.
pipe = Pipeline(steps=[('bag_of_words', bow), ('classifier', SVC())])
pipe.fit(X_train1, y_train1)

print("Without Stop Words")
train_accuracy = pipe.score(X_train1, y_train1)
print('Training accuracy: {}'.format(train_accuracy))

# Evaluate on the held-out split.
y_pred = pipe.predict(X_test1)
score_output(y_test1, y_pred)

**With Stop Words**

In [None]:
# TF-IDF over single words, filtering out the custom stop words.
# The vectoriser's stop_words parameter accepts 'english', a list, or None;
# recent scikit-learn releases reject a set during parameter validation, so
# the set is converted to a (sorted, hence deterministic) list.
bow = TfidfVectorizer(ngram_range=(1, 1), stop_words=sorted(stop_words_lemma))

# Vectorise the crew-job text, then classify it with an SVC using default settings.
pipe2 = Pipeline([('bag_of_words', bow), ('classifier', SVC())])
pipe2.fit(X_train1, y_train1)


print("With Stop Words")
print('Training accuracy: {}'.format(pipe2.score(X_train1, y_train1)))
# Evaluate on the held-out split.
y_pred2 = pipe2.predict(X_test1)
score_output(y_test1, y_pred2)

It is evident from the above results that the SVM without stop-word removal yields better results in terms of Recall, F1 score and Accuracy (taking into account class imbalance and overfitting issues).

Thus, we can continue with SVM without stop words