## Task One
### Can we categorize the films by type (animated or not) using the crew's job titles?

### Separating the movies into animated and non-animated, collecting all jobs found in each group, and using logistic regression for modelling


In [None]:
import ast
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use(style="seaborn")
%matplotlib inline

In [None]:
# Directory holding the TMDB 5000 CSV files; change it here when running
# outside the environment this relative path was written for.
DATA_DIR = "../input/tmdb-movie-metadata"

df_movie = pd.read_csv(f"{DATA_DIR}/tmdb_5000_movies.csv")
df_credit = pd.read_csv(f"{DATA_DIR}/tmdb_5000_credits.csv")

In [None]:
def extract_feature(x, field='name'):
    """
    Parse a string-encoded list of records and return the lower-cased
    values of one field.

    Parameters
    ----------
    x : str
        Text encoding a list of dicts, e.g. '[{"id": 16, "name": "Animation"}]'.
    field : str, default 'name'
        Key to read from every dict.

    Returns
    -------
    list of str
        ``record[field].lower()`` for every record, in input order.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is neither valid JSON nor a valid Python literal.
    KeyError
        If a record has no ``field`` key.
    """
    try:
        # The input is JSON text, so try a JSON parser first: unlike
        # ast.literal_eval it understands null / true / false.
        records = json.loads(x)
    except (TypeError, ValueError):
        # Fall back for Python-literal text (e.g. single-quoted strings);
        # errors for unparsable input are raised from here as before.
        records = ast.literal_eval(x)
    return [record[field].lower() for record in records]

def find_animation(x):
    '''
    Flag whether a list of genre names contains the animation genre.

    The comparison ignores case. Returns 1 when any entry equals
    'animation' and 0 otherwise (including for an empty list).
    '''
    return 1 if any(genre.lower() == 'animation' for genre in x) else 0

# Preprocessing

In [None]:
# Preview the first rows of the raw movie table
df_movie.head()

In [None]:
# Row counts of the movie table and the credit table
df_movie.shape[0], df_credit.shape[0]

In [None]:
# Parse the raw 'genres' text into a list of lower-cased genre names per movie
df_movie['list_genres'] = df_movie['genres'].map(extract_feature)
df_movie['list_genres']

In [None]:
# Label each movie: 1 when its genre list contains 'animation', else 0
df_movie['animated'] = df_movie['list_genres'].map(find_animation)

In [None]:
# First four movies, transposed so that every column of df_movie is shown as a row
df_movie.head(4).T

In [None]:
# Keep only the relevant columns
movie_columns = ['id', 'original_title','list_genres','animated']
df_movie = df_movie[movie_columns]
df_movie.head()

In [None]:
# Preview the first rows of the raw credit table
df_credit.head()

In [None]:
# Collect the lower-cased 'job' value of every crew record, per movie
df_credit['alljobs'] = df_credit['crew'].apply(lambda crew: extract_feature(crew, field='job'))
df_credit.head()

In [None]:
# Join movies to their credits: movie 'id' matches credit 'movie_id'
df = df_movie.merge(df_credit, left_on='id', right_on='movie_id')
df.head()

In [None]:
# Keep only the columns used in the rest of the analysis
useful_columns = ['title','list_genres','animated','alljobs']
df = df[useful_columns]
df.head()

Below we can see there are 234 animated movies and 4569 non-animated movies.

In [None]:
# Number of movies per label (1 = animated, 0 = non-animated)
df['animated'].value_counts()

In [None]:
# save df
# df.to_csv('taskone.csv',index=False)

In [None]:
# Unique job titles across all movies.
# Sorted so the order is reproducible between kernel restarts: iterating a set
# of strings follows hash order, which varies with Python's hash randomization.
jobs = sorted({job.lower() for movie_jobs in df['alljobs'] for job in movie_jobs})
total_jobs = len(jobs)
print('There are',total_jobs,'different jobs in this dataset')

# Master list of job titles, kept as its own list object so that later
# rebinding or mutation of `jobs` cannot change it.
mjobs = list(jobs)

In [None]:
# Separate animated and non-animated movies into different dataframes.
# Boolean masks avoid building the groupby twice and yield an empty frame
# (instead of a KeyError) if one of the classes has no rows.
animated = df[df['animated'] == 1].reset_index(drop=True)
others = df[df['animated'] == 0].reset_index(drop=True)

In [None]:
# Independent working copies, so columns added later do not touch the originals
animated_copy, non_animated_copy = animated.copy(), others.copy()

In [None]:
# Every crew job title occurring in animated movies (with repeats),
# followed by the de-duplicated list
animated_jobs = [title.lower() for crew in animated['alljobs'] for title in crew]
ajobs = list(set(animated_jobs))
total_ajobs = len(ajobs)
print('There are',total_ajobs,'job titles for animated movies')

In [None]:
# Every crew job title occurring in non-animated movies (with repeats),
# followed by the de-duplicated list
other_jobs = [title.lower() for crew in others['alljobs'] for title in crew]
ojobs = list(set(other_jobs))
total_ojobs = len(ojobs)
print('There are',total_ojobs,'job titles for non-animated movies')

In [None]:
# Job titles that occur in both groups (animated and non-animated movies)
common_jobs = set(ojobs) & set(ajobs)
total_comm_jobs = len(common_jobs)
print('There are',total_comm_jobs,'job titles common to both animated and non_animated movies')

In [None]:
# For every animated movie, keep the titles from `ajobs` that appear in its
# crew list (in `ajobs` order) and record how many there are
animation_jobs = []
for crew_jobs in animated_copy['alljobs']:
    crew_lookup = set(crew_jobs)
    animation_jobs.append([title for title in ajobs if title in crew_lookup])
job_count = [len(found) for found in animation_jobs]

animated_copy['animation_jobs'] = animation_jobs
animated_copy['animation_job_count'] = job_count
animated_copy.head()

In [None]:
# Note: some of the titles in `ajobs` also occur in `ojobs` (non-animation);
# this is the size of that overlap
len(set(ajobs) & set(ojobs))

In [None]:
# For every movie, keep the titles from `common_jobs` that appear in its
# crew list and record how many there are
com_df = df.copy()
com_jobs = []
for crew_jobs in com_df['alljobs']:
    crew_lookup = set(crew_jobs)
    com_jobs.append([title for title in common_jobs if title in crew_lookup])
job_count = [len(found) for found in com_jobs]

com_df['common_jobs'] = com_jobs
com_df['common_job_count'] = job_count
com_df.head()

In [None]:
# Number of movies per label (1 = animated, 0 = non-animated)
df['animated'].value_counts()

In [None]:
# For every non-animated movie, keep the titles from `ojobs` that appear in
# its crew list (in `ojobs` order) and record how many there are
non_animation_jobs = []
for crew_jobs in non_animated_copy['alljobs']:
    crew_lookup = set(crew_jobs)
    non_animation_jobs.append([title for title in ojobs if title in crew_lookup])
job_count = [len(found) for found in non_animation_jobs]

non_animated_copy['non_animation_jobs'] = non_animation_jobs
non_animated_copy['non_animation_job_count'] = job_count
non_animated_copy.head()

In [None]:
# Distribution of the per-movie job-title count among animated movies
sns.set(rc={'figure.figsize':(18.7,8.27)})
ax = sns.countplot(data=animated_copy, x='animation_job_count')
ax.set(title='number of animated movies vs number of animation jobs',
       ylabel='number of animated movies');

From the figure above, no animated movie made use of all 238 job titles; the maximum number used by a single movie was 102. In other words, an animated movie needs at most 102 of these job titles.

# Modelling

In [None]:
# Preview of the concatenation on three movies per class. Each frame only has
# its own jobs/count columns, so the other class's columns come out as NaN.
# `keys` takes one label per concatenated frame (not column names); the labels
# become the outer level of the row index.
# The preview uses its own names so that `df` is not replaced by a 6-row sample.
preview_adf = animated_copy[['title','animated','animation_jobs','animation_job_count']].head(3)
preview_nadf = non_animated_copy[['title','animated','non_animation_jobs','non_animation_job_count']].head(3)
preview_df = pd.concat([preview_adf, preview_nadf], keys=['animated', 'non_animated'])
preview_df

In [None]:
# Stack animated movies on top of non-animated ones. Columns that exist in
# only one of the two frames are filled with NaN for the other frame's rows.
adf = animated_copy[['title','animated','animation_jobs','animation_job_count']]
nadf = non_animated_copy[['title','animated','non_animation_jobs','non_animation_job_count']]

# `keys` takes one label per concatenated frame (not column names); the labels
# become the outer level of the row index, which keeps the row labels unique.
df = pd.concat([adf, nadf], keys=['animated', 'non_animated'])
df

In [None]:
# Label counts after the concatenation (1 = animated, 0 = non-animated)
print(df['animated'].value_counts())

In [None]:
# One-hot encode the job titles of the animated movies.
# Rows coming from the non-animated frame carry no list in 'animation_jobs',
# so select the rows that do hold a list explicitly instead of relying on a
# TypeError to end the loop.
has_animation_jobs = df['animation_jobs'].apply(lambda cell: isinstance(cell, list))

# For every selected row, put a 1 in the column named after each of its job
# titles (the column is created the first time a title is seen).
for index, row in df[has_animation_jobs].iterrows():
    for job in row['animation_jobs']:
        df.loc[index, job] = 1

# Everything still missing (titles a movie does not have, and the jobs/count
# columns of the other class) becomes 0.
df = df.fillna(0)
df.head()

In [None]:
# df.iloc[233]

In [None]:
# One-hot encode the job titles of the non-animated movies.
# Select the rows that actually hold a list in 'non_animation_jobs' rather
# than assuming they start at a fixed row offset.
has_non_animation_jobs = df['non_animation_jobs'].apply(lambda cell: isinstance(cell, list))

# For every selected row, put a 1 in the column named after each of its job
# titles (the column is created the first time a title is seen).
for index, row in df[has_non_animation_jobs].iterrows():
    for job in row['non_animation_jobs']:
        df.loc[index, job] = 1

# Fill the gaps once, after the loop, instead of re-filling the whole frame
# on every row.
df = df.fillna(0)
df.head()

In [None]:
# Feature matrix: every one-hot job column, i.e. everything except the
# descriptive columns carried over from the concatenation. Dropping them by
# name keeps this correct even if the column order changes.
non_feature_columns = ['title', 'animated',
                       'animation_jobs', 'animation_job_count',
                       'non_animation_jobs', 'non_animation_job_count']
train_df = df.drop(columns=non_feature_columns).reset_index(drop=True)
train_df.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Imported under an alias so that the name `confusion_matrix` only ever refers
# to the resulting matrix, never to the function.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix
from sklearn.model_selection import train_test_split

# Features: one-hot job columns; target: the 0/1 'animated' label
x = train_df
y = df[['animated']].reset_index(drop=True)

# 70/30 split with a fixed seed so the reported numbers are reproducible.
# NOTE(review): the classes are imbalanced (see the value counts above);
# consider stratify=np.ravel(y) - not changed here to keep results comparable.
x_train, x_test, y_train, y_test = train_test_split(x, np.ravel(y), test_size=0.30, random_state=1)

lreg = LogisticRegression(C=10, max_iter=2000)
clf = lreg.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print('Accuracy {:.2f}'.format(clf.score(x_test, y_test)))

print(classification_report(y_test, y_pred))

confusion_matrix = compute_confusion_matrix(y_test, y_pred)

In [None]:
# Show the confusion matrix assigned in the modelling cell
# (scikit-learn convention: rows are true classes, columns are predicted classes)
print(confusion_matrix)

In [None]:
# Predict the label of a single movie from its job titles.
# The one-row frame is built from the columns the model was fitted on, in the
# same order, so every value lines up with the right feature; titles of the
# movie that the model never saw are ignored.
val = non_animated_copy['alljobs'][2]
movie_jobs = set(val)

feature_columns = x_train.columns
testdf = pd.DataFrame(
    [[1 if column in movie_jobs else 0 for column in feature_columns]],
    columns=feature_columns,
)

clf.predict(testdf)[0]

Recall that 0 stands for non-animated and 1 for animated movies.

In [None]:
sns.set(rc={'figure.figsize':(10.7,6.27)})

# Classification report as a nested dict, with readable class names
clf_report = classification_report(
    y_test,
    y_pred,
    labels=[0,1],
    target_names=['non_animated','animated'],
    output_dict=True,
)

# Drop the last row of the report table, then transpose so that each
# report entry becomes one heatmap row
report_table = pd.DataFrame(clf_report).iloc[:-1, :].T
clf_plot = sns.heatmap(report_table, annot=True)
clf_plot.set_title('Logistic regression classification report')


In [None]:
# Bind the figure to its own name so that `clf_plot` keeps referring to the
# heatmap axes, whether or not this cell has been run.
clf_fig = clf_plot.get_figure()

clf_fig.savefig("log_reg_clf_rep.png")