In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import json
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two TMDB 5000 CSV files: the movies table and the credits table.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv',
        '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv',
    )
)

In [None]:
# FLOW:
'''
1. Multi label classification problem
2. Issue of multiple genres for a movie can be resolved using binary relevance (one hot encoding the possible target values)
3. The usual data preprocessing of text data will be required here. 
4. Binary relevance of the genres is done using MultiLabelBinarizer
5. Now we get the features from our summaries (vectorization, word embedding like word2vec, glove, or elmo)
6. Since the summaries are cleaned, for the above method, we can cut down the total number of words to use based on the frequency. 
7. Also, before feature extraction, we split the data
8. The total number of distinct genres we have now as our target will determine the number of models we have. 
9. This multi-label problem will be solved using the OvR (one-vs-rest) strategy: one binary classifier per genre. 
10. After fitting binary classifiers like LogReg, SVM, or perceptron on the one hot encoded data, we will get the predictions. 
11. Using inverse transform, we will get the actual genres back. 
12. Then we find out the f1 score on the predicted data and actual data. 
13. The predicted probabilities are turned into labels using a threshold, which we can tune using k fold cv. This might even improve the f1 score.
'''

In [None]:
# IDEAS:
'''
1. Co-occurrence matrix of all the genres (as a heatmap, at least of the most frequent genres that can be visualized together)
2. We can also use a slightly customized VGGNet on the posters of the movies we have here, predict their genres, and compare the results
'''

In [None]:
# First it will be purely NLP based genre prediction in which we will not use any feature other than the ones derived from textual data. 

In [None]:
# List the column names of the movies table to see which features are available
movies.columns

In [None]:
# since this is a genre prediction task, we will need all the text data so let's clean all of that. Would genre depend on the country of production as well? And such data?
# first, homepage, id, original_language, release_date, runtime, status, vote_count won't be needed 

# some new features that can be derived: whether a movie has a homepage or not, date could be separated into month, year, week, the runtime could 
# be binned, the original language could be replaced with frequency counts. This only in the case if we want to maybe predict the movie revenue?

In [None]:
# Let's automate all the text cleaning steps and see what we get for the provided data
def cleanit(x):
    """Turn a raw text string into a list of cleaned, lemmatized tokens.

    Steps: replace every character that is not an ASCII letter or ``#`` with a
    space, lowercase, drop English stopwords, de-duplicate, lemmatize with
    WordNet, and finally keep only tokens longer than three characters.

    Parameters
    ----------
    x : str
        Raw text (e.g. a movie overview).

    Returns
    -------
    list of str
        Cleaned tokens, in order of first appearance in ``x``.
    """
    letters_only = re.sub('[^a-zA-Z#]', ' ', x).lower()
    # Build the stopword set once per call; the original rebuilt the whole
    # stopword list for every single token of the text.
    stop_words = set(stopwords.words("english"))
    # dict.fromkeys de-duplicates like set() did but keeps first-seen order, so the
    # result is reproducible across runs (iteration order of a set of str is not).
    unique_tokens = dict.fromkeys(tok for tok in letters_only.split() if tok not in stop_words)
    wordnet = WordNetLemmatizer()
    # De-duplication happens before lemmatization (as before), so two different
    # surface forms may still collapse to the same lemma in the output.
    lemmas = [wordnet.lemmatize(tok) for tok in unique_tokens]
    return [w for w in lemmas if len(w) > 3]

In [None]:
# Keep only movies that have an overview. reset_index gives the filtered frame a
# fresh 0..n-1 index (and an independent frame rather than a slice), so column
# assignments built with pd.Series([...]) further down line up row-for-row
# instead of being shifted or padded with NaN by index alignment.
movies = movies[movies['overview'].notna()].reset_index(drop=True)

In [None]:
# Clean every overview in place. Series.map keeps the frame's own index; wrapping a
# list in pd.Series() creates a fresh 0..n-1 index which, after the null filter above,
# no longer matches the frame and would assign results to the wrong rows.
movies['overview'] = movies['overview'].map(cleanit)

In [None]:
# GENRES:

# Share of movies whose genres entry is missing (inspection only)
movies['genres'].isnull().mean()
# since there are missing values in this feature which is our target, we will create a new dataframe first to split the movies dataframe
data = movies[['id', 'title', 'overview', 'genres']].dropna().copy()
# Each genres cell is a JSON array of objects carrying a "name" key. Parsing it as JSON
# keeps multi-word genres such as "Science Fiction", which the previous \w+ pattern
# silently dropped, and .map keeps the frame's index so rows stay aligned.
data['genres'] = data['genres'].map(lambda raw: [genre['name'] for genre in json.loads(raw)])
# No second dropna is needed: the mapping above cannot introduce missing values.
# an idea to wonder about: dictionary of genres might work somewhere?
# would there be any point to doing visual analysis by deriving a new column 'number_of_genres'

In [None]:
# let's create a flat list of every genre label (with repeats) and count the distinct ones
# A comprehension flattens in linear time; sum(list_of_lists, []) re-copies the
# growing accumulator at every step.
allgenres = [genre for genres in data.genres for genre in genres]
len(set(allgenres))
# the value shown above is the number of distinct genres. Let's create a frequency distribution

In [None]:
# Count how often each genre occurs (genre -> number of movies carrying it)
allgenres = nltk.FreqDist(allgenres)

allgenresdf = pd.DataFrame({'Genre': list(allgenres.keys()),
                            'Count': list(allgenres.values())})

# Sort genres by frequency; n=50 is a cap on how many bars are drawn
g = allgenresdf.nlargest(columns="Count", n = 50)
plt.figure(figsize=(12,15))
ax = sns.barplot(data=g, x= "Count", y = "Genre")
# The y axis holds the genre names; it was previously mislabelled 'Count'.
ax.set(xlabel = 'Count', ylabel = 'Genre')
plt.show()

In [None]:
# now we visualize the most frequent words across all cleaned overviews
# A comprehension flattens in linear time; sum(list_of_lists, []) re-copies the
# growing accumulator at every step, which is slow for thousands of overviews.
alltext = [word for overview in data.overview for word in overview]
fdist = nltk.FreqDist(alltext)
words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

# keep the 30 most frequent words
d = words_df.nlargest(columns="count", n = 30)

# visualize words and frequencies
plt.figure(figsize=(12,15))
ax = sns.barplot(data=d, x= "count", y = "word")
ax.set(ylabel = 'Word')
plt.show()

In [None]:
# Encode the target as a binary indicator matrix: one column per genre,
# 1 where the movie carries that genre.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# fit() learns the set of genres and returns the binarizer itself, so the
# transform can be chained onto it
y = mlb.fit(data.genres).transform(data.genres)

In [None]:
# Preview one cleaned overview with its tokens re-joined into a single string.
# This cell used to read X_train, which is only created by the train/test split
# in the following cell, so it raised NameError on a fresh Restart & Run All.
" ".join(data['overview'].iloc[6])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the movies for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['overview'], y, test_size=0.2, random_state=9
)

# The vectorizer works on strings, so each token list is glued back into one document.
# The vocabulary and IDF weights are learned from the training documents only and
# then re-used unchanged for the test documents.
tfidf = TfidfVectorizer(max_df=0.8, max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train.map(" ".join))
X_test_tfidf = tfidf.transform(X_test.map(" ".join))

In [None]:
# Number of movies in the training split
len(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper around a default logistic regression: the wrapped
# estimator is fitted once per genre column of the target matrix.
logreg = LogisticRegression()
ovr = OneVsRestClassifier(estimator=logreg)

In [None]:
# fit() returns the fitted classifier, so training and test-set prediction are chained
y_pred = ovr.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

In [None]:
# Map the binary prediction matrix back to genre names for every test movie
mlb.inverse_transform(y_pred)

In [None]:
f1_score(y_test, y_pred, average="micro")
# The f1 score is not good, so we can try changing the default threshold value of 0.5 and see whether the f1 score improves.
# Recall that f1_score is the harmonic mean of recall and precision. The formula does not matter as much as the meaning does, which is simply that
# f1_score is a combined metric: it is used when neither precision nor recall is favored over the other, so we need both.
# Why is neither precision nor recall more important than the other here? Because there are more than 2 classes, and there is no positive or negative
# class; there are just many labels we have to assign to the movies. This is what I think. You?

In [None]:
# Predicted probabilities for the test set, to be thresholded manually in the next cell
y_pred_prob = ovr.predict_proba(X_test_tfidf)

In [None]:
# Label a genre as present (1) when its predicted probability is at least 0.25, else absent (0)
y_pred_new = np.where(y_pred_prob >= 0.25, 1, 0)

In [None]:
# Micro-averaged f1 of the predictions obtained with the lowered threshold
f1_score(y_test, y_pred_new, average="micro")
# 0.56 is a good jump. 

In [None]:
# let's see how we can use KFold CV 
# INCOMPLETE

In [None]:
# NEXT WE COULD TRY: add other non textual features along with the overview to predict the genres, genre prediction using the movie posters,
# revenue prediction as well 

In [None]:
# we create a new column stating whether a movie has a homepage or not, and 
# then compare it with its revenue being high or not 
# notna() tests for a missing homepage directly; the previous check only worked
# because str(NaN) == 'nan' happens to be exactly three characters long.
movies['homepage'] = movies['homepage'].notna().astype(int)

In [None]:
# TAGLINE 

movies.tagline
# already clean, but has missing values. Do we drop those rows? Depends on the total features we end up with after the data preprocessing. 

In [None]:
# PRODUCTION COUNTRIES:

# Missing entries get the most common raw value so that every row can be parsed.
production_countries_raw = movies['production_countries'].fillna(str(movies['production_countries'].mode()[0]))

# Parse each cell as a JSON array of objects with a "name" key and collect the names.
# The previous single-quoted regex could not match double-quoted JSON text, and
# counting occurrences of the substring 'name' over-counts when a country name itself
# contains it (e.g. "Suriname").
# NOTE(review): assumes this column is JSON like `genres` (parsed with a double-quoted
# pattern earlier in the notebook) — confirm against the CSV.
country_names = production_countries_raw.map(lambda raw: [entry['name'] for entry in json.loads(raw)])

# .map keeps the frame's index, so results stay on the right rows even after rows were filtered out.
movies['number_of_production_countries'] = country_names.map(len)
# NOTE(review): fillblank is not defined in the visible part of this notebook — confirm it exists before this cell runs.
movies['production_countries'] = country_names.map(fillblank)

def ifUSA(lis):
    """Return 1 if 'United States of America' is one of the items of ``lis``, else 0.

    Every item is inspected. The previous loop returned from inside its first
    iteration, so only the first item was ever checked, and an empty input
    fell through and returned None instead of 0.

    Parameters
    ----------
    lis : iterable of str
        Country names for one movie.

    Returns
    -------
    int
        1 when the USA is listed, otherwise 0 (including for empty input).
    """
    return int(any(country == 'United States of America' for country in lis))
        
# Series.map keeps the frame's own index; wrapping a list in pd.Series() creates a fresh
# 0..n-1 index that no longer matches `movies` once rows have been filtered out.
# NOTE(review): fillblank was already applied to this column a few lines above, and it is
# not defined in the visible part of this notebook — confirm the second pass is intended.
movies['production_countries'] = movies['production_countries'].map(fillblank)
movies['USA_Producing'] = movies['production_countries'].map(ifUSA)

In [None]:
# PRODUCTION COMPANIES:

# Missing entries get the most common raw value so that every row can be parsed.
production_companies_raw = movies['production_companies'].fillna(str(movies['production_companies'].mode()[0]))

# Parse each cell as a JSON array of objects with a "name" key. The previous
# single-quoted regex could not match double-quoted JSON text and, being limited to
# [\w ]+, would also skip company names containing punctuation.
# NOTE(review): assumes this column is JSON like `genres` — confirm against the CSV.
company_names = production_companies_raw.map(lambda raw: [entry['name'] for entry in json.loads(raw)])

# .map keeps the frame's index, so results stay on the right rows.
movies['#production_companies'] = company_names.map(len)
# NOTE(review): fillblank is not defined in the visible part of this notebook — confirm it exists before this cell runs.
movies['production_companies'] = company_names.map(fillblank)

In [None]:
# GENRES:

# Missing entries get the most common raw value so that every row can be parsed.
genres_raw = movies['genres'].fillna(str(movies['genres'].mode()[0]))
# Parse the JSON array and keep every genre name, including multi-word ones such as
# "Science Fiction" that the previous \w+ pattern dropped. .map keeps rows aligned.
genre_names = genres_raw.map(lambda raw: [genre['name'] for genre in json.loads(raw)])
# genres are not arranged in alphabetical order either, so we can discard everything 
# but the primary genre 
# NOTE(review): fillblank is not defined in the visible part of this notebook — confirm it exists before this cell runs.
movies['genres'] = genre_names.map(fillblank)

# an idea to wonder about: dictionary of genres might work somewhere?
# would there be any point to doing visual analysis by deriving a new column 'number_of_genres'

In [None]:
# ORIGINAL TITLE

# what is the difference between title and original_title?
# Percentage of movies whose title is identical to original_title
round((movies['title'] == movies['original_title']).mean() * 100, 2)
# 88% of the time the original_title matches the title. Even if we have to drop one,
# which feature do we drop? Can we, say, merge them?
movies[['title', 'original_title']]
# we stick with the title
movies = movies.drop(columns='original_title')

In [None]:
# OVERVIEW

movies.overview
# already clean. It can only have stopwords removed now, be tokenized and be lemmatized
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def word_dist(w):
    """Return the distinct, non-stopword tokens of ``w`` as one space-separated string.

    The input is converted with ``str()``, lowercased and tokenized; English
    stopwords and tokens of two characters or fewer are discarded.

    Parameters
    ----------
    w : object
        Text to process; anything that is not a string is passed through ``str()`` first.

    Returns
    -------
    str
        Surviving tokens joined by single spaces, in order of first appearance.
    """
    text = str(w).lower()
    stop_words = set(stopwords.words('english'))
    # dict.fromkeys de-duplicates like set() did but keeps first-seen order, so the
    # joined string is identical on every run (iteration order of a set of str is not).
    unique_tokens = dict.fromkeys(word_tokenize(text))
    return " ".join(tok for tok in unique_tokens if tok not in stop_words and len(tok) > 2)

# Series.map keeps the frame's own index; wrapping a list in pd.Series() creates a fresh
# 0..n-1 index that no longer matches `movies` once rows have been filtered out.
movies['overview'] = movies['overview'].map(word_dist)

In [None]:
# POPULARITY
# popularity is multiplied by 100, rounded to 2 decimals and then truncated to an
# integer, i.e. it ends up stored in whole hundredths of the original value
popularity_scaled = (movies['popularity'] * 100).round(2)
movies['popularity'] = popularity_scaled.astype(int)
# just not sure if this feature will be used or not 

In [None]:
# RELEASE DATE

release_dates = pd.to_datetime(movies['release_date'])
# mode() returns a Series, so take its first value to hand fillna a scalar. Passing the
# Series itself makes fillna align on index labels, which leaves nearly every gap unfilled.
release_dates = release_dates.fillna(release_dates.mode()[0])

# .dt.quarter maps months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3 and 10-12 -> 4, the same bins as the
# former hand-written loop, and the result keeps the frame's index so rows stay aligned.
movies['release_month_quarter'] = release_dates.dt.quarter
movies = movies.drop('release_date', axis = 1)

In [None]:
# ORIGINAL LANGUAGE:

# need to check the variations in original_language (share of movies per language, in %)
(movies['original_language'].value_counts() / len(movies.index) * 100).round(2)
# 85% + movies are of English origin. We could either remove this feature or
# rather derive a simple new feature where the value is 1 if the original language is
# English else 0
total_revenue = sum(movies.revenue)
revenue_by_language = movies[['original_language', 'revenue']].groupby('original_language').sum()
revenue_percentage_m_train = revenue_by_language / total_revenue
(revenue_percentage_m_train * 100).round(2).sort_values('revenue', ascending = False)
# 96% of the revenue is generated from movies with their original_language as 'en'
# we can either drop the other languages or just drop the whole feature itself
movies = movies.drop(columns=['original_language'])

In [None]:
# SPOKEN LANGUAGES:

# Missing entries get the most common raw value. mode() returns a Series, so its first
# element is taken; str(mode()) would stringify the whole Series (index and dtype
# footer included) and use that as the fill value.
spoken_languages_raw = movies['spoken_languages'].fillna(str(movies['spoken_languages'].mode()[0]))

# Parse each cell as a JSON array of objects with a "name" key. The previous
# single-quoted regex could not match double-quoted JSON text.
# NOTE(review): assumes this column is JSON like `genres` — confirm against the CSV.
language_names = spoken_languages_raw.map(lambda raw: [lang['name'] for lang in json.loads(raw)])

# Both results share the frame's index, so rows stay aligned.
movies['number_of_spoken_languages'] = language_names.map(len)
movies['spoken_languages'] = language_names
def has_english(lis):
    """Return 1 if 'English' is one of the items of ``lis``, else 0.

    Every item is inspected. The previous loop returned from inside its first
    iteration, so only the first item was ever checked, and an empty input
    fell through and returned None instead of 0.

    Parameters
    ----------
    lis : iterable of str
        Spoken-language names for one movie.

    Returns
    -------
    int
        1 when English is listed, otherwise 0 (including for empty input).
    """
    return int(any(language == 'English' for language in lis))

# Series.map keeps the frame's own index; wrapping a list in pd.Series() creates a fresh
# 0..n-1 index that no longer matches `movies` once rows have been filtered out.
# NOTE(review): fillblank is not defined in the visible part of this notebook — confirm it exists before this cell runs.
movies['spoken_languages'] = movies['spoken_languages'].map(fillblank)
movies['has_english'] = movies['spoken_languages'].map(has_english)

In [None]:
# now we can divide the datasets into the following specificities at a later time: 
'''
1. Dataset with features that will help us make a predictive model for revenue prediction: mpm
2. A complete textual dataset to apply NLP on to predict the genres: mgp
3. A dataset where we import data from external source to get poster path so we can do a CNN genre prediction as well: mgpp
4. A recommender: mr (will be using all kinds of filtering: demographic, content based and collaborative)
'''

In [None]:
import copy
movies.columns
# Four independent deep copies of the prepared frame, one per planned task:
# mpm (revenue model), mgp (NLP genre prediction), mgpp (poster-based genre
# prediction) and mr (recommender).
mpm, mgp, mgpp, mr = (movies.copy(deep=True) for _ in range(4))

In [None]:
# First we will focus on the mgp for genre prediction using NLP