In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import matplotlib.pyplot as plt



import nltk
import string
from wordcloud import WordCloud

from nltk.tokenize import RegexpTokenizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from collections import defaultdict

from mlxtend.plotting import plot_confusion_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Data

In [None]:
# Read the TMDB metadata and the Wikipedia plot summaries
movie_df = pd.read_csv('../input/tmdb-movies-dataset/tmdb_movies_data.csv')
wiki_df = pd.read_csv('../input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv')

# Lower-case every column name in both frames so later lookups use one spelling
for frame in (movie_df, wiki_df):
    frame.columns = frame.columns.str.lower()

## PreProcessing

In [None]:
# Columns that exist in the TMDB frame but not in the Wikipedia frame;
# only these are carried over from movie_df when the two frames are merged.
cols_to_use = movie_df.columns.difference(wiki_df.columns)

# Lower-case the title used as a join key on the TMDB side
movie_df['original_title'] = movie_df['original_title'].str.lower()

# Lower-case the Wikipedia join key and the plot text
for text_col in ('title', 'plot'):
    wiki_df[text_col] = wiki_df[text_col].str.lower()

In [None]:
# Inner-join Wikipedia plots with TMDB metadata on (title, release year),
# then discard fully duplicated rows.
df = wiki_df.merge(
    movie_df[cols_to_use],
    how='inner',
    left_on=['title', 'release year'],
    right_on=['original_title', 'release_year'],
).drop_duplicates()

In [None]:
# Preview the first two rows of the merged frame (shown as the cell output)
df.head(2)

In [None]:
# Print column names, dtypes and non-null counts of the merged frame
df.info()

In [None]:
# Ten most frequent raw genre strings in the merged frame
df['genre'].value_counts().head(10)

In [None]:
# Bin the raw genre string into a numeric code (genre_new) and a display label (genre_cat).
# Vectorised replacement for a row-by-row iterrows()/df.at loop: same values, one pass.
genre_codes = {'Comedy': 1, 'Drama': 2, 'Horror': 3, 'Thriller': 4, 'Action': 5, 'Adventure': 6}
# Only the all-lowercase and the Capitalised spelling of each genre are recognised;
# every other value (including missing genres) falls into 'Other' / 0.
genre_labels = {
    spelling: label
    for label in genre_codes
    for spelling in (label.lower(), label)
}
genre_label = df['genre'].map(genre_labels).fillna('Other')
# Stored as float to match the dtype the cell-by-cell assignment used to produce
df['genre_new'] = genre_label.map(genre_codes).fillna(0).astype(float)
df['genre_cat'] = genre_label

In [None]:
# Row counts per binned genre label (at most ten shown)
df['genre_cat'].value_counts().head(10)

# EDA

## Univariate Analysis

In [None]:
# One histogram per numeric column: (column, bin count, x-axis label, title).
# A bin count of None lets matplotlib use its default number of bins.
histogram_specs = (
    ('vote_average', None, 'Movie Rating', 'Movie Rating Distribution'),
    ('revenue_adj', 3, 'Movie Adjusted Revenue', 'Movie Adjusted Revenue Distribution'),
    ('runtime', 3, 'Movie Runtime - Minutes', 'Movie Runtime Distribution'),
)
for column, bin_count, x_label, chart_title in histogram_specs:
    plt.hist(df[column], bins=bin_count)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.title(chart_title)
    plt.show()

## Bivariate Analysis

In [None]:
# Box plot of each adjusted money column, grouped by binned genre:
# (column, plot title, y-axis label)
boxplot_specs = (
    ('budget_adj', 'Distribution of Budget by Genre', 'Adjusted Budget'),
    ('revenue_adj', 'Distribution of Revenue by Genre', 'Adjusted Revenue'),
)
for money_col, chart_title, y_label in boxplot_specs:
    fig1 = df.boxplot(column=[money_col], by='genre_cat', rot=45, figsize=(10, 5))
    plt.title(chart_title)
    # Blank out the automatic "grouped by" super-title that pandas adds
    plt.suptitle('')
    plt.ylabel(y_label)
    plt.xlabel('Genre')
    plt.show()

In [None]:
def _scatter_with_trend(data, x_col, y_col, xlabel, ylabel, title):
    """Scatter ``y_col`` against ``x_col`` and overlay a degree-1 least-squares line.

    Rows where either column is missing are left out, because ``np.polyfit``
    cannot fit through NaN values. The trend line is skipped when fewer than
    two complete rows remain.
    """
    pair = data[[x_col, y_col]].dropna()
    x_values, y_values = pair[x_col], pair[y_col]
    plt.plot(x_values, y_values, 'o', markersize=1, alpha=0.5)
    if len(pair) >= 2:
        slope, intercept = np.polyfit(x_values, y_values, 1)
        plt.plot(x_values, slope * x_values + intercept, 'red', linewidth=0.5)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


# Revenue vs budget
_scatter_with_trend(df, 'budget_adj', 'revenue_adj',
                    'Movie Adjusted Budget', 'Movie Adjusted Revenue', 'Revenue vs Budget')

# Rating vs budget (the x-axis is the budget, so the labels say budget, not revenue)
_scatter_with_trend(df, 'budget_adj', 'vote_average',
                    'Movie Adjusted Budget', 'Movie Votes Avg', 'Movie Rating vs Budget')

# Rating vs runtime
_scatter_with_trend(df, 'runtime', 'vote_average',
                    'Movie runtime', 'Movie Votes Avg', 'Movie Rating vs Runtime')

## Text Analysis

In [None]:
# Single string holding every Wikipedia plot, used for tokenising and the word clouds.
# Plots are joined with a space so the last word of one plot cannot fuse with the
# first word of the next; missing plots are skipped instead of raising on str + NaN.
list123 = ' '.join(wiki_df['plot'].dropna())

In [None]:
# Fetch the NLTK resources this notebook relies on: stop words and the tokenizer model,
# plus WordNet (for WordNetLemmatizer) and the perceptron tagger (for nltk.pos_tag),
# which later cells use.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. "punkt_tab") - confirm against the installed version.
for resource in ("stopwords", "punkt", "wordnet", "averaged_perceptron_tagger"):
    nltk.download(resource)

# Extra tokens to drop: tokenizer artefacts, roman numerals and filler words
extra_stop_tokens = [
    '--', "''", "``", "..", "...", "ii", "iii", "iv", "'s", "the", "however",
    "when", "as", "meanwhile", 'eventually',
]
#Create Stop Words Corpus
stop_words = (
    nltk.corpus.stopwords.words("english")
    + list(string.punctuation)
    + list(string.ascii_lowercase)
    + list(string.ascii_uppercase)
    + list(string.digits)
    + extra_stop_tokens
)

In [None]:
#Tokenize Words
HR1_token = nltk.word_tokenize(list123)
# Membership tests against a set are constant-time; scanning the stop-word list
# once per token is needlessly slow on a corpus this size.
stop_word_set = set(stop_words)
#Remove Stop words
hr1_filter = [w for w in HR1_token if w not in stop_word_set]
#Count Words
hr1_counter = Counter(hr1_filter)
#Word counts sorted from most to least frequent
sorted_word_counts = sorted(hr1_counter.values(), reverse=True)

In [None]:
# Ten most frequent tokens in the Wikipedia plot corpus after stop-word removal
# (shown as the cell output)
hr1_counter.most_common(10)

In [None]:
# WordNet-based lemmatizer
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
# Lemmatize every stop-word-filtered token, keeping the original order
lemmatized = list(map(wordnet_lemmatizer.lemmatize, hr1_filter))
# Frequency table of the lemmas
bow = Counter()
bow.update(lemmatized)
# Show the ten most frequent lemmas
print(bow.most_common(10))

In [None]:
# Keep only the vocabulary entries tagged as singular nouns ('NN'), with their counts.
# Each word is tagged in isolation (as its own one-word "sentence"), and
# pos_tag_sents tags the whole batch in one call instead of one pos_tag call per word.
cccc = hr1_counter.most_common()
single_word_sentences = [nltk.word_tokenize(word) for word, _count in cccc]
tagged_sentences = nltk.pos_tag_sents(single_word_sentences)
nouns = {
    word: count
    for (word, count), tags in zip(cccc, tagged_sentences)
    # tags[0] is the (token, tag) pair for the first token of the word;
    # an empty tokenisation is skipped rather than raising IndexError
    if tags and tags[0][1] == 'NN'
}

In [None]:
# Wrap the noun -> count mapping in a Counter so it can be ranked
nouns1 = Counter(nouns)
# Print the 30 most frequent nouns
top_nouns = nouns1.most_common(30)
print(top_nouns)

### Word Cloud without Stop Words of Movie Plots

In [None]:
#Set Width & Height
width = 12
height = 12
#Initialize Plot
plt.figure(figsize=(width, height))
# Generate the word cloud from the tokens joined into plain text.
# str(hr1_filter) would hand WordCloud the Python list repr (brackets, quotes, commas)
# instead of the words themselves.
wordcloud = WordCloud(width=1800, height=1400).generate(' '.join(hr1_filter))
#Show Plot
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

### Word Cloud without Stop Words of Lemmatized words in Movie Plots

In [None]:
#Set Width & Height
width = 12
height = 12
#Initialize Plot
plt.figure(figsize=(width, height))
# Generate the word cloud from the lemmas joined into plain text.
# str(lemmatized) would hand WordCloud the Python list repr (brackets, quotes, commas)
# instead of the words themselves.
wordcloud = WordCloud(width=1800, height=1400).generate(' '.join(lemmatized))
#Show Plot
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

### Word Cloud of Nouns in Movie Plots

In [None]:
#Set Width & Height
width = 12
height = 12
#Initialize Plot
plt.figure(figsize=(width, height))
# Size each noun by its corpus frequency. str(nouns1) would turn the Counter into its
# repr text, so every noun would appear once (counts lost) next to the word "Counter".
# No stopwords argument is passed: the nouns come from the already-filtered tokens.
wordcloud = WordCloud(width=1800, height=1400).generate_from_frequencies(nouns1)
#Show Plot
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Predictive Models

In [None]:
# New frame built from the original Wikipedia data. It is an explicit copy, so the
# feature columns added below are not written back into wiki_df.
df2aa = wiki_df.copy()

# Bin the raw genre string into a numeric code (genre_new) and a display label (genre_cat).
# Vectorised replacement for row-by-row iterrows()/df.at loops: same values, one pass.
genre_codes = {'Comedy': 1, 'Drama': 2, 'Horror': 3, 'Thriller': 4, 'Action': 5, 'Adventure': 6}
# Only the all-lowercase and the Capitalised spelling of each genre are recognised;
# every other value (including missing genres) falls into 'Other' / 0.
genre_labels = {
    spelling: label
    for label in genre_codes
    for spelling in (label.lower(), label)
}
genre_label = df2aa['genre'].map(genre_labels).fillna('Other')
# Stored as float to match the dtype the cell-by-cell assignment used to produce
df2aa['genre_new'] = genre_label.map(genre_codes).fillna(0).astype(float)
df2aa['genre_cat'] = genre_label
# Binary target: 1.0 for comedies, 0.0 for everything else
df2aa['Comedy'] = (genre_label == 'Comedy').astype(float)

#Assign X-Data
X3 = df2aa['plot']
#Assign Target Data
y3 = df2aa['genre_new']
y2 = df2aa['Comedy']

## Predict whether a movie plot is a comedy or not

## Naive Bayes Model

In [None]:
# Hold out a third of the plots for testing; the binary Comedy flag is the target
split_options = {'test_size': 0.33, 'random_state': 53}
X_train, X_test, y_train, y_test = train_test_split(X3, y2, **split_options)
# Bag-of-words counts: vocabulary learned on the training plots only,
# then applied unchanged to the test plots
count_vectorizer2 = CountVectorizer(stop_words='english')
count_train3 = count_vectorizer2.fit_transform(X_train.to_numpy())
count_test3 = count_vectorizer2.transform(X_test.to_numpy())

In [None]:
# Train a multinomial Naive Bayes classifier on the training word counts
nb_classifier = MultinomialNB().fit(count_train3, y_train)
# Predict the Comedy flag for the held-out plots
pred1 = nb_classifier.predict(count_test3)
# Share of held-out plots classified correctly (shown as the cell output)
metrics.accuracy_score(y_true=y_test, y_pred=pred1)

In [None]:
# Print the confusion matrix and the per-class precision/recall report
report_sections = (
    ("Confusion Matrix:", metrics.confusion_matrix(y_test, pred1)),
    ("\n\n Classification Report:\n", metrics.classification_report(y_test, pred1)),
)
for heading, content in report_sections:
    print(heading)
    print(content)

In [None]:
# Class names in the order of the confusion-matrix rows (0 = not comedy, 1 = comedy)
classes = ['Not Comedy', 'Comedy']
cm = metrics.confusion_matrix(y_test, pred1)

# Show row-normalised rates rather than absolute counts
plot_options = dict(
    class_names=classes,
    show_absolute=False,
    show_normed=True,
    colorbar=True,
)
figure, ax = plot_confusion_matrix(conf_mat=cm, **plot_options)

plt.show()

## Predict if movie plot is Comedy, Drama, Horror, Thriller, Action, Adventure, or other.

## Regression Model

In [None]:
# Hold out a third of the plots for testing; the numeric genre code is the target
split_options = {'test_size': 0.33, 'random_state': 53}
X_train, X_test, y_train, y_test = train_test_split(X3, y3, **split_options)
# Bag-of-words counts: vocabulary learned on the training plots only,
# then applied unchanged to the test plots
count_vectorizer2 = CountVectorizer(stop_words='english')
count_train3 = count_vectorizer2.fit_transform(X_train.to_numpy())
count_test3 = count_vectorizer2.transform(X_test.to_numpy())

In [None]:
# Fit an ordinary least-squares model of the numeric genre code on the word counts
linreg = LinearRegression().fit(count_train3, y_train)
# Predict on the test features
pred3 = linreg.predict(count_test3)

# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but R^2 is not:
# swapping the arguments measures variance around the predictions instead of
# around the true targets.
print('Mean squared error: %.4f'
      % metrics.mean_squared_error(y_test, pred3))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.4f'
      % metrics.r2_score(y_test, pred3))

## Naive Bayes Model

In [None]:
# Train a multinomial Naive Bayes classifier on the training word counts
nb_classifier = MultinomialNB().fit(count_train3, y_train)
# Predict the genre code for the held-out plots
pred2 = nb_classifier.predict(count_test3)
# Share of held-out plots classified correctly (shown as the cell output)
metrics.accuracy_score(y_true=y_test, y_pred=pred2)

In [None]:
# Print the confusion matrix and the per-class precision/recall report
report_sections = (
    ("Confusion Matrix:", metrics.confusion_matrix(y_test, pred2)),
    ("\n\n Classification Report:\n", metrics.classification_report(y_test, pred2)),
)
for heading, content in report_sections:
    print(heading)
    print(content)

In [None]:
# Class names listed in genre-code order 0..6
classes = ['Other', 'Comedy', 'Drama', 'Horror', 'Thriller', 'Action', 'Adventure']
cm = metrics.confusion_matrix(y_test, pred2)

# Show row-normalised rates rather than absolute counts
plot_options = dict(
    class_names=classes,
    show_absolute=False,
    show_normed=True,
    colorbar=True,
)
figure, ax = plot_confusion_matrix(conf_mat=cm, **plot_options)

plt.show()

## Recommendation System

In [None]:
# Drop movies without an overview, then renumber the rows from 0. The similarity
# matrices built later are addressed by row position, so index labels must equal
# positions for the title -> index lookups to point at the right row.
movie_df = movie_df.dropna(subset=['overview']).reset_index(drop=True)
# Create a title column mirroring original_title
movie_df['title'] = movie_df['original_title']
# American films only, renumbered from 0 for the same reason
wiki_df2 = wiki_df[wiki_df['origin/ethnicity'] == 'American'].reset_index(drop=True)

In [None]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words;
# it is fitted in the cells that follow
tfidf = TfidfVectorizer(stop_words = 'english')

In [None]:
# TF-IDF matrix of the TMDB overviews (one row per movie)
overview_corpus = movie_df['overview']
tfidf_matrix2 = tfidf.fit_transform(overview_corpus)
# Pairwise dot products of the TF-IDF rows; with a single argument the matrix
# is compared against itself
cosine_sim2 = metrics.pairwise.linear_kernel(tfidf_matrix2)

In [None]:
# Re-fit the same vectorizer on the first 10,000 American Wikipedia plots
wiki_plot_corpus = wiki_df2['plot'].iloc[:10000]
tfidf_matrix1 = tfidf.fit_transform(wiki_plot_corpus)
# Pairwise dot products of the TF-IDF rows; with a single argument the matrix
# is compared against itself
cosine_sim = metrics.pairwise.linear_kernel(tfidf_matrix1)

In [None]:
# Map each title to its row POSITION in movie_df. The similarity matrix is addressed
# by position, and index labels stop matching positions once rows have been dropped.
indices1 = pd.Series(np.arange(len(movie_df)), index=movie_df['title'])
# Keep the first row for a repeated title so a lookup always yields a single position
indices1 = indices1[~indices1.index.duplicated(keep='first')]

def get_recommendations2(title, cosine_sim, indices = indices1):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : key looked up in ``indices``.
    cosine_sim : square similarity matrix; row ``i`` holds the scores of the movie
        at position ``i`` of ``movie_df`` against every other movie.
    indices : Series mapping title -> row position in ``movie_df``.

    Returns
    -------
    Series of up to 10 ``movie_df['title']`` values, most similar first.

    Raises
    ------
    KeyError if ``title`` is not present in ``indices``.
    """
    # Get the row position of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A caller-supplied mapping may still contain repeated titles; use the first
        idx = idx.iloc[0]
    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: equally scored movies keep their original order
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly rather than assuming it is ranked first
    movie_indices = [int(pos) for pos in ranked if pos != idx][:10]
    # Return the top 10 most similar movies
    return movie_df['title'].iloc[movie_indices]

# Map each title to its row POSITION in wiki_df2. The similarity matrix is addressed
# by position, and the filtered frame's index labels need not match positions.
indices2 = pd.Series(np.arange(len(wiki_df2)), index=wiki_df2['title'])
# Keep the first row for a repeated title so a lookup always yields a single position
indices2 = indices2[~indices2.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim, indices = indices2):
    """Return the titles of the 10 Wikipedia movies most similar to ``title``.

    Parameters
    ----------
    title : key looked up in ``indices``.
    cosine_sim : square similarity matrix; row ``i`` holds the scores of the movie
        at position ``i`` of ``wiki_df2`` against every other movie in the matrix.
    indices : Series mapping title -> row position in ``wiki_df2``.

    Returns
    -------
    Series of up to 10 ``wiki_df2['title']`` values, most similar first.

    Raises
    ------
    KeyError if ``title`` is not present in ``indices``.
    IndexError if the title's position lies outside ``cosine_sim`` (the matrix may
    have been built from only a leading slice of ``wiki_df2``).
    """
    # Get the row position of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A caller-supplied mapping may still contain repeated titles; use the first
        idx = idx.iloc[0]
    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: equally scored movies keep their original order
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly rather than assuming it is ranked first
    movie_indices = [int(pos) for pos in ranked if pos != idx][:10]
    # Return the top 10 most similar movies
    return wiki_df2['title'].iloc[movie_indices]

In [None]:
# Display the overview text next to each title
movie_df.loc[:, ['overview', 'title']]

In [None]:
# Example query against the TMDB overview similarity matrix
print('Top 10 similar Movies')
get_recommendations2(title='jurassic world', cosine_sim=cosine_sim2, indices=indices1)