# Imports

In [None]:
# Data prep
import re
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer, StandardScaler

# Models
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import balanced_accuracy_score, accuracy_score, plot_confusion_matrix, roc_auc_score

# Visualization
import matplotlib.pyplot as plt

# Data preprocessing

In [None]:
# Use the ggplot look for every figure drawn in this notebook.
plt.style.use('ggplot')

MOVIES_FILEPATH = 'data/movies_raw.csv'

# Explicit dtypes for the columns the notebook relies on. `year` is read as
# text so that rows with a non-numeric year can be detected and dropped below.
movie_d_types = {
    'year': object,
    'genre': object,
    'duration': int,
    'actors': object,
    'avg_vote': float,
    'votes': int,
    'reviews_from_users': float,
    'reviews_from_critics': float,
}

df = pd.read_csv(MOVIES_FILEPATH, header=0, dtype=movie_d_types)

print(df.describe())

# Remove features which are not used in the model
removable_features = [
    'title',
    'original_title',
    'description',
    'usa_gross_income',
    # NOTE(review): spelling presumably matches the CSV header - confirm
    # before "fixing" it.
    'worlwide_gross_income',
    # Removed because of too many different currencies and a lot of null-values.
    # Could potentially give more precise predictions.
    'budget',
    'metascore',
    'imdb_title_id',
    'language',
    'director',
    'writer',
    'production_company',
]
df = df.drop(columns=removable_features)

# Make year numeric: values that do not parse become NaN and those rows are
# dropped. The parsed values are stored so `year` is a real numeric feature
# instead of a text column. `.copy()` gives later cells an independent frame
# to assign into.
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df = df[df['year'].notna()].copy()

# Helps us ensure all columns are displayed https://stackoverflow.com/a/25415404
pd.set_option('display.expand_frame_repr', False)

# Get a summary of our dataset: one histogram per numeric column.
df.hist(figsize=(15, 15));

## Make USA_made

In [None]:
def usa_made_assign(value):
    """Return 1 if 'USA' is one of the comma-separated countries, else 0.

    Each entry is stripped before comparison so that both 'USA,France' and
    'France, USA' (comma followed by a space) are recognised.

    Args:
        value: Comma-separated country string.

    Returns:
        1 when one of the entries equals 'USA', otherwise 0.
    """
    countries = (country.strip() for country in value.split(','))
    return 1 if 'USA' in countries else 0

# Remove the few movies where country is na
df = df[df['country'].notna()].copy()

# Collapse the country list into a 0/1 flag. usa_made_assign only ever
# returns 0 or 1, so no further numeric filtering is needed afterwards.
df['country'] = df['country'].apply(usa_made_assign)

# rename() returns a new frame, so the result has to be assigned back for the
# column to actually be renamed.
df = df.rename(columns={'country': 'usa_made'})

df.head(10)

## Make season categories from date_published

In [None]:
# Matches strings that start with "YYYY-M[M]-"; the day part is optional.
# Group 1 captures the month.
_PUBLISHED_DATE_RE = re.compile(
    r"\d{4}-(0?[1-9]|1[012])-(0?[1-9]|[12][0-9]|3[01])*"
)


def season_assign(value):
    """Map a publish date to a coarse movie-season label.

    Args:
        value: Publish date, expected as a 'YYYY-MM-DD' style string.

    Returns:
        'season_summer' for May through August, 'season_winter' for November,
        December or January, and 'season_outof' for every other month and for
        any value that is not a string or does not look like a date.
    """
    # Missing values arrive as NaN (a float); treat them as "no season"
    # instead of letting the regex raise a TypeError.
    if not isinstance(value, str):
        return 'season_outof'

    match = _PUBLISHED_DATE_RE.match(value)
    if match is None:
        return 'season_outof'

    month = int(match.group(1))
    # between May and August
    if 5 <= month <= 8:
        return 'season_summer'
    # November, December or January
    if month >= 11 or month == 1:
        return 'season_winter'
    return 'season_outof'

# Replace every publish date with the label of the large movie season it falls
# in (summer or holiday season), or the out-of-season label otherwise.
df['date_published'] = df['date_published'].apply(season_assign)
df.head(1)

## Make popular_actor from actors and actor_award_nominees.txt

In [None]:
# Load the Oscar-nominated actors, one name per line, into a set of
# whitespace-stripped names.
with open('generated/oscar/actor_award_nominees.txt') as nominees_file:
    award_nomineed_actors = {name.strip() for name in nominees_file}

def has_popular_actor(actors):
    """Return 1 if any name in `actors` is an award-nominated actor, else 0.

    Args:
        actors: Set of actor names for a single movie.

    Returns:
        1 when `actors` shares at least one name with
        `award_nomineed_actors`, otherwise 0.
    """
    shared_names = actors & award_nomineed_actors
    return int(bool(shared_names))

# Convert the genre string of every movie into a list of genres. This is done
# column-wise instead of writing cells back into the frame while iterating
# over it with iterrows().
# NOTE: a missing genre stays NaN here rather than raising on .split().
df['genre'] = df['genre'].str.split(', ')

# Store the popular-actor binary value in a new column. str() turns a missing
# actors value into the text 'nan', which matches no nominee and so yields 0.
df['popular_actor'] = df['actors'].apply(
    lambda actors: has_popular_actor(set(str(actors).split(', ')))
)

# Actors no longer needed
df.drop(['actors'], axis=1, inplace=True)

df.head(20)

# Average movie rating for movies with and without a popular actor.
mean_vote_by_flag = df.groupby('popular_actor')['avg_vote'].mean()
actor_group = mean_vote_by_flag.reset_index()

# Show the two averages as a table and as a bar chart.
print(actor_group.head())
actor_group.plot.bar(
    x='popular_actor',
    y='avg_vote',
    rot=0,
    color=['gray', '#E24A38'],
    figsize=(15, 10),
)

## Target function

In [None]:

def target_func(value):
    """Binarize a rating: 1 when `value` is at least 6, otherwise 0."""
    return 1 if value >= 6 else 0

# Label every movie: 1 when avg_vote >= 6, otherwise 0 (see target_func).
df['target'] = df['avg_vote'].apply(target_func)

# Report the share of movies in the target class. It is computed from the
# target column itself so it always agrees with target_func's threshold
# (movies rated exactly 6.0 are part of the target).
target_share = df['target'].sum() / df['avg_vote'].count()
print(f'Percent of dataset in target: {round(target_share, 4)}')
# remove avg_vote. No longer needed when having target
df.drop(['avg_vote'], axis=1, inplace=True)

## One hot encode genre and season

In [None]:

# one-hot encode genre: one indicator column per genre, and a movie can have
# several of them set because its genre value is a list.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genre'])
genre_frame = pd.DataFrame(genre_matrix, index=df.index, columns=mlb.classes_)
df = df.join(genre_frame)

# one-hot encode seasons: one indicator column per season label.
lb = LabelBinarizer()
season_matrix = lb.fit_transform(df["date_published"])
season_frame = pd.DataFrame(season_matrix, index=df.index, columns=lb.classes_)
df = df.join(season_frame)

# The source columns are no longer needed once they are one-hot encoded.
df.drop(columns=['genre', 'date_published'], inplace=True)
df.head()

In [None]:
# A missing review count means the movie has no reviews, so store it as 0.
review_columns = ['reviews_from_users', 'reviews_from_critics']
df[review_columns] = df[review_columns].fillna(0)

In [None]:
# Number of missing values remaining in each column.
df.isna().sum()

## Make training and test data

In [None]:
# Take the target column out of the frame: pop() returns it as y and removes
# it from df in the same step, so the target cannot leak into X.
y = df.pop('target')
X = df

# Split into training and testing data: 25% of the entries are held out for
# testing, and the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Scale data

In [None]:
# Learn the scaling statistics from the training data only...
scaler = StandardScaler().fit(X_train)
# ...then apply that same scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## kNN Method

In [None]:
def knn_classify(n_neighbors, weights):
    """Fit a kNN classifier on the scaled training data and score it on the test data.

    Also draws a confusion matrix for the test split, normalized over the
    true labels.

    Args:
        n_neighbors: Number of neighbours passed to KNeighborsClassifier.
        weights: Weighting scheme passed to KNeighborsClassifier
            (the notebook uses 'uniform' and 'distance').

    Returns:
        Balanced accuracy on the test split, rounded to three decimals.
    """
    knn = KNeighborsClassifier(
        n_jobs=-1,
        n_neighbors=n_neighbors,
        weights=weights)

    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)

    # display_labels follow the sorted class order: 0 (rating below 6) comes
    # first, 1 (rating of 6 or more) second.
    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0
    # and removed in 1.2; ConfusionMatrixDisplay.from_estimator replaces it.
    plot_confusion_matrix(knn, X_test_scaled, y_test,
            display_labels=['Low rating', 'High rating'],
            normalize='true',
            cmap='Blues')

    return round(balanced_accuracy_score(y_test, y_pred), 3)

### kNN with default parameters

In [None]:
# Baseline run with 5 neighbours and uniform weights.
default_balanced_accuracy_knn = knn_classify(n_neighbors=5, weights='uniform')
print(f'Balanced accuracy score: {default_balanced_accuracy_knn}')

### grid search kNN

In [None]:
# Search space: every odd neighbour count from 1 to 99, combined with both
# weighting schemes.
knn_params = dict(
    n_neighbors=list(range(1, 100, 2)),
    weights=['uniform', 'distance'],
)

# 3-fold cross-validated grid search, run on all available cores.
knn = KNeighborsClassifier()
knn_grid_search_cv = GridSearchCV(
    estimator=knn,
    param_grid=knn_params,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
knn_grid_search_cv.fit(X_train_scaled, y_train)

In [None]:
# Best parameter combination found by the kNN grid search.
tuned_parameters_knn = knn_grid_search_cv.best_params_

In [None]:
# Uncomment if you wish to run the tuned knn with hardcoded tuning parameters.
# Useful if you do not wish to run GridSearchCV again.
# tuned_parameters_knn = {'n_neighbors': 59, 'weights':'distance'}

### kNN with tuned parameters

In [None]:
# Re-run kNN with the tuned parameters; the dict keys ('n_neighbors',
# 'weights') match knn_classify's parameter names.
tuned_balanced_accuracy_knn = knn_classify(**tuned_parameters_knn)

print(f'Balanced accuracy score: {tuned_balanced_accuracy_knn}')

## Decision Tree Method

In [None]:
def d_tree_classify(max_depth, min_samples_split, min_samples_leaf):
    """Fit a decision tree on the unscaled training data and score it on the test data.

    Also draws a confusion matrix for the test split, normalized over the
    true labels.

    Args:
        max_depth: Maximum tree depth; a falsy value (None or 0) means the
            depth is not limited.
        min_samples_split: Passed to DecisionTreeClassifier.
        min_samples_leaf: Passed to DecisionTreeClassifier.

    Returns:
        Balanced accuracy on the test split (not rounded).
    """
    # NOTE(review): no random_state is set here, while the grid search uses
    # random_state=42 - consider aligning the two for reproducible scores.
    d_tree = DecisionTreeClassifier(
            # None is the estimator's own "unlimited depth" default.
            max_depth=max_depth or None,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split)

    d_tree = d_tree.fit(X_train, y_train)
    dt_y_pred = d_tree.predict(X_test)

    # display_labels follow the sorted class order: 0 (rating below 6) comes
    # first, 1 (rating of 6 or more) second.
    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0
    # and removed in 1.2; ConfusionMatrixDisplay.from_estimator replaces it.
    plot_confusion_matrix(d_tree, X_test, y_test,
            display_labels=['Low rating', 'High rating'],
            normalize='true',
            cmap='Blues')

    return balanced_accuracy_score(y_test, dt_y_pred)

### Decision tree with default *parameters*

In [None]:
# Baseline run: unlimited depth, split at 2 samples, 1 sample per leaf.
default_balanced_accuracy_dt = d_tree_classify(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)
print(f'Balanced accuracy score: {default_balanced_accuracy_dt}')

### grid search Decision Tree

In [None]:
# Search space for the decision tree.
tree_params = {
   'max_depth': list(range(5,16)),
   # An integer min_samples_split must be at least 2; a value of 1 is rejected
   # by DecisionTreeClassifier and would make every such fit fail.
   'min_samples_split': list(range(2,41)),
   'min_samples_leaf': list(range(1,21))
}

# 3-fold cross-validated grid search on the unscaled training data, run on all
# available cores. The tree is seeded so the search is reproducible.
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_params, verbose=2, cv=3, n_jobs=-1)
# fit() returns the fitted GridSearchCV object itself, so d_tree refers to the
# search rather than to a single tree.
d_tree = grid_search_cv.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the decision-tree grid search.
tuned_parameters_dt = grid_search_cv.best_params_
# Uncomment to use hardcoded tuning parameters instead of re-running GridSearchCV.
# tuned_parameters_dt = {'max_depth': 11, 'min_samples_leaf': 16, 'min_samples_split':37}

### Decision tree with tuned *parameters*

In [None]:
# Re-run the decision tree with the tuned parameters.
tuned_balanced_accuracy_dt = d_tree_classify(
    max_depth=tuned_parameters_dt['max_depth'],
    min_samples_split=tuned_parameters_dt['min_samples_split'],
    min_samples_leaf=tuned_parameters_dt['min_samples_leaf'],
)
print(f'Balanced accuracy score: {tuned_balanced_accuracy_dt}')