# Import libraries and read data

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import scipy.stats

In [3]:
# Read data

data = pd.read_csv("oscar.csv")
print(data.shape)
display(data.head())

(348, 38)


Unnamed: 0,year_film,film,award_year,imdb_rating,rt_rating,rt_audience_score,rt_status,runtime_mins,content_rating,bafta_nom,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
0,1959,The Nun's Story,1960,7.5,94.0,82.0,fresh,149.0,NR,1,...,0,0,0,0,0,0,0,0,0,0
1,1959,Anatomy of a Murder,1960,8.0,100.0,90.0,certified-fresh,161.0,G,1,...,0,0,0,0,0,0,0,0,0,0
2,1959,Ben-Hur,1960,8.1,86.0,89.0,certified-fresh,212.0,G,1,...,0,0,0,0,0,0,1,0,0,0
3,1959,The Diary of Anne Frank,1960,7.4,79.0,77.0,fresh,180.0,PG,0,...,0,0,0,0,0,0,0,1,0,0
4,1959,Room at the Top,1960,7.5,100.0,82.0,fresh,115.0,NR,1,...,0,1,0,0,0,0,0,0,0,0


In [4]:
#data.dtype

In [4]:
# Change data types appropriately

data = data.astype(object)
data = data.astype({'Oscarstat_totalnoms':'float64', 'imdb_rating':'float64', 'rt_rating':'float64', 'rt_audience_score':'float64', 'runtime_mins':'float64'})
data.rename(columns={'Oscarstat_totalnoms': 'total_oscar_noms'}, inplace=True)

# Send and Retrieve data from SQL

In [6]:
import pymysql                       
from sqlalchemy import create_engine 
from getpass import getpass  
password = getpass()

········


In [11]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/sakila'
engine = create_engine(connection_string)

In [12]:
data.to_sql(name='oscars', con=engine, schema=None, if_exists='replace',index=False, index_label=None,
           chunksize=None, dtype=None, method=None)

348

In [13]:
sqldata = pd.read_sql_query('SELECT * FROM sakila.oscars', engine)
sqldata.head() 

Unnamed: 0,year_film,film,award_year,imdb_rating,rt_rating,rt_audience_score,rt_status,runtime_mins,content_rating,bafta_nom,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
0,1959,The Nun's Story,1960,7.5,94.0,82.0,fresh,149.0,NR,1,...,0,0,0,0,0,0,0,0,0,0
1,1959,Anatomy of a Murder,1960,8.0,100.0,90.0,certified-fresh,161.0,G,1,...,0,0,0,0,0,0,0,0,0,0
2,1959,Ben-Hur,1960,8.1,86.0,89.0,certified-fresh,212.0,G,1,...,0,0,0,0,0,0,1,0,0,0
3,1959,The Diary of Anne Frank,1960,7.4,79.0,77.0,fresh,180.0,PG,0,...,0,0,0,0,0,0,0,1,0,0
4,1959,Room at the Top,1960,7.5,100.0,82.0,fresh,115.0,NR,1,...,0,1,0,0,0,0,0,0,0,0


# Modeling

### Import sklearn train_test_split and separate the data

In [5]:
from sklearn.model_selection import train_test_split

# Get X and Y
X = data.drop(['oscar_winner', 'year_film', 'award_year', 'film'],axis=1)
y = data['oscar_winner']
y = y.astype('int')

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

### Separate X_train and X_test into numerical and categorical 

In [6]:
# Numericals split
X_train_num = X_train.select_dtypes(np.number)
X_test_num = X_test.select_dtypes(np.number)
display(X_train_num.head())

# Categoricals split
X_train_cat = X_train.select_dtypes(object)
X_test_cat = X_test.select_dtypes(object)
display(X_train_cat.head())
X_train_cat.shape

Unnamed: 0,imdb_rating,rt_rating,rt_audience_score,runtime_mins,total_oscar_noms
146,6.8,84.0,67.0,113.0,5.0
207,7.9,97.0,86.0,120.0,10.0
205,8.5,77.0,87.0,155.0,12.0
120,7.3,84.0,82.0,92.0,4.0
294,8.2,80.0,83.0,180.0,5.0


Unnamed: 0,rt_status,content_rating,bafta_nom,bafta_win,sag_nom,sag_win,gg_nom,gg_win,Nom_DGA,Win_DGA,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
146,certified-fresh,R,0,0,DE,DE,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
207,certified-fresh,PG-13,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
205,certified-fresh,R,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,1,0,0,0
120,fresh,PG,0,0,DE,DE,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
294,certified-fresh,R,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


(208, 29)

### Use X_train_num to fit scalers. Transform both X_train_num and X_test_num.

In [7]:
from sklearn.preprocessing import MinMaxScaler

transformer = MinMaxScaler().fit(X_train_num)
X_train_num_standardized = transformer.transform(X_train_num)
X_train_num_standardized = pd.DataFrame(X_train_num_standardized, columns=X_train_num.columns, index = X_train_num.index)
display(X_train_num_standardized.head())

X_test_num_standardized = transformer.transform(X_test_num)
X_test_num_standardized = pd.DataFrame(X_test_num_standardized, columns=X_test_num.columns, index = X_test_num.index)
display(X_test_num_standardized.head())


Unnamed: 0,imdb_rating,rt_rating,rt_audience_score,runtime_mins,total_oscar_noms
146,0.241379,0.774648,0.25,0.173228,0.307692
207,0.62069,0.957746,0.725,0.228346,0.692308
205,0.827586,0.676056,0.75,0.503937,0.846154
120,0.413793,0.774648,0.625,0.007874,0.230769
294,0.724138,0.71831,0.65,0.700787,0.307692


Unnamed: 0,imdb_rating,rt_rating,rt_audience_score,runtime_mins,total_oscar_noms
267,0.482759,0.901408,0.7,0.023622,0.307692
104,0.586207,0.830986,0.8,0.110236,0.615385
174,0.482759,0.887324,0.725,0.23622,0.538462
0,0.482759,0.915493,0.625,0.456693,0.538462
333,0.517241,0.859155,0.55,0.354331,0.461538


### Encode the categorical variables X_train_cat and X_test_cat 

In [8]:
hot_train_cat = X_train_cat[["rt_status","content_rating","sag_nom", "sag_win"]]
hot_test_cat = X_test_cat[["rt_status","content_rating","sag_nom", "sag_win"]]

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', handle_unknown='error').fit(hot_train_cat)
hot_train_encoded = encoder.transform(hot_train_cat).toarray()
hot_test_encoded = encoder.transform(hot_test_cat).toarray()

cols = encoder.get_feature_names_out(input_features=hot_train_cat.columns)
hot_train_cat = pd.DataFrame(hot_train_encoded, columns=cols, index = hot_train_cat.index)
hot_test_cat = pd.DataFrame(hot_test_encoded, columns=cols, index = hot_test_cat.index)

In [9]:
display(hot_train_cat.head())

Unnamed: 0,rt_status_fresh,rt_status_rotten,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,sag_nom_1,sag_nom_DE,sag_win_1,sag_win_DE
146,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
207,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
205,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
120,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
294,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:
# No need to encode as data already in 1 and 0
other_train_cat = X_train_cat.drop(["rt_status","content_rating","sag_nom","sag_win"], axis=1).astype("float64").copy()
other_test_cat = X_test_cat.drop(["rt_status","content_rating","sag_nom","sag_win"], axis=1).astype("float64").copy()

In [11]:
display(other_train_cat.head())

Unnamed: 0,bafta_nom,bafta_win,gg_nom,gg_win,Nom_DGA,Win_DGA,Genre_action,Genre_biography,Genre_crime,Genre_comedy,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
146,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
205,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
120,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
294,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Concatenate all data

In [12]:
# Concatenate numericals with categoricals

X_train = pd.concat((X_train_num_standardized,hot_train_cat,other_train_cat),axis=1)
X_test = pd.concat((X_test_num_standardized,hot_test_cat,other_test_cat),axis=1)

display(X_train.head())

Unnamed: 0,imdb_rating,rt_rating,rt_audience_score,runtime_mins,total_oscar_noms,rt_status_fresh,rt_status_rotten,content_rating_NR,content_rating_PG,content_rating_PG-13,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
146,0.241379,0.774648,0.25,0.173228,0.307692,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207,0.62069,0.957746,0.725,0.228346,0.692308,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
205,0.827586,0.676056,0.75,0.503937,0.846154,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
120,0.413793,0.774648,0.625,0.007874,0.230769,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
294,0.724138,0.71831,0.65,0.700787,0.307692,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [14]:
# Model

LR = LogisticRegression(solver='saga',multi_class='multinomial',
                        max_iter=1000).fit(X_train, y_train) #fit model on training data

In [15]:
# Model Validation

predictions = LR.predict(X_test)
LR.score(X_test, y_test)

0.9357142857142857

In [16]:
# Confusion Matrix

confusion_matrix(y_test, predictions)

array([[116,   6],
       [  3,  15]])

In [17]:
# Calculating precision and recall

precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
 
print('Precision: ',precision)
print('Recall: ',recall)

Precision:  0.7142857142857143
Recall:  0.8333333333333334


# Bonus: Use model to predict 2023 oscars winner

In [18]:
# Read data

data2 = pd.read_csv("oscar2023.csv")
data2.head()

Unnamed: 0,year_film,film,award_year,imdb_rating,rt_rating,rt_audience_score,rt_status,runtime_mins,content_rating,bafta_nom,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
0,2022,Top Gun: Maverick,2023,8.3,96,99,certified-fresh,130,PG-13,0,...,0,0,0,0,0,0,0,0,0,0
1,2022,Triangle of Sadness,2023,7.4,71,68,fresh,147,R,0,...,0,0,0,0,0,0,0,0,0,0
2,2022,Elvis,2023,7.3,77,94,certified-fresh,160,PG-13,1,...,1,0,0,0,0,0,0,0,0,0
3,2022,Women Talking,2023,7.0,90,80,certified-fresh,104,PG-13,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,Everything Everywhere All at Once,2023,7.9,94,86,certified-fresh,139,R,1,...,0,0,0,0,0,0,1,0,0,0


In [19]:
# Change data types appropriately

data2 = data2.astype(object)
data2 = data2.astype({'Oscarstat_totalnoms':'float64', 'imdb_rating':'float64', 'rt_rating':'float64', 'rt_audience_score':'float64', 'runtime_mins':'float64'})
data2.rename(columns={'Oscarstat_totalnoms': 'total_oscar_noms'}, inplace=True)

In [20]:
# Get X and Y

X = data2.drop(['oscar_winner', 'year_film', 'award_year', 'film'],axis=1)
y = data2['oscar_winner']
y = y.astype('int')

In [21]:
# Numericals split
X_num = X.select_dtypes(np.number)
display(X_num.head())
print(X_num.shape)

# Categoricals split
X_cat = X.select_dtypes(object)
display(X_cat.head())
print(X_cat.shape)

Unnamed: 0,imdb_rating,rt_rating,rt_audience_score,runtime_mins,total_oscar_noms
0,8.3,96.0,99.0,130.0,6.0
1,7.4,71.0,68.0,147.0,3.0
2,7.3,77.0,94.0,160.0,8.0
3,7.0,90.0,80.0,104.0,2.0
4,7.9,94.0,86.0,139.0,11.0


(10, 5)


Unnamed: 0,rt_status,content_rating,bafta_nom,bafta_win,sag_nom,sag_win,gg_nom,gg_win,Nom_DGA,Win_DGA,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
0,certified-fresh,PG-13,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,fresh,R,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,certified-fresh,PG-13,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,certified-fresh,PG-13,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,certified-fresh,R,1,0,1,1,1,0,1,1,...,0,0,0,0,0,0,1,0,0,0


(10, 29)


In [22]:
# Normalize numericals

X_normalized = transformer.transform(X_num)
X_normalized = pd.DataFrame(X_normalized,columns=X_num.columns)

In [23]:
# Encode categoricals

hot_cat = X_cat[["rt_status","content_rating","sag_nom","sag_win"]].astype(str)
hot_encoded = encoder.transform(hot_cat).toarray()
cols = encoder.get_feature_names_out(input_features=hot_cat.columns)
hot_cat = pd.DataFrame(hot_encoded, columns=cols, index = hot_cat.index)

#  No need to encode as data already in 1 and 0

other_cat = X_cat.drop(["rt_status","content_rating","sag_nom","sag_win"], axis=1).astype("float64").copy()

In [24]:
# Merge data

X = pd.concat([X_normalized,hot_cat,other_cat], axis=1)  
X.head()

Unnamed: 0,imdb_rating,rt_rating,rt_audience_score,runtime_mins,total_oscar_noms,rt_status_fresh,rt_status_rotten,content_rating_NR,content_rating_PG,content_rating_PG-13,...,Genre_music,Genre_romance,Genre_history,Genre_war,Genre_filmnoir,Genre_thriller,Genre_adventure,Genre_family,Genre_sport,Genre_western
0,0.758621,0.943662,1.05,0.307087,0.384615,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.448276,0.591549,0.275,0.440945,0.153846,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.413793,0.676056,0.925,0.543307,0.538462,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.310345,0.859155,0.575,0.102362,0.076923,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.62069,0.915493,0.725,0.377953,0.769231,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [25]:
prediction2023 = LR.predict(X)
LR.score(X, y)  #test the model on new data

1.0

In [26]:
confusion_matrix(y, prediction2023)

array([[9, 0],
       [0, 1]])

#### Our model predicted the correct 2023 winner!