# ESRB Rating Classification

Coded by Luna McBride

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt #Plotting
%matplotlib inline

#Set the notebook-wide plot defaults in one go: a bigger figure and a fixed line width
plt.rcParams.update({
    "figure.figsize": (10,10),
    "lines.linewidth": 2,
})
plt.style.use("ggplot") #Apply the ggplot look on top of those defaults

from sklearn.model_selection import train_test_split #Split the data into train and test
from sklearn.ensemble import RandomForestClassifier #Forest for prediction and regression
from sklearn.metrics import mean_squared_error #Error testing
from sklearn.metrics import classification_report #Report of Classification

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Read the training and testing CSVs in the same pass; the first path is training, the second is testing
esrbTrain, esrbTest = (
    pd.read_csv(csvPath)
    for csvPath in (
        "../input/video-games-rating-by-esrb/Video_games_esrb_rating.csv",
        "../input/video-games-rating-by-esrb/test_esrb.csv",
    )
)

esrbTrain.head() #Show the first rows of the training data

---

# Check for Null Values

In [None]:
print(esrbTrain.isna().any()) #Per column: does the training set hold any missing value?

In [None]:
print(esrbTest.isna().any()) #Per column: does the testing set hold any missing value?

There are no null values in either dataset.

---

# Data Exploration

In [None]:
#Shrink the figures and silence the "too many open figures" warning, since one figure is opened per column
plt.rcParams.update({"figure.figsize": (10,4), "figure.max_open_warning": 0})
colTrain = esrbTrain.columns #Every column of the training set

#Draw one bar chart of value counts per column
for col in colTrain:
    #The title column is skipped
    if col == "title":
        continue

    plt.figure() #Start a fresh figure so each column gets its own plot
    esrbTrain[col].value_counts().plot(kind = "bar", title = col + " train") #Bar chart of how often each value occurs

This dataset is heavily weighted toward the more age-appropriate side of each descriptor (that is, most games have no violence, no blood, etc.). This imbalance could make it harder for the model to make predictions.

In [None]:
colTest = esrbTest.columns #Every column of the testing set

#Draw one bar chart of value counts per column
for col in colTest:
    #The title column is skipped
    if col == "title":
        continue

    plt.figure() #Start a fresh figure so each column gets its own plot
    esrbTest[col].value_counts().plot(kind = "bar", title = col + " test") #Bar chart of how often each value occurs

The test set looks to have similar proportions to the training set, but there is a heavy skew toward blood and violence. This may give a classification algorithm some difficulty. Everything else seems like it will be fine.

---

# Prepare the Data for Classification

## Create a Validation Set

In [None]:
#Separate the target: keep the raw ratings and a one-hot encoded version of them
ratings = esrbTrain["esrb_rating"].copy()
rates = pd.get_dummies(ratings)
print(rates.head()) #Show the first five encoded rows

#The features are everything except the rating (the target) and the title
characteristics = esrbTrain.drop(columns = ["title", "esrb_rating"]).copy()

#Hold back a quarter of the rows as a validation set
trainChara, valChara, trainRating, valRating = train_test_split(
    characteristics,
    rates,
    test_size = 0.25,
)

## Set Up the Test Set

In [None]:
testTitles = esrbTest["title"] #Keep the titles so predictions can be matched back to games later
testRate = esrbTest["esrb_rating"].copy() #The actual ratings of the test games
testChara = esrbTest.drop(columns = ["title", "esrb_rating"]).copy() #Same feature columns as the training set: no title, no rating
testChara.head() #Show the first rows of the test features

---

# Build the Model

Build and test based on the validation set. The test data will be used for classification and not verification in this case.

In [None]:
#GetChara: Print the most important characteristics to the classification
#Input: the characteristics (only its .columns is read), the fitted model (only its
#       feature_importances_ is read), what the dataset pertains to, and optionally how
#       many of the top attributes to print (defaults to 7, the number printed before)
#Output: None (the labels are printed, most important first)
def getChara(characteristics, forest, subject, top = 7):
    attributes = zip(characteristics.columns, forest.feature_importances_) #Pair each attribute with its importance
    sortAtt = sorted(attributes, key = lambda x: x[1], reverse = True) #Sort the pairs so the highest importance comes first

    print("According to the Random Forest, the most important factors for {} are: ".format(subject)) #Start printing the most important labels

    #Slice off the leading entries instead of counting through the whole list;
    #    a negative count is treated as "print nothing"
    for label, _ in sortAtt[:max(top, 0)]:
        print(label) #Print the label as an important factor
        
#ReportClassification: score a fitted model against the validation set
#Input: The model, the validation characteristics, the validation ratings, and the rating labels
#Output: A tuple of (accuracy, root mean square error, classification report)
def reportClassification(forest, charaVal, rateVal, labels):
    guesses = forest.predict(charaVal) #What the model predicts for each validation row

    return (
        forest.score(charaVal, rateVal), #Accuracy, as given by the model's own score method
        np.sqrt(mean_squared_error(rateVal, guesses)), #Square root of the mean squared error
        classification_report(rateVal, guesses, target_names = labels), #Report with the labels used as the target names
    )
        
#BuildModel: Build a random forest model based on the split values
#Input: The train_test_split training results (characteristics and ratings), plus two
#       optional settings: the number of trees (defaults to 100, as before) and a seed
#       for the forest's random_state (defaults to None, i.e. unseeded, as before)
#Output: the trained model
def buildModel(charaTrain, rateTrain, nTrees = 100, seed = None):
    forest = RandomForestClassifier(n_estimators = nTrees, random_state = seed) #Initialize the forest model
    forest.fit(charaTrain, rateTrain) #Train the model
    return forest #Return the model

In [None]:
model = buildModel(charaTrain = trainChara, rateTrain = trainRating) #Train the forest on the training part of the split

In [None]:
#Take the label order straight from the one-hot columns the model was trained on. ratings.unique() lists
#    the ratings in order of first appearance, which is not guaranteed to match the column order of the
#    encoded ratings, so report rows (and anything decoding predictions with these labels) could be mislabeled
labels = rates.columns.to_numpy()

acc, err, rep = reportClassification(model, valChara, valRating, labels) #Get the classification reports
print("Accuracy: {}".format(acc)) #Print the accuracy
print("Root Mean Square Error: {}".format(err)) #Print the error
print("Classification Report:\n{}".format(rep)) #Print the classification report under its heading

I tested the validation with various split sizes. The accuracy was highest with a lower validation split, but higher splits lead to a higher recall. The recall is more important in a case like this, so I will stick with a 25% validation split. 

E and M are the best accounted for despite their smaller sizes, meaning they are likely more clearly defined when it comes to the characteristics that make a game E or M. E10+ and T must have more nuance and human error, hence the lower rates despite their high counts.

In [None]:
getChara(characteristics, model, subject = "ESRB Rating") #Print the characteristics the forest weighs most heavily

It appears violence, language, blood, and themes are the most important attributes in classifying ESRB Ratings. No descriptors means that a game did not get any of the other descriptors, meaning it has no problem content. This likely applies to giving a more family friendly rating like E, hence its importance. 

---

# Test the Model

In [None]:
predict = model.predict(testChara) #Create predictions off the test set
ratingNames = rates.columns.to_numpy() #Rating names in the same order as the encoded columns the model was trained on
fallback = "T" #Rating used when the model did not predict anything (the most common value)
rate = [] #Create a list to hold converted predictions

#For each prediction, translate it into ESRB Ratings
for pre in predict:
    #Keep only the rating names whose position is switched on in this prediction row
    flagged = ratingNames[np.asarray(pre).astype(bool)]

    #Take the first flagged rating. An empty prediction is handled explicitly here, so unrelated
    #    errors are no longer hidden behind a catch-all except
    rate.append(flagged[0] if flagged.size else fallback)

corr = (rate == testRate) #Compare the ratings to see where it was correct
comp = {"Title" : testTitles, "Predicted" : rate, "Actual" : testRate, "Correct": corr} #Build a dictionary for the ratings and correctness lists

test = pd.DataFrame(comp) #Put the dictionary into a pandas dataframe. This could be converted into a further CSV from here.
test.head() #Take a peek at the dataframe

In [None]:
print(test["Correct"].value_counts().div(len(test))) #Share of rows predicted correctly vs. incorrectly

In [None]:
print(test[test["Title"].eq("Yooka-Laylee")]) #Look up a single game by title, as a program searching the results might

I had gone into this assuming the test set did not have the ratings, so the validation step was not inherently necessary. I would have just seen what it predicted without confirmation if it actually did not have the ratings. The way I did it still makes a visual representation of what it got right and wrong, which is interesting in itself. The main issues I am seeing are ones that were classified as one off of the actual rating, which could be the ESRB's humans causing some variance.

I have let the full dataset load below so you can go through it (if desired). I saved it for the end so as not to bog down the rest of the content.

In [None]:
#Remove both display limits so every row and column is rendered
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
test #Display the full results dataframe