In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.preprocessing import FunctionTransformer
import joblib
from sklearn.metrics import classification_report

In [2]:
def get_script_from_id(id):
    script = open('../data/script/' + id + '.script', 'r').read()
    # print(script)
    script = script.replace("'", " ").replace('"', ' ').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('\b', ' ').replace('\\', ' ')
    return script

def get_pd_dataframe():
    inputFile = open('../data_gathering/baseline/output/imdb_id_with_age_rating_and_labels.txt')
    df_data = []
    for line in inputFile:
        line_data = line.strip().split(',')
        # print(line_data)
        line_data.append(int(line_data[3]) + int(line_data[4]) + int(line_data[5]) + int(line_data[6]))
        
        max_index = 0
        max_value = 0
        for i in range(3,7):
            vote_count = int(line_data[i])
            if(vote_count >= max_value):
                max_index = i - 3
                max_value = vote_count
        line_data.append(max_index)
        try:
            script = get_script_from_id(line_data[0])
        except:
            # print('Error on loading script for id: ' + line_data[0])
            continue
        line_data.append(script)
        df_data.append(line_data)

    # id | Aspect | None | Mild | Moderate | Severe | Total_votes | Aspect_rating | text
    df = pd.DataFrame(df_data, columns=['imdb_id', 'age_rating', 'aspect', 'votes none', 'votes mild', 'votes moderate', 'votes severe', 'total_votes', 'aspect_rating', 'text'])
    df.drop(columns=["age_rating"], inplace=True)
    df = df.astype({'votes mild':'int', 'votes moderate':'int', 'votes severe':'int', 'votes none':'int', 'total_votes':'int', 'aspect_rating':'int'})
    return df

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Define a custom transformer to preprocess the text column
class TextPreprocessor(TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_processed = []
        for text in X:
            # Convert to lowercase and remove punctuation
            text = text.lower().translate(str.maketrans('', '', string.punctuation))
            # Tokenize the text
            tokens = nltk.word_tokenize(text)
            # Remove stop words and lemmatize the remaining words
            processed_tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
            # Join the processed tokens back into a single string
            processed_text = ' '.join(processed_tokens)
            X_processed.append(processed_text)
        return X_processed


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/I518302/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/I518302/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/I518302/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
class SVMMovieClassifier():

    def __init__(self, df, model_name, num_features):

        self.df = df
        # Define the column transformer
        self.num_features = 1 #num_features
        self.num_transformer = Pipeline(steps=[
            ('normalize', FunctionTransformer(lambda x: x[num_features].div(x['total_votes'], axis=0), validate=False)),
            ('scale', StandardScaler())
        ])

        self.text_transformer = Pipeline(steps=[
            ('preprocess', TextPreprocessor()),
            ('vectorize', TfidfVectorizer())
        ])

        self.preprocessor = ColumnTransformer(transformers=[
            ('num', self.num_transformer, num_features),
            ('text', self.text_transformer, 'text')
        ])

        # Define the pipeline
        self.pipe = Pipeline(steps=[
            ('preprocess', self.preprocessor),
            ('clf', SVC())
        ])

        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def split_data(self):
        # Split the data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.df['text'], self.df['aspect_rating'], test_size=0.2, random_state=42)

    def train(self):
        # Train the pipeline
        self.pipe.fit(self.X_train, self.y_train)

    def test(self, X_test = None, y_test = None):
        # Test the pipeline
        if X_test is None or y_test is None:
            X_test = self.X_test
            y_test = self.y_test

        y_pred = self.pipe.predict(X_test)
        print(classification_report(y_test, y_pred))

    def save_model(self, model_name):
        # Save the pipeline
        joblib.dump(self.pipe, model_name)

    def load_model(self, model_name):
        # Load the pipeline
        self.pipe = joblib.load(model_name)

In [5]:
# Load the data
df = get_pd_dataframe()

# Drop rows with less than 5 votes
rows_to_drop = df.loc[df['total_votes'] < 5].index
df = df.drop(rows_to_drop)

# group the dataframe by 'aspect' and create a dictionary of dataframes
df_dict = {aspect: aspect_df.drop('aspect', axis=1) for aspect, aspect_df in df.groupby('aspect')}

aspect_classifier_dict = {}

# print the dictionary of dataframes
for aspect, aspect_df in df_dict.items():
    print(aspect, len(aspect_df))
    # Limit the number of rows to 50 for testing
    aspect_df = aspect_df[:50]
    aspect_classifier_dict[aspect] = SVMMovieClassifier(aspect_df, 'movie_svm_' + aspect, ['votes none', 'votes mild', 'votes moderate', 'votes severe', 'total_votes'])


alcohol 477
frightening 475
nudity 478
profanity 477
violence 477


In [8]:
# Train the model
for aspect, movie_classifier in aspect_classifier_dict.items():
    try:
        print('Training model for aspect: ' + aspect)
        movie_classifier.split_data()
        movie_classifier.train()
    except Exception as e:
        print('Error on training model for aspect: ' + aspect)
        print(e)

Training model for aspect: alcohol
Error on training model for aspect: alcohol
tuple index out of range
Training model for aspect: frightening
Error on training model for aspect: frightening
tuple index out of range
Training model for aspect: nudity
Error on training model for aspect: nudity
tuple index out of range
Training model for aspect: profanity
Error on training model for aspect: profanity
tuple index out of range
Training model for aspect: violence
Error on training model for aspect: violence
tuple index out of range


In [7]:
# Test the model
for aspect, movie_classifier in aspect_classifier_dict.items():
    try:
        print('Testing model for aspect: ' + aspect)
        movie_classifier.test()
    except Exception as e:
        print('Error on testing model for aspect: ' + aspect + ' Error: ' + str(e))
        print(e)

Testing model for aspect: alcohol
Error on testing model for aspect: alcohol Error: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
Testing model for aspect: frightening
Error on testing model for aspect: frightening Error: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
Testing model for aspect: nudity
Error on testing model for aspect: nudity Error: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
Testing model for aspect: profanit

In [57]:
import numpy as np

In [60]:
np.random.seed(100)

In [61]:
df_dict['violence'].head()

Unnamed: 0,imdb_id,aspect_rating,text
1,tt0032138,1,FADE IN -- Title: For nearly forty years this...
6,tt0035423,0,KATE AND ...
11,tt0038650,1,IT S A WOND...
16,tt0047396,1,REAR WI...
21,tt0048545,2,REBEL ...


In [62]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score



In [63]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
for aspect, aspect_df in df_dict.items():
    #aspect_df.drop(columns=['votes none', 'votes mild', 'votes moderate', 'votes severe', 'total_votes'], inplace=True)
    text_transformer = Pipeline(steps=[('preprocess', TextPreprocessor())])
    text = aspect_df['text']
    preproccessed_text = text_transformer.fit_transform(text)
    Train_X, Test_X, Train_Y, Test_Y = train_test_split(preproccessed_text, aspect_df['aspect_rating'],test_size=0.3)

    Encoder = LabelEncoder()
    Train_Y = Encoder.fit_transform(Train_Y)
    Test_Y = Encoder.fit_transform(Test_Y)

    Tfidf_vect = TfidfVectorizer(max_features=2000)
    Tfidf_vect.fit(preproccessed_text)

    Train_X_Tfidf = Tfidf_vect.transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)

    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto', random_state=42)
    SVM.fit(Train_X_Tfidf,Train_Y)

    predictions_SVM = SVM.predict(Test_X_Tfidf)
    print("SVM Accuracy Score of", aspect, ":", accuracy_score(predictions_SVM, Test_Y)*100,"%")

    rf_classifier = RandomForestClassifier(max_depth=3, max_features=2000, random_state=42)
    rf_classifier.fit(Train_X_Tfidf, Train_Y)
    y_pred = rf_classifier.predict(Test_X_Tfidf)

    accuracy = accuracy_score(Test_Y, y_pred)
    print("Random Forest Accuracy Score of", aspect, ":", accuracy*100,"%")

SVM Accuracy Score of alcohol : 63.888888888888886 %
Random Forest Accuracy Score of alcohol : 68.05555555555556 %
SVM Accuracy Score of frightening : 38.46153846153847 %
Random Forest Accuracy Score of frightening : 46.85314685314685 %
SVM Accuracy Score of nudity : 43.75 %
Random Forest Accuracy Score of nudity : 46.52777777777778 %
SVM Accuracy Score of profanity : 37.5 %
Random Forest Accuracy Score of profanity : 65.27777777777779 %
SVM Accuracy Score of violence : 35.41666666666667 %
Random Forest Accuracy Score of violence : 51.388888888888886 %
