In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.preprocessing import FunctionTransformer
import joblib
from sklearn.metrics import classification_report

In [13]:
def get_script_from_id(id, script_dir='../data/script/'):
    """Load a movie script and flatten it to a single cleaned line of text.

    Parameters
    ----------
    id : str
        Identifier used as the file stem; the file read is
        ``<script_dir>/<id>.script``.
    script_dir : str, optional
        Directory containing the ``.script`` files. Defaults to the location
        the notebook has always used.

    Returns
    -------
    str
        The file content with single quotes, double quotes, newlines,
        carriage returns, tabs, backspaces and backslashes each replaced
        by one space.

    Raises
    ------
    OSError
        If the script file cannot be opened (e.g. ``FileNotFoundError``).
    """
    import os  # function-scope import: the notebook's import cell does not provide `os`

    # Every one of these characters is mapped to a single space.
    to_space = str.maketrans(dict.fromkeys('\'"\n\r\t\b\\', ' '))

    # The context manager closes the handle even if reading fails.
    with open(os.path.join(script_dir, id + '.script'), 'r') as script_file:
        script = script_file.read()
    return script.translate(to_space)

def get_pd_dataframe(input_path='../data_gathering/baseline/output/imdb_id_with_age_rating_and_labels.txt'):
    """Build the labelled DataFrame used to train the per-aspect classifiers.

    Each non-empty line of ``input_path`` is split on commas into seven
    fields: ``imdb_id, age_rating, aspect, votes none, votes mild,
    votes moderate, votes severe``. For every line the function adds

    * ``total_votes``   - the sum of the four vote counts,
    * ``aspect_rating`` - the 0-based position (0 = none ... 3 = severe) of the
      largest vote count; ties go to the later (more severe) category,
    * ``text``          - the cleaned script returned by ``get_script_from_id``.

    Lines whose script cannot be read are skipped.

    Parameters
    ----------
    input_path : str, optional
        Path of the comma-separated label file. Defaults to the location the
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``imdb_id, aspect, votes none, votes mild, votes moderate,
        votes severe, total_votes, aspect_rating, text`` (``age_rating`` is
        dropped); the vote and rating columns are cast to ``int``.
    """
    df_data = []
    # The context manager guarantees the label file is closed again.
    with open(input_path) as input_file:
        for line in input_file:
            line_data = line.strip().split(',')
            if line_data == ['']:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue

            votes = [int(line_data[i]) for i in range(3, 7)]
            line_data.append(sum(votes))

            # `>=` makes later categories win ties, so an all-zero row is rated 3.
            max_index = 0
            max_value = 0
            for category, vote_count in enumerate(votes):
                if vote_count >= max_value:
                    max_index = category
                    max_value = vote_count
            line_data.append(max_index)

            try:
                script = get_script_from_id(line_data[0])
            except (OSError, UnicodeDecodeError):
                # Script missing or unreadable: skip this row, but do not hide
                # unrelated failures (or KeyboardInterrupt) as a bare except would.
                continue
            line_data.append(script)
            df_data.append(line_data)

    # id | age rating | aspect | none | mild | moderate | severe | total_votes | aspect_rating | text
    df = pd.DataFrame(df_data, columns=['imdb_id', 'age_rating', 'aspect', 'votes none', 'votes mild', 'votes moderate', 'votes severe', 'total_votes', 'aspect_rating', 'text'])
    df = df.drop(columns=["age_rating"])
    df = df.astype({'votes mild':'int', 'votes moderate':'int', 'votes severe':'int', 'votes none':'int', 'total_votes':'int', 'aspect_rating':'int'})
    return df

In [14]:
# Fetch the NLTK resources the text preprocessing relies on:
# the stop-word list, the 'punkt' tokenizer data and the WordNet corpus.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)
# Custom transformer that cleans the raw text column before vectorisation
class TextPreprocessor(TransformerMixin):
    """Stateless text cleaner usable as a scikit-learn pipeline step.

    Each document is lower-cased, stripped of ASCII punctuation, tokenised
    with ``nltk.word_tokenize``, filtered against the English stop-word list
    and lemmatised with ``WordNetLemmatizer``; the surviving tokens are joined
    back with single spaces.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.
        y : ignored

        Returns
        -------
        list of str
            One cleaned, space-joined string per input document, in order.
        """
        # Built once per call instead of once per document. Kept as a local
        # (not an attribute set in __init__) so instances pickled before this
        # change can still be loaded and used.
        strip_punctuation = str.maketrans('', '', string.punctuation)

        X_processed = []
        for text in X:
            # Lower-case first, then delete punctuation characters
            tokens = nltk.word_tokenize(text.lower().translate(strip_punctuation))
            # Drop stop words, lemmatise the rest and re-join into one string
            X_processed.append(' '.join(
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token not in self.stop_words
            ))
        return X_processed


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leonremke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/leonremke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leonremke/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def _normalize_by_total_votes(frame, columns):
    """Divide ``frame[columns]`` row-wise by the ``total_votes`` column."""
    return frame[columns].div(frame['total_votes'], axis=0)


class SVMMovieClassifier():
    """SVM text classifier that predicts ``aspect_rating`` from a movie script.

    The fitted object is ``self.pipe``: a ``ColumnTransformer`` that applies
    ``TextPreprocessor`` + ``TfidfVectorizer`` to the ``text`` column, followed
    by an ``SVC`` with default settings. A numeric vote pipeline
    (``self.num_transformer``) is also built, but it is currently not part of
    the ``ColumnTransformer`` (that entry is commented out).
    """

    def __init__(self, df, model_name, num_features):
        """
        Parameters
        ----------
        df : pandas.DataFrame
            Labelled data; must contain ``text`` and ``aspect_rating``.
        model_name : str
            Default file name used by ``save_model`` / ``load_model`` when
            they are called without an explicit name.
        num_features : list of str
            Vote columns for the (currently unused) numeric pipeline.
        """
        self.df = df
        self.model_name = model_name
        # Define the column transformer
        self.num_features = num_features
        # A named function with kw_args is used instead of a lambda so that the
        # transformer can be pickled by joblib if this branch is ever enabled.
        self.num_transformer = Pipeline(steps=[
            ('normalize', FunctionTransformer(_normalize_by_total_votes, kw_args={'columns': num_features}, validate=False)),
            ('scale', StandardScaler())
        ])

        self.text_transformer = Pipeline(steps=[
            ('preprocess', TextPreprocessor()),
            ('vectorize', TfidfVectorizer())
        ])

        self.preprocessor = ColumnTransformer(transformers=[
            # ('num', self.num_transformer, num_features),
            ('text', self.text_transformer, 'text')
        ])

        # Define the pipeline
        self.pipe = Pipeline(steps=[
            ('preprocess', self.preprocessor),
            ('clf', SVC())
        ])

        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def split_data(self, test_size=0.2, random_state=42):
        """Split ``self.df`` into train/test parts stored on the instance.

        ``aspect_rating`` is the target; every other column stays in ``X``.
        The defaults reproduce the previous fixed 80/20 split with seed 42.
        """
        features = self.df.drop('aspect_rating', axis=1)
        target = self.df['aspect_rating']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            features, target, test_size=test_size, random_state=random_state)

    def train(self):
        """Fit the pipeline on the training split.

        Raises
        ------
        RuntimeError
            If ``split_data`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError('No training data available: call split_data() before train().')
        self.pipe.fit(self.X_train, self.y_train)

    def test(self, X_test = None, y_test = None):
        """Print a classification report.

        When either argument is omitted, the stored test split is used for
        both.

        Raises
        ------
        RuntimeError
            If no test data is given and ``split_data`` has not been called.
        """
        if X_test is None or y_test is None:
            X_test = self.X_test
            y_test = self.y_test
        if X_test is None or y_test is None:
            raise RuntimeError('No test data available: call split_data() or pass X_test and y_test.')

        y_pred = self.pipe.predict(X_test)
        print(classification_report(y_test, y_pred))

    def predict_and_save(self, X, save_path = "./", save_name = "prediction.csv"):
        """Predict labels for ``X`` and write them to ``save_path + save_name``.

        The CSV contains every column of ``X`` except ``text`` plus a trailing
        ``prediction`` column. ``X`` itself is left untouched. Note that the
        two path parts are concatenated as-is, so ``save_path`` needs its
        trailing separator.
        """
        y_pred = self.pipe.predict(X)
        # drop()/assign() return new frames, so the caller's DataFrame keeps
        # its 'text' column and does not gain a 'prediction' column.
        result = X.drop(columns=['text']).assign(prediction=y_pred)
        result.to_csv(save_path + save_name, index=False)

    def predict(self, X):
        """Return the predicted labels for ``X`` (must contain a ``text`` column)."""
        return self.pipe.predict(X)

    def save_model(self, model_name=None):
        """Persist the pipeline with joblib.

        ``model_name`` defaults to the name given to the constructor.
        """
        joblib.dump(self.pipe, self.model_name if model_name is None else model_name)

    def load_model(self, model_name=None):
        """Replace ``self.pipe`` with a pipeline loaded via joblib.

        ``model_name`` defaults to the name given to the constructor.
        """
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files from a trusted source.
        self.pipe = joblib.load(self.model_name if model_name is None else model_name)

In [32]:
# Read the label file and the scripts into one frame
df = get_pd_dataframe()

# Discard rows that are backed by fewer than 5 votes
rows_to_drop = df[df['total_votes'] < 5].index
df = df.drop(index=rows_to_drop)

# One frame per aspect; the grouping column itself is removed from each frame
df_dict = {}
for aspect, aspect_df in df.groupby('aspect'):
    df_dict[aspect] = aspect_df.drop(columns='aspect')

# Build one classifier per aspect and report how many rows each one receives
aspect_classifier_dict = {}
for aspect, aspect_df in df_dict.items():
    print(aspect, len(aspect_df))
    aspect_classifier_dict[aspect] = SVMMovieClassifier(
        aspect_df,
        'movie_svm_' + aspect,
        ['votes none', 'votes mild', 'votes moderate', 'votes severe', 'total_votes'],
    )


In [17]:
# Fit every per-aspect model; a failure for one aspect is printed and the
# loop carries on with the next aspect.
for aspect in aspect_classifier_dict:
    movie_classifier = aspect_classifier_dict[aspect]
    try:
        print('Training model for aspect: ' + aspect)
        movie_classifier.split_data()
        movie_classifier.train()
    except Exception as e:
        print('Error on training model for aspect: ' + aspect)
        print(e)

Training model for aspect: alcohol
Training model for aspect: frightening
Training model for aspect: nudity
Training model for aspect: profanity
Training model for aspect: violence


In [18]:
# #Test the model
# for aspect, movie_classifier in aspect_classifier_dict.items():
#     try:
#         print('Testing model for aspect: ' + aspect)
#         movie_classifier.test()
#     except Exception as e:
#         print('Error on testing model for aspect: ' + aspect + ' Error: ' + str(e))
#         print(e)

In [34]:
# Predict the rating of every aspect for all rows of that aspect's frame and
# store it in a 'prediction_<aspect>' column.
# NOTE: df_dict[aspect] is the same data the classifier was split and trained
# on, so these predictions include the rows used for training.
for aspect, movie_classifier in aspect_classifier_dict.items():
    try:
        print('Predicting model for aspect: ' + aspect)
        y_pred = movie_classifier.predict(df_dict[aspect])
        # assign() returns a new frame, so the frame held by the classifier
        # (its training data) is not modified and re-running this cell is safe.
        df_dict[aspect] = df_dict[aspect].assign(**{f'prediction_{aspect}': y_pred})
    except Exception as e:
        print('Error on predicting model for aspect: ' + aspect + ' Error: ' + str(e))
        print(e)

Predicting model for aspect: alcohol
Predicting model for aspect: frightening
Predicting model for aspect: nudity
Predicting model for aspect: profanity
Predicting model for aspect: violence


In [38]:
from functools import reduce

# Columns that are not wanted in the exported prediction table
columns_to_strip = ["text", "aspect_rating", "votes none", "votes mild", "votes moderate", "votes severe", "total_votes"]

# Work on trimmed copies: the frames in df_dict (and the global `df`) stay
# intact, so this cell can be re-run and the classifiers keep their data.
df_list = [aspect_df.drop(columns=columns_to_strip) for aspect_df in df_dict.values()]

# Inner-join all per-aspect frames on the movie id, so only movies present
# for every aspect remain
merged_df = reduce(lambda left, right: pd.merge(left, right, how="inner", on="imdb_id"), df_list)
merged_df.head()

Unnamed: 0,imdb_id,prediction_alcohol,prediction_frightening,prediction_nudity,prediction_profanity,prediction_violence
0,tt0032138,1,2,1,1,2
1,tt0035423,1,1,0,1,0
2,tt0038650,1,1,0,1,1
3,tt0047396,1,1,1,0,1
4,tt0048545,1,2,0,0,2


In [39]:
# Write the merged per-aspect predictions to disk, without the index column
svm_prediction_csv = "../data/results_svm/svm_prediction.csv"
merged_df.to_csv(svm_prediction_csv, index=False)