In [50]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [51]:
def get_script_from_id(id):
    script = open('../data/script/' + id + '.script', 'r').read()
    # print(script)
    script = script.replace("'", " ").replace('"', ' ').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('\b', ' ').replace('\\', ' ')
    return script

def get_pd_dataframe():
    inputFile = open('../data_gathering/baseline/output/imdb_id_with_age_rating_and_labels.txt')
    df_data = []
    for line in inputFile:
        line_data = line.strip().split(',')
        # print(line_data)
        line_data.append(int(line_data[3]) + int(line_data[4]) + int(line_data[5]) + int(line_data[6]))
        
        max_index = 0
        max_value = 0
        for i in range(3,7):
            vote_count = int(line_data[i])
            if(vote_count >= max_value):
                max_index = i - 3
                max_value = vote_count
        line_data.append(max_index)
        try:
            script = get_script_from_id(line_data[0])
        except:
            # print('Error on loading script for id: ' + line_data[0])
            continue
        line_data.append(script)
        df_data.append(line_data)

    # id | Aspect | None | Mild | Moderate | Severe | Total_votes | Aspect_rating | text
    df = pd.DataFrame(df_data, columns=['imdb_id', 'age_rating', 'aspect', 'votes none', 'votes mild', 'votes moderate', 'votes severe', 'total_votes', 'aspect_rating', 'text'])
    df.drop(columns=["age_rating"], inplace=True)
    df = df.astype({'votes mild':'int', 'votes moderate':'int', 'votes severe':'int', 'votes none':'int', 'total_votes':'int', 'aspect_rating':'int'})
    return df

In [52]:
df = get_pd_dataframe()[:1000]

# Drop rows with less than 5 votes
rows_to_drop = df.loc[df['total_votes'] < 5].index
df = df.drop(rows_to_drop)

num_features = ['votes none', 'votes mild', 'votes moderate', 'votes severe']
df[num_features] = df[num_features].div(df['total_votes'], axis=0)

df.head()

Unnamed: 0,imdb_id,aspect,votes none,votes mild,votes moderate,votes severe,total_votes,aspect_rating,text
0,tt0032138,nudity,0.894231,0.028846,0.0,0.076923,104,0,FADE IN -- Title: For nearly forty years this...
1,tt0032138,violence,0.23,0.61,0.08,0.08,100,1,FADE IN -- Title: For nearly forty years this...
2,tt0032138,profanity,0.905263,0.042105,0.0,0.052632,95,0,FADE IN -- Title: For nearly forty years this...
3,tt0032138,alcohol,0.817204,0.107527,0.010753,0.064516,93,0,FADE IN -- Title: For nearly forty years this...
4,tt0032138,frightening,0.1,0.63,0.17,0.1,100,1,FADE IN -- Title: For nearly forty years this...


In [53]:
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(df['text'])

In [54]:
num_df = df[num_features].reset_index(drop=True)
X = pd.concat([pd.DataFrame(X_text.toarray()), num_df], axis=1)

In [55]:
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

In [56]:
y = df['aspect_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = SVC()
clf.fit(X_train, y_train)



SVC()

In [57]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       0.97      1.00      0.99        74
           2       1.00      0.97      0.98        64
           3       1.00      1.00      1.00        38

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



In [58]:
clf.predict(X_test).tolist()



[1,
 2,
 1,
 1,
 3,
 1,
 1,
 0,
 3,
 0,
 2,
 1,
 0,
 2,
 3,
 0,
 2,
 1,
 2,
 1,
 2,
 3,
 3,
 1,
 1,
 0,
 1,
 1,
 3,
 3,
 2,
 3,
 0,
 1,
 3,
 1,
 2,
 1,
 3,
 2,
 3,
 1,
 3,
 1,
 1,
 2,
 2,
 3,
 1,
 1,
 0,
 3,
 2,
 1,
 0,
 1,
 0,
 1,
 3,
 2,
 1,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 3,
 2,
 2,
 2,
 3,
 1,
 2,
 3,
 2,
 1,
 2,
 2,
 1,
 0,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 0,
 2,
 3,
 2,
 2,
 1,
 1,
 3,
 3,
 1,
 3,
 2,
 0,
 3,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 2,
 3,
 0,
 1,
 1,
 3,
 2,
 1,
 2,
 1,
 3,
 3,
 2,
 1,
 2,
 3,
 1,
 2,
 3,
 1,
 1,
 1,
 0,
 1,
 2,
 0,
 0,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 3,
 1,
 0,
 1,
 2,
 2,
 3,
 0,
 0,
 2,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 3,
 1,
 1,
 2,
 1,
 3,
 2,
 0,
 2,
 2,
 1,
 0,
 1,
 1,
 2,
 1,
 2,
 3,
 1,
 2,
 2,
 1,
 2,
 1,
 3,
 1,
 2,
 3,
 0,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1]