In [None]:
import os
import nltk
import math
import statistics
import numpy as np
nltk.download(["vader_lexicon"])
from nltk.sentiment import SentimentIntensityAnalyzer
from google.colab import drive
drive.mount('/content/gdrive')
from IPython.display import HTML, display
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

def progress(value, max=100):
    """Build an HTML progress bar with a "value / max" caption.

    Args:
        value: Current progress count; used as the bar's ``value`` attribute
            and shown in the caption.
        max: Count that represents completion (default 100). The name shadows
            the built-in but is kept so keyword callers keep working.

    Returns:
        An ``IPython.display.HTML`` object wrapping a ``<progress>`` element
        and a ``<p>`` caption.
    """
    # The attributes are separated by whitespace only; the original template
    # had a stray comma after the max attribute, which is invalid HTML.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >{value}
        </progress>
        <p>{value} / {max}</p>
    """.format(value=value, max=max))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
sent_analyzer = SentimentIntensityAnalyzer()

# Hyperparameters
num_plotparts = 5        # number of plot parts get_features() splits a script into
chunk_size = 30          # number of items of `text` joined into one scored chunk
train_test_split = 0.8   # fraction of each genre's files placed in the training split

# Debugging aid: only genre directories in the slice [g_mn:g_mx] are used
g_mn = 0
g_mx = 23

walk = os.walk("/content/gdrive/My Drive/imsdb_raw_nov_2015")
# Collect every directory path yielded by the walk, drop the first entry,
# then apply the debugging window.
genre_dir_list = [entry[0] for entry in walk][1:][g_mn:g_mx]

# [file count, genre name] pairs, largest genre first
genres_and_size = sorted(
  ([len(os.listdir(genre_path)), genre_path[genre_path.rfind("/")+1:]]
   for genre_path in genre_dir_list),
  reverse=True,
)
for size_and_name in genres_and_size:
  print(size_and_name)

def get_features(text):
  """Summarise the sentiment of a script as per-plot-part statistics.

  ``text`` is cut into consecutive chunks of ``chunk_size`` items (the caller
  in this notebook passes the list returned by ``readlines()``, so a chunk is
  ``chunk_size`` lines joined together). Every chunk is scored with
  ``sent_analyzer.polarity_scores``; the chunks are then grouped, in order,
  into at most ``num_plotparts`` plot parts, the last of which absorbs any
  leftover chunks.

  Args:
    text: Sequence of strings (or a single string) to analyse.

  Returns:
    dict mapping ``"<part>-<score>-<stat>"`` to a number, where ``part`` runs
    from 1 to ``num_plotparts``, ``score`` is one of pos/neg/neu/compound and
    ``stat`` is mean or stdev. Keys are always present and always inserted in
    the same order (part, then score, then stat). Parts that receive no
    chunks keep the value 0; a part with a single chunk has stdev 0.0.
  """
  score_names = ("pos", "neg", "neu", "compound")
  num_chunks = math.ceil(len(text) / chunk_size)
  # Clamp to 1 so a script with fewer chunks than plot parts does not end up
  # evaluating `i % 0` below.
  num_chunks_per_plotpart = max(1, num_chunks // num_plotparts)

  # Pre-fill every key so the feature dict has a fixed layout.
  features = {}
  for part in range(1, num_plotparts + 1):
    for name in score_names:
      for stat in ("mean", "stdev"):
        features[str(part) + "-" + name + "-" + stat] = 0

  # One independent list per score type. The lists must not be aliased:
  # sharing a single list would mix pos/neg/neu/compound values together.
  scores = {name: [] for name in score_names}
  current_plotpart = 0

  for i in range(num_chunks):
    chunk = "".join(text[i * chunk_size:(i + 1) * chunk_size])
    sent = sent_analyzer.polarity_scores(chunk)
    for name in score_names:
      scores[name].append(sent[name])

    # A part closes after every num_chunks_per_plotpart chunks, except that
    # the final part stays open until the very last chunk.
    closes_part = (i % num_chunks_per_plotpart == num_chunks_per_plotpart - 1
                   and current_plotpart != num_plotparts - 1)
    if closes_part or i == num_chunks - 1:
      current_plotpart += 1
      for name in score_names:
        values = scores[name]
        prefix = str(current_plotpart) + "-" + name
        features[prefix + "-mean"] = statistics.mean(values)
        # statistics.stdev needs at least two data points.
        features[prefix + "-stdev"] = statistics.stdev(values) if len(values) > 1 else 0.0
        scores[name] = []

  return features

[579, 'Drama']
[373, 'Thriller']
[347, 'Comedy']
[290, 'Action']
[201, 'Crime']
[192, 'Romance']
[190, 'Adventure']
[155, 'Sci-Fi']
[149, 'Horror']
[113, 'Fantasy']
[107, 'Mystery']
[39, 'Family']
[35, 'Animation']
[26, 'War']
[22, 'Musical']
[13, 'Western']
[5, 'Music']
[4, 'Film-Noir']
[3, 'Short']
[3, 'History']
[3, 'Biography']
[2, 'Sport']


In [None]:
def write_feature_files(path, features):
  """Write a feature mapping to ``path``, one ``"<name> <value>"`` line per item.

  Args:
    path: Destination file path; an existing file is overwritten.
    features: dict mapping feature names (str) to values; each value is
      written with ``str()``. Items are written in the dict's iteration order.
  """
  # The context manager closes the file even if a write fails part-way.
  with open(path, 'w') as out_file:
    for name, value in features.items():
      out_file.write(name + " " + str(value) + "\n")

In [None]:
# This block processes all movie files in the range given by variables g_mn and g_mx, and stores them as files.
# This only needs to run once (takes approx. 40 mins for all genres). This was done in chunks of 5 genres
# so that the runtime on Google Colab doesn't quit on me

def process_features():
  """Compute sentiment features for every script and store them on Drive.

  For each directory in the global ``genre_dir_list`` this reads every file,
  runs ``get_features`` on its lines and writes the result with
  ``write_feature_files`` to
  ``/content/gdrive/My Drive/all-features-5-plotparts/<genre>-features/<file>``,
  which is the layout ``predict_genres`` reads from. One progress bar per
  genre is displayed and updated as files are processed.

  The files of each genre are shuffled and split by ``train_test_split``,
  but the split is not persisted: it only determines the processing order
  and the per-genre counts printed at the end of each genre.
  """
  print("Processing features into files\n")
  for g in genre_dir_list:
    print(g, len(os.listdir(g)))

  print("Number of files to read and process, by genre:")
  output_bars = []
  for genre_dir in genre_dir_list:
    print("Genre: ", os.path.basename(genre_dir))
    # display_id=True returns a handle that can be updated in place later.
    output_bars.append(display(progress(0, len(os.listdir(genre_dir))), display_id=True))

  for bar, genre_dir in zip(output_bars, genre_dir_list):
    genre = os.path.basename(genre_dir)
    file_list = os.listdir(genre_dir)
    iter_array = np.array(list(range(len(file_list))))
    np.random.shuffle(iter_array)

    split_at = int(len(iter_array) * train_test_split)
    train_iters = iter_array[:split_at]
    test_iters = iter_array[split_at:]

    # Absolute path with the "-features" suffix, matching what predict_genres
    # opens; the previous relative "content/..." path pointed elsewhere.
    feature_genre_dir = "/content/gdrive/My Drive/all-features-5-plotparts/" + genre + "-features"
    os.makedirs(feature_genre_dir, exist_ok=True)

    # Training files first, then test files: that is exactly the shuffled order.
    for num_files_read, i in enumerate(iter_array, start=1):
      with open(genre_dir + "/" + file_list[i]) as f:
        text = f.readlines()
      features = get_features(text)
      write_feature_files(feature_genre_dir + "/" + file_list[i], features)
      bar.update(progress(num_files_read, len(file_list)))

    print(genre, " had", len(train_iters), " training and", len(test_iters), " testing instances")

#process_features()

In [None]:
import random

def _read_feature_vector(path):
  """Parse a feature file ("<name> <value>" per line) into a list of floats."""
  with open(path) as f:
    return [float(line.split()[1]) for line in f]


def predict_genres(genres_to_include, neg_to_pos_ratio, train_all):
  """Train and evaluate one binary "is this genre" classifier per genre.

  Feature files are read from
  ``/content/gdrive/My Drive/all-features-5-plotparts``. For every genre its
  own files are shuffled and split by the global ``train_test_split`` into
  positive training/test examples; the files of all other selected genres
  form the pool of negative examples. A ``LogisticRegression`` model is
  fitted per genre, and its confusion counts plus add-one-smoothed accuracy,
  precision and recall are printed.

  Args:
    genres_to_include: Genre names to use; each is read from the
      ``<name>-features`` directory. If empty, every sub-directory found
      under the feature root is used.
    neg_to_pos_ratio: When ``train_all`` is false, the number of training
      negatives is capped at ``ratio * number of training positives``.
    train_all: If true, use every negative left over after the test
      negatives have been set aside.

  Returns:
    ``[tp, tn, fp, fn]`` summed over all evaluated genres. ``[0, 0, 0, 0]``
    is returned immediately if some genre ends up with no training negatives
    (and also when there are no genres at all).
  """
  features_root = "/content/gdrive/My Drive/all-features-5-plotparts"
  if len(genres_to_include) > 0:
    genre_list = list(genres_to_include)
    feature_dirs = [features_root + "/" + name + "-features" for name in genre_list]
  else:
    # The first entry yielded by os.walk is the root itself; skip it.
    feature_dirs = [entry[0] for entry in os.walk(features_root)][1:]
    genre_list = [os.path.basename(feature_dir) for feature_dir in feature_dirs]

  x_train = {}
  y_train = {}
  x_test = {}
  y_test = {}
  x_not_in_genre = {genre: [] for genre in genre_list}

  # NOTE(review): a script filed under several of the selected genres would
  # be a positive for one classifier and a negative for another -- confirm
  # whether the dataset contains such overlaps.
  for genre, genre_dir in zip(genre_list, feature_dirs):
    x_train[genre] = []
    y_train[genre] = []
    x_test[genre] = []
    y_test[genre] = []

    file_list = os.listdir(genre_dir)
    iter_array = np.array(list(range(len(file_list))))
    np.random.shuffle(iter_array)
    split_at = int(len(iter_array) * train_test_split)

    for position, i in enumerate(iter_array):
      features = _read_feature_vector(genre_dir + "/" + file_list[i])
      if position < split_at:
        x_train[genre].append(features)
        y_train[genre].append(1)
      else:
        x_test[genre].append(features)
        y_test[genre].append(1)

      # Every file of this genre is a candidate negative for all other genres.
      for other_genre in genre_list:
        if other_genre != genre:
          x_not_in_genre[other_genre].append(features)

  totals = [0, 0, 0, 0]
  for genre in genre_list:
    negatives = x_not_in_genre[genre]
    # Aim for as many test negatives as test positives.
    num_test_negs = len(x_test[genre])
    available = len(negatives) - num_test_negs
    if train_all:
      num_train_negs = available
    else:
      num_train_negs = min(int(len(x_train[genre]) * neg_to_pos_ratio), available)

    # Too few negatives for a balanced test set: split them half and half.
    if num_train_negs < 0:
      num_test_negs = num_train_negs = int(len(negatives)/2)
    if num_train_negs == 0:
      return [0, 0, 0, 0]

    neg_features = np.array(negatives)
    np.random.shuffle(neg_features)
    neg_features = list(neg_features)

    test_negs = neg_features[:num_test_negs]
    train_negs = neg_features[num_test_negs:num_test_negs + num_train_negs]
    x_test[genre].extend(test_negs)
    y_test[genre].extend([0] * len(test_negs))
    x_train[genre].extend(train_negs)
    y_train[genre].extend([0] * len(train_negs))

    classifier = LogisticRegression(solver='liblinear', random_state=0)
    classifier.fit(x_train[genre], y_train[genre])
    predicted = classifier.predict(x_test[genre])

    tp = tn = fp = fn = 0
    for guess, truth in zip(predicted, y_test[genre]):
      if guess == truth:
        if guess == 1:
          tp += 1
        else:
          tn += 1
      else:
        if guess == 1:
          fp += 1
        else:
          fn += 1

    # The "+1" terms keep every ratio defined when a denominator would be 0,
    # at the price of a small upward bias in the reported numbers.
    print(genre)
    print("\tTotal", tp+tn+fp+fn)
    print("\tTP, TN, FP, FN:", tp, tn, fp, fn)
    print("\tAccuracy:", (tp+tn+1)/(tp+tn+fp+fn+1))
    print("\tPrecision:", (tp+1)/(tp+fp+1))
    print("\tRecall:", (tp+1)/(tp+fn+1))

    for k, count in enumerate((tp, tn, fp, fn)):
      totals[k] += count

  # Callers such as pairwise_testing index the result, so always return counts.
  return totals



In [None]:
# Pairwise testing
# pairwise_testing() appends one [accuracy, "(GenreA, GenreB)"] entry per
# genre pair to this module-level list and sorts it in place when it finishes.
all_pairs = []

def pairwise_testing(num_iters):
  """Score every unordered pair of genres and record the results.

  For each pair taken from the global ``genres_and_size`` list,
  ``predict_genres`` is called ``num_iters`` times (ratio 1, ``train_all``
  false). The first four values of each result are averaged over the runs
  and combined into an add-one-smoothed accuracy. Each
  ``[accuracy, "(GenreA, GenreB)"]`` entry is printed and appended to the
  global ``all_pairs`` list, which is sorted in place once all pairs are done.

  Args:
    num_iters: Number of predict_genres runs to average per pair.
  """
  genre_count = len(genres_and_size)
  for first in range(genre_count):
    for second in range(first + 1, genre_count):
      values = [0, 0, 0, 0]
      for _ in range(num_iters):
        counts = predict_genres([genres_and_size[first][1], genres_and_size[second][1]], 1, False)
        values = [running + counts[k] for k, running in enumerate(values)]
      values = [running / num_iters for running in values]

      correct = values[0] + values[1]
      accuracy = (correct + 1) / (correct + values[2] + values[3] + 1)
      label = "(" + ", ".join((genres_and_size[first][1], genres_and_size[second][1])) + ")"
      tup = [accuracy, label]
      print(tup)
      all_pairs.append(tup)

  all_pairs.sort()

# Evaluate every genre pair, averaging the counts over 5 predict_genres runs
# per pair; results accumulate in all_pairs.
pairwise_testing(5)

[0.5656652360515021, '(Drama, Thriller)']
[0.4995708154506438, '(Drama, Comedy)']
[0.559656652360515, '(Drama, Action)']
[0.5021459227467812, '(Drama, Crime)']
[0.5021459227467812, '(Drama, Romance)']
[0.5021459227467812, '(Drama, Adventure)']
[0.5021459227467812, '(Drama, Sci-Fi)']
[0.5021459227467812, '(Drama, Horror)']
[0.6763005780346821, '(Drama, Fantasy)']
[0.6882352941176471, '(Drama, Mystery)']
[0.8602941176470589, '(Drama, Family)']
[0.8731343283582089, '(Drama, Animation)']
[0.9, '(Drama, War)']
[0.9140625, '(Drama, Musical)']
[0.9512195121951219, '(Drama, Western)']
[0.9831932773109243, '(Drama, Music)']
[0.9831932773109243, '(Drama, Film-Noir)']
[0.9915254237288136, '(Drama, Short)']
[0.9915254237288136, '(Drama, History)']
[0.9915254237288136, '(Drama, Biography)']
[0.9915254237288136, '(Drama, Sport)']
[0.7430463576158941, '(Thriller, Comedy)']
[0.5245033112582782, '(Thriller, Action)']
[0.5033112582781457, '(Thriller, Crime)']
[0.713907284768212, '(Thriller, Romance)']
[

In [None]:
# Keep only the pairs whose genres are both among the largest ones.
# genres_and_size is sorted by file count (descending), so a prefix slice
# selects the biggest genres; slicing also copes with fewer than
# num_top_genres genres being loaded (e.g. with a narrowed g_mn/g_mx window).
num_top_genres = 11
genres_cut = [entry[1] for entry in genres_and_size[:num_top_genres]]

all_pairs_cut = []
for entry in all_pairs:
  # Labels have the form "(GenreA, GenreB)": strip the parentheses and
  # split on the first ", ".
  g1, g2 = entry[1][1:-1].split(", ", 1)
  if g1 in genres_cut and g2 in genres_cut:
    all_pairs_cut.append(entry)

for entry in all_pairs_cut:
  print(entry)

[0.4995708154506438, '(Drama, Comedy)']
[0.500709219858156, '(Comedy, Mystery)']
[0.5015873015873016, '(Sci-Fi, Mystery)']
[0.5021459227467812, '(Drama, Adventure)']
[0.5021459227467812, '(Drama, Crime)']
[0.5021459227467812, '(Drama, Horror)']
[0.5021459227467812, '(Drama, Romance)']
[0.5021459227467812, '(Drama, Sci-Fi)']
[0.5033112582781457, '(Thriller, Adventure)']
[0.5033112582781457, '(Thriller, Crime)']
[0.5033112582781457, '(Thriller, Fantasy)']
[0.5033112582781457, '(Thriller, Horror)']
[0.5033112582781457, '(Thriller, Mystery)']
[0.5033112582781457, '(Thriller, Sci-Fi)']
[0.5035460992907801, '(Comedy, Fantasy)']
[0.5035460992907801, '(Comedy, Romance)']
[0.5042735042735043, '(Action, Horror)']
[0.5042735042735043, '(Action, Mystery)']
[0.5042735042735043, '(Action, Sci-Fi)']
[0.5060240963855421, '(Crime, Fantasy)']
[0.5060240963855421, '(Crime, Mystery)']
[0.5064935064935064, '(Adventure, Fantasy)']
[0.5064935064935064, '(Adventure, Mystery)']
[0.5076923076923077, '(Action, F

In [None]:
# Scratch cell: commented-out predict_genres calls, each followed by a bare
# string literal that appears to hold console output pasted from an earlier
# run. The string literals are no-op expressions kept only as a record.
#predict_genres(["Romance", "Horror"], 1, False) #USE THIS

# Presumably the output of the call above. The bare number lines
# (e.g. "39 110 149") are not printed by the current predict_genres --
# likely debug prints from an older version; TODO confirm.
"""
39 110 149
Romance
	Total 78
	TP, TN, FP, FN: 34 31 8 5
	Accuracy: 0.8354430379746836
	Precision: 0.813953488372093
	Recall: 0.875
30 119 192
Horror
	Total 60
	TP, TN, FP, FN: 27 25 5 3
	Accuracy: 0.8688524590163934
	Precision: 0.8484848484848485
	Recall: 0.9032258064516129
  """

#predict_genres(["Action", "Crime", "Adventure", "Sci-Fi"], 1, False)

# Other genre combinations that were tried, parked inside a string literal.
"""
predict_genres(["Horror", "Romance", "Musical"], 1, False) #USE THIS
predict_genres(["Horror", "Romance"], 1, False) #USE THIS
predict_genres(["Action", "Crime", "Adventure", "Sci-Fi"], 1, False)
predict_genres(["Action", "Romance", "Crime", "Horror"], 1, False)

"""

# Presumably the recorded output for the four-genre
# Action/Crime/Adventure/Sci-Fi run.
"""Action
	Total 116
	TP, TN, FP, FN: 26 37 21 32
	Accuracy: 0.5470085470085471
	Precision: 0.5625
	Recall: 0.4576271186440678
41 160 635
Crime
	Total 82
	TP, TN, FP, FN: 22 25 16 19
	Accuracy: 0.5783132530120482
	Precision: 0.5897435897435898
	Recall: 0.5476190476190477
38 152 646
Adventure
	Total 76
	TP, TN, FP, FN: 17 23 15 21
	Accuracy: 0.5324675324675324
	Precision: 0.5454545454545454
	Recall: 0.46153846153846156
31 124 681
Sci-Fi
	Total 62
	TP, TN, FP, FN: 12 13 18 19
	Accuracy: 0.4126984126984127
	Precision: 0.41935483870967744
	Recall: 0.40625"""

#predict_genres(["Sci-Fi", "Action"], 1, False)
# Presumably the recorded output for the Sci-Fi vs. Action run; in it the
# Action classifier predicted the positive class for every test example
# (TN = 0 and FN = 0).
"""
Sci-Fi
	Total 62
	TP, TN, FP, FN: 16 20 11 15
	Accuracy: 0.5873015873015873
	Precision: 0.6071428571428571
	Recall: 0.53125
Action
	Total 116
	TP, TN, FP, FN: 58 0 58 0
	Accuracy: 0.5042735042735043
	Precision: 0.5042735042735043
	Recall: 1.0"""


Sci-Fi
	Total 62
	TP, TN, FP, FN: 16 20 11 15
	Accuracy: 0.5873015873015873
	Precision: 0.6071428571428571
	Recall: 0.53125
Action
	Total 116
	TP, TN, FP, FN: 58 0 58 0
	Accuracy: 0.5042735042735043
	Precision: 0.5042735042735043
	Recall: 1.0
