## This notebook prepares the training data used in training GPT-2

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import time
import json
import re
from collections import Counter
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

IMDb movie data is pulled into a dataframe

In [2]:
# Load the IMDb movie table from the local data/ directory into a DataFrame.
df = pd.read_csv("data/IMDb movies.csv", encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first three rows to sanity-check the load.
df.head(3)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0


In [4]:
# List the available columns; `description` is the one used for training below.
df.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

Filter out missing descriptions and incomplete ones (those ending in "...")

In [5]:
# Keep only rows that have a description at all.
temp = df.loc[df.description.notnull()].copy()
# Drop descriptions that end in a literal "..." (treated as incomplete here).
# The pattern is a raw string: "\." is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape warning on modern Python versions.
df = temp[~temp.description.str.contains(r"\.{3}$", regex=True)].copy()

In [6]:
# Pull the remaining descriptions out of the frame as a plain Python list.
training_data = list(df["description"].values)

Check for duplicates

In [7]:
# Compare the total number of plots with the number of distinct plots.
n_plots = len(training_data)
n_unique = len(set(training_data))
n_dupe = n_plots - n_unique
for label, value in (
    ("number of plots       :", n_plots),
    ("number of unique plots:", n_unique),
    ("number of duplicates  :", n_dupe),
):
    print(label, value)

number of plots       : 59929
number of unique plots: 59806
number of duplicates  : 123


Investigate duplicates

In [8]:
def get_most_common(val_list, n):
    """Return the n most common values in val_list.

    Args:
        val_list: Iterable of hashable values to tally.
        n: Number of entries to return; None returns every distinct value.

    Returns:
        A list of (value, count) tuples ordered from most to least frequent.
        Values with equal counts keep the order in which they first appear
        in val_list.
    """
    # Counter tallies an iterable directly, so no hand-built dict is needed.
    return Counter(val_list).most_common(n)

In [9]:
# Show the ten most frequently repeated descriptions with their counts.
get_most_common(training_data, 10)

[('The story of', 15),
 ('Mail', 6),
 ('In this sequel to', 5),
 ('Based on', 5),
 ('The true story of', 5),
 ('Emil goes to Berlin to see his grandmother with a large amount of money and is offered sweets by a strange man that make him sleep. He wakes up at his stop with no money. It is up to him and a group of children to save the day.',
  4),
 ('Tom Sawyer and his pal Huckleberry Finn have great adventures on the Mississippi River, pretending to be pirates, attending their own funeral and witnessing a murder.',
  4),
 ('During World War II, a teenage Jewish girl named Anne Frank and her family are forced into hiding in the Nazi-occupied Netherlands.',
  4),
 ('Desperate measures are taken by a man who tries to save his family from the dark side of the law, after they commit an unexpected crime.',
  4),
 ('Dr. Henry Jekyll experiments with scientific means of revealing the hidden, dark side of man and releases a murderer from within himself.',
  3)]

In [10]:
# Show which movies carry two of the most common truncated descriptions.
df.loc[
    df["description"].isin(["The story of", "In this sequel to"]),
    ["title", "year", "description"],
]

Unnamed: 0,title,year,description
11147,Lawrence d'Arabia,1962,The story of
11176,Anna dei miracoli,1962,The story of
12292,Flagrante adulterio,1965,In this sequel to
18589,Oliver's Story,1978,In this sequel to
20403,Frances,1982,The story of
23930,Gorilla nella nebbia,1988,The story of
25300,Quei bravi ragazzi,1990,The story of
26206,Ritorno alla laguna blu,1991,In this sequel to
33741,Revelation,1999,In this sequel to
52421,Milk,2008,The story of


Filter out bad data

In [11]:
# Treat the five most repeated descriptions as bad data; keep only the text,
# dropping the count from each (text, count) pair.
bad_plots = [plot for plot, _count in get_most_common(training_data, 5)]

In [12]:
# Drop every description that exactly matches one of the bad plots.
training_data = [plot for plot in training_data if plot not in bad_plots]

In [13]:
# Re-count totals and distinct plots after removing the bad descriptions.
n_plots = len(training_data)
n_unique = len(set(training_data))
n_dupe = n_plots - n_unique
for label, value in (
    ("number of plots       :", n_plots),
    ("number of unique plots:", n_unique),
    ("number of duplicates  :", n_dupe),
):
    print(label, value)

number of plots       : 59893
number of unique plots: 59801
number of duplicates  : 92


Remove all remaining duplicates

In [14]:
# Remove duplicates while keeping first-seen order. A set's iteration order for
# strings changes between interpreter runs (hash randomization), which would
# make the generated training file differ from run to run.
training_data = list(dict.fromkeys(training_data))
print("number of unique plots:", len(training_data))

number of unique plots: 59801


Write plots to a text file, separated by a delimiter

In [15]:
# Delimiter string written after every plot in the training file.
delim = "<|endoftext|>"

In [16]:
# Write each plot followed by the delimiter, separated by blank lines.
# The `with` block closes the file even if one of the writes raises.
with open("plot_training.txt", "w", encoding='utf-8') as plots:
    for plot in training_data:
        plots.write("{}\n\n{}\n\n".format(plot, delim))

## Add a second dataset

In [17]:
# Load the second movie dataset; its `fullplot` column is used below.
df_other = pd.read_csv("data/movies_initial.csv", encoding='utf-8')

In [26]:
# Grab the row at position 2 as a one-row DataFrame (the list index keeps it 2-D).
# NOTE(review): `a` is not referenced again in the visible notebook.
a = df_other.iloc[[2]]

In [28]:
# Scratch check: swap the Unicode replacement character for "e" in a sample string.
s = "Oh shit, my pussy is on fir�!".replace("�", "e")
s

'Oh shit, my pussy is on fire!'

In [30]:
# Keep only the rows that actually have a full plot.
df_other = df_other[df_other["fullplot"].notna()]

In [31]:
# Pull the full plots out of the frame as a plain Python list.
training_data2 = list(df_other["fullplot"].values)

Check for any duplicates

In [32]:
# Compare the total number of full plots with the number of distinct ones.
n_plots = len(training_data2)
n_unique = len(set(training_data2))
n_dupe = n_plots - n_unique
for label, value in (
    ("number of plots       :", n_plots),
    ("number of unique plots:", n_unique),
    ("number of duplicates  :", n_dupe),
):
    print(label, value)

number of plots       : 41421
number of unique plots: 39987
number of duplicates  : 1434


In [33]:
# Show the eight most frequently repeated full plots with their counts.
get_most_common(training_data2, 8)

[('The plot is unknown.', 9),
 ('The plot is unknown at this time.', 9),
 ('Plot is unknown.', 9),
 ('In the middle of the 19th century, Kristina and Karl-Oskar live in a small rural village in Smaaland (southern Sweden). They get married and try to make a living on a small spot of land. However, the small size of their land, the infertile soil, and some bad harvests makes it tough. One of their children even starve to death. Thus, they decide to emigrate to the U.S. They meet a group of farmers with their families planing the emigration under the leadership of a banned priest. They sell everything and embark for the U.S. The journey on the sailing ship is long and tedious. Some of the emigrants will never reach the New World.',
  5),
 ("This film continues from where Utvandrarna (1971) left off. Starting a new life in the New World from almost nothing is not easy. The winters and summers are more extreme than in the Old World. But the immigrants are rewarded for their hard work. They 

In [34]:
# Show which movies carry the "plot is unknown" placeholder text.
df_other.loc[
    df_other["fullplot"].isin(
        ["The plot is unknown.", "The plot is unknown at this time."]
    ),
    ["title", "year", "plot", "fullplot"],
]

Unnamed: 0,title,year,plot,fullplot
30340,Pee-wee's Big Holiday,2016,The plot is unknown.,The plot is unknown.
33821,Ghostbusters,2016,The plot is unknown at this time.,The plot is unknown at this time.
33822,Ghostbusters,2016,The plot is unknown at this time.,The plot is unknown at this time.
35318,Bad Boys 3,2017,The plot is unknown at this time.,The plot is unknown at this time.
36800,Pee-wee's Big Holiday,2016,The plot is unknown.,The plot is unknown.
40080,Untitled Spider-Man Reboot,2017,The plot is unknown.,The plot is unknown.
40407,Prometheus 2,2017,The plot is unknown at this time.,The plot is unknown at this time.
41058,Goon: Last of the Enforcers,2016,The plot is unknown at this time.,The plot is unknown at this time.
41245,The Ridiculous 6,2015,The plot is unknown at this time.,The plot is unknown at this time.
41755,Star Trek Beyond,2016,The plot is unknown at this time.,The plot is unknown at this time.


In [35]:
# Treat the three most repeated full plots as placeholders; keep only the text,
# dropping the count from each (text, count) pair.
bad_plots2 = [plot for plot, _count in get_most_common(training_data2, 3)]

In [99]:
# Drop every full plot that exactly matches one of the placeholder plots.
# NOTE(review): this cell's saved execution count (99) is out of sequence, and
# the counts recorded after it equal the pre-filter counts, so the saved outputs
# appear not to reflect this filter. Restart the kernel and run all cells in order.
training_data2 = [plot for plot in training_data2 if plot not in bad_plots2]

In [36]:
# Re-count totals and distinct full plots after removing the placeholders.
n_plots = len(training_data2)
n_unique = len(set(training_data2))
n_dupe = n_plots - n_unique
for label, value in (
    ("number of plots       :", n_plots),
    ("number of unique plots:", n_unique),
    ("number of duplicates  :", n_dupe),
):
    print(label, value)

number of plots       : 41421
number of unique plots: 39987
number of duplicates  : 1434


In [37]:
# Look up every row whose full plot equals the most duplicated real synopsis.
emigrants_plot = "In the middle of the 19th century, Kristina and Karl-Oskar live in a small rural village in Smaaland (southern Sweden). They get married and try to make a living on a small spot of land. However, the small size of their land, the infertile soil, and some bad harvests makes it tough. One of their children even starve to death. Thus, they decide to emigrate to the U.S. They meet a group of farmers with their families planing the emigration under the leadership of a banned priest. They sell everything and embark for the U.S. The journey on the sailing ship is long and tedious. Some of the emigrants will never reach the New World."
df_other.loc[df_other["fullplot"] == emigrants_plot, ["title", "year", "plot", "fullplot"]]

Unnamed: 0,title,year,plot,fullplot
8441,The Emigrants,1971,"In the middle of the 19th century, Kristina an...","In the middle of the 19th century, Kristina an..."
26716,The Emigrants,1971,"In the middle of the 19th century, Kristina an...","In the middle of the 19th century, Kristina an..."
26717,The Emigrants,1971,"In the middle of the 19th century, Kristina an...","In the middle of the 19th century, Kristina an..."
26719,The Emigrants,1971,"In the middle of the 19th century, Kristina an...","In the middle of the 19th century, Kristina an..."
26720,The Emigrants,1971,"In the middle of the 19th century, Kristina an...","In the middle of the 19th century, Kristina an..."


In [38]:
# Remove duplicates while keeping first-seen order. A set's iteration order for
# strings changes between interpreter runs (hash randomization), which would
# make the generated training file differ from run to run.
training_data2 = list(dict.fromkeys(training_data2))
print("number of unique plots:", len(training_data2))

number of unique plots: 39987


In [40]:
# Combine the plots from both datasets into a single list.
all_training_data = [*training_data, *training_data2]
print("Total training samples:", len(all_training_data))

Total training samples: 99788


Make sure there are no duplicates in the final list

In [41]:
# Remove plots that appear in both datasets while keeping first-seen order.
# A set's iteration order for strings changes between interpreter runs (hash
# randomization), which would make the training file differ from run to run.
all_training_data = list(dict.fromkeys(all_training_data))
print("Total training samples:", len(all_training_data))

Total training samples: 95443


In [42]:
# Delimiter string written after every plot in the training file.
delim = "<|endoftext|>"

# Overwrite the training file with the combined, de-duplicated plots.
# The `with` block closes the file even if one of the writes raises.
with open("plot_training.txt", "w", encoding='utf-8') as plots:
    for plot in all_training_data:
        plots.write("{}\n\n{}\n\n".format(plot, delim))

## Add titles to training data

In [35]:
# Reload both raw datasets so the title/plot pairing starts from unfiltered data.
# NOTE(review): unlike the earlier loads, no explicit encoding is passed here;
# confirm against your pandas version that this decodes the files as intended.
df = pd.read_csv("data/IMDb movies.csv")
df2 = pd.read_csv("data/movies_initial.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
# IMDb data: require both a description and a title, then drop descriptions
# that end in a literal "..." (treated as incomplete here).
temp = df.loc[df.description.notnull()].copy()
temp = temp.loc[temp.title.notnull()].copy()
# The pattern is a raw string: "\." is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape warning on modern Python versions.
df = temp[~temp.description.str.contains(r"\.{3}$", regex=True)].copy()

# Second dataset: require both a full plot and a title.
temp2 = df2.loc[df2.fullplot.notnull()].copy()
df2 = temp2.loc[temp2.title.notnull()].copy()

In [37]:
# Build a "<title>$<plot>" string per row in each frame.
# NOTE(review): "$" may also occur inside titles or plots (e.g. money amounts),
# so it is not guaranteed to be a unique separator.
for frame, plot_col in ((df, "description"), (df2, "fullplot")):
    frame["training"] = frame["title"] + "$" + frame[plot_col]

In [38]:
# Extract the combined title/plot strings from both frames as Python lists.
training_data, training_data2 = (
    list(frame["training"].values) for frame in (df, df2)
)

In [41]:
# Swap every Unicode replacement character in the second dataset for "e".
# NOTE(review): the lost character is assumed to have been an accented "e";
# other original characters would be mapped incorrectly — confirm.
training_data2 = [sample.replace("�", "e") for sample in training_data2]

In [43]:
# Combine the title/plot strings from both datasets into a single list.
all_training_data = [*training_data, *training_data2]

In [44]:
# Number of title/plot samples before de-duplication.
print(len(all_training_data))

101350


In [45]:
# Remove duplicate title/plot strings while keeping first-seen order. A set's
# iteration order for strings changes between interpreter runs (hash
# randomization), which would make the training file differ from run to run.
all_training_data = list(dict.fromkeys(all_training_data))
print(len(all_training_data))

97860


In [46]:
# Write one record per sample: title, blank line, plot, then the end-of-text
# delimiter. The `with` block closes the file even if a write raises.
with open("title-plot_training.txt", "w", encoding='utf-8') as plots:
    for sample in all_training_data:
        # Split on the FIRST "$" only. Splitting on every "$" and keeping piece
        # [1] silently cut off any plot that itself contains a "$".
        # NOTE(review): a title that contains "$" is still split at its first
        # "$"; a separator that cannot occur in the data would avoid this.
        title, _, plot = sample.partition("$")
        plots.write("{}\n\n{}\n{}\n".format(title, plot, "<|endoftext|>"))