In [2]:
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("movies.csv", encoding='latin-1')
data.head()

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986


In [3]:
pd.options.display.float_format = '{:.2f}'.format
#pd.reset_option('^display.', silent=True)

In [4]:
#Data understanding
for column in data.columns:
    print(data[column].value_counts())

0.00            2182
20000000.00      207
30000000.00      191
25000000.00      177
15000000.00      170
10000000.00      155
40000000.00      150
35000000.00      149
50000000.00      115
5000000.00       113
60000000.00      107
12000000.00      102
6000000.00        90
8000000.00        88
18000000.00       88
13000000.00       76
7000000.00        73
3000000.00        73
45000000.00       72
17000000.00       65
80000000.00       65
70000000.00       65
11000000.00       62
16000000.00       60
14000000.00       58
22000000.00       57
4000000.00        55
75000000.00       54
55000000.00       54
100000000.00      49
                ... 
260000000.00       1
8900000.00         1
12800000.00        1
10920000.00        1
63600000.00        1
9200000.00         1
65000.00           1
126000000.00       1
29750000.00        1
17700000.00        1
176000000.00       1
169000.00          1
26350000.00        1
5600000.00         1
230000000.00       1
144000000.00       1
11800000.00  

# Prepearing the dataset for processing

In [5]:
#Get the names of movies with no budget and format the string for a URL
noBudgetMovies = data[data["budget"]==0][["year","name"]]
noBudgetMovies["name"] = noBudgetMovies["name"].str.replace(" ","_")
noBudgetMovies.head()

Unnamed: 0,year,name
25,1986,Short_Circuit
26,1986,The_Name_of_the_Rose
27,1986,Iron_Eagle
32,1986,Betty_Blue
35,1986,The_Karate_Kid_Part_II


### Requesting the missing budget values from Wikipedia API

In [6]:
from bs4 import BeautifulSoup
import requests

def getPage(movieName):
    S = requests.Session()
    URL = "https://en.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "parse",
        "page": movieName,
        "format": "json"
    }
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    return DATA

#Getting the budget value
def getBudget(DATA):
    try:
        soup = BeautifulSoup("<html>"+str(DATA["parse"]["text"]["*"])+"</html>",features="html.parser")
    except:
        return "parseError"
    table = soup.find("table",class_="infobox vevent")

    try:
        thList = table.find_all("th")
        tdList = table.find_all("td")
    except:
        return "redirect"

    budgetIndex = -1
    i=0
    for row in thList:
        if "Budget" in str(row):
            budgetIndex = i
            break
        i += 1

    if budgetIndex == -1:
        return "noBudget"
    
    budget = tdList[budgetIndex].contents
    
    try:
        if len(budget)==1 or len(budget)==2 and "reference" in budget[1]["class"]:
            return tdList[budgetIndex].contents[0]
    except:
        pass
    return budget

In [None]:
#Sends GET requests to wikipedia API to get the missing budget values
#Takes a long time ~35min
dict = {}

for movieName in noBudgetMovies["name"]:
    pageData = getPage(movieName)
    budget = getBudget(pageData)
    
    if budget=="redirect":
        name = movieName+"_(film)"
        pageData = getPage(name)
        budget = getBudget(pageData)
        if budget=="redirect":
            year = noBudgetMovies[noBudgetMovies["name"]==movieName]["year"].values[0]
            name = movieName+"_("+str(year)+"_film)"
            pageData = getPage(name)
            budget = getBudget(pageData)
            if budget=="redirect":
                dict[movieName] = "redirect. last tried name: "+name
                continue
    if budget=="noBudget":
        dict[movieName] = "noBudget"
        continue
    if budget=="parseError":
        dict[movieName] = "parseError"
        continue
    dict[movieName] = budget

In [None]:
#DO NOT RUN AGAIN IF YOU ALREADY HAVE RESULTS IN A FILE OR MAKE A BACKUP FIRST, BECAUSE API REQUESTS TAKE A LONG TIME TO GET!!!
#Writing dict to file for further processing. 
with open("wikiResults.txt","w",encoding="utf-8") as file:
    for key, value in dict.items():
        file.write(key+"==="+str(value).replace("\n"," ")+"\n")

### Take the budget values and parse them into floats

In [7]:
#Read the results from the file onto a new dictionary
dictFile = {}
with open("wikiResults.txt","r",encoding="utf-8") as file:
    lines = file.readlines()
for line in lines:
    if line=="":
        continue
    split = line.split("===")
    dictFile[split[0]] = split[1].replace("\n","")

In [8]:
def findNumericDigits(string):
    index = 0
    i =0
    for char in string:
        if char.isnumeric():
            index = i
            break
        i += 1
    return index

def parseBudgetValue(budget):
    currencies = {
        "£": 1.33,
        "€": 1.11,
        "AU$": 0.688
    }
    
    try:
        if "sup" in budget:
            return budget
        if "million" in budget or "Million" in budget:
            row = budget.split(" ")
            firstNr = findNumericDigits(row[0])
            value = budget[:firstNr]+str(float(row[0][firstNr:])*1000000)
        elif ","in budget:
            value = budget.replace(",","")
        else:
            value = budget

        if value[0] == "$":
            return float(value[1:])
        elif value[:3] == "US$":
            return float(value[3:])
        elif value[0] == "£":
            return float(value[1:])*currencies["£"]
        elif value[0] == "€":
            return float(value[1:])*currencies["€"]
        elif value[:3] == "AU$":
            return float(value[3:])*currencies["AU$"]
        elif value[:2] == "A$":
            return float(value[2:])*currencies["AU$"]
        else:
            return budget
    except:
        return budget

In [9]:
#Parse the value into a float
for key, value in dictFile.items():
    dictFile[key] = parseBudgetValue(value)

In [12]:
#Manual fixing of remaining errors
errors = ["noBudget","parseError","redirect"]
moviesWithErrors = []
for key, value in dictFile.items():
    if type(value)!= float and value not in errors:
        moviesWithErrors.append(key)
        print(key,value)
        budget = input("Enter new budget value: ")
        if budget == "":
            continue
        dictFile[key] = budget

Summer redirect. last tried name: Summer_(1986_film)
Enter new budget value: 
Link Unknown
Enter new budget value: 
Kangaroo redirect. last tried name: Kangaroo_(1986_film)
Enter new budget value: 
Wings_of_Desire ['5 million ', <a href="/wiki/Deutsche_Mark" title="Deutsche Mark">DM</a>, <sup class="reference" id="cite_ref-FOOTNOTELüdiLüdi200060_3-0"><a href="#cite_note-FOOTNOTELüdiLüdi200060-3">[3]</a></sup>]
Enter new budget value: 5 million Deutsche_Mark
Who's_That_Girl $17–20 million ($37.49 million in 2018)
Enter new budget value: $18.5 million
Wanted:_Dead_or_Alive $4,500,000 (US)
Enter new budget value: $4,500,000
Wisdom ['$6.5 million', <sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>, <sup class="reference" id="cite_ref-globe_3-0"><a href="#cite_note-globe-3">[3]</a></sup>]
Enter new budget value: $6.5 million
The_Allnighter approx $1 million
Enter new budget value: $1 million
The_Bedroom_Window ['$8.3 million', <sup class="reference" id="cite_ref-cu

Princess_Mononoke Japan
Enter new budget value: $23.5 million
Perfect_Blue [<span style="white-space: nowrap">¥90 million</span>, <sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>, ' (', <span style="white-space: nowrap">$6,875,200</span>, ')', <sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>]
Enter new budget value: $6,875,200
Open_Your_Eyes ESP 370 million
Enter new budget value: 
The_Castle redirect. last tried name: The_Castle_(1997_film)
Enter new budget value: 
Fireworks redirect. last tried name: Fireworks_(1997_film)
Enter new budget value: $2.3 million
Murder_at_1600 $40-50 million
Enter new budget value: $45 million
Lock,_Stock_and_Two_Smoking_Barrels <div class="plainlist"> <ul><li><a href="/wiki/Pound_sterling" title="Pound sterling">£</a>800,000</li> <li>(<a href="/wiki/United_States_dollar" title="United States dollar">$</a>1.35 million)</li></ul> </div>
Enter new budget value: $1.35 million
Show_Me_Love [<a href="/wiki

Enter new budget value: $27 million
Lady_Vengeance South Korea
Enter new budget value: $4.5 million
London redirect. last tried name: London_(2005_film)
Enter new budget value: 
Idiocracy $2–4 million
Enter new budget value: $3 million
The_Host South Korea
Enter new budget value: US$11 million
Tenacious_D_in_The_Pick_of_Destiny ['$19-20 million', <sup class="reference" id="cite_ref-nyt_1-0"><a href="#cite_note-nyt-1">[1]</a></sup>, <sup class="reference" id="cite_ref-mojo_2-0"><a href="#cite_note-mojo-2">[2]</a></sup>]
Enter new budget value: $19.5 million
Rang_De_Basanti [<span style="white-space: nowrap">₹</span>, '280', <span class="nowrap"> </span>, 'million', <sup class="reference" id="cite_ref-boxofficeindia.com_1-0"><a href="#cite_note-boxofficeindia.com-1">[1]</a></sup>]
Enter new budget value: 
The_Wind_That_Shakes_the_Barley ['€6.5 million', <sup class="reference" id="cite_ref-element_2-4"><a href="#cite_note-element-2">[2]</a></sup>, <sup class="reference" id="cite_ref-numbe

Enter new budget value: 
A_Hijacking ['DKK 15 million', <sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>, ' (approx. $2.6 million)']
Enter new budget value: $2.6 million
Short_Term_12 under $1 million
Enter new budget value: $1 million
Europa_Report Less than $10 million
Enter new budget value: $10 million
Austenland [<a href="/wiki/Pound_sterling" title="Pound sterling">£</a>, '4.9 million', <br/>, '(', <a href="/wiki/United_States_dollar" title="United States dollar">$</a>, '7.6 million)']
Enter new budget value: $7.6 million
In_a_World... Less than $1 million
Enter new budget value: $1 million
The_Tale_of_the_Princess_Kaguya Japan
Enter new budget value: $49 million
Jimi:_All_Is_by_My_Side ['$5 million', <sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>, <sup class="reference" id="cite_ref-TN_2-0"><a href="#cite_note-TN-2">[2]</a></sup>]
Enter new budget value: $5 million
The_Congress ['€8 million', <sup class="noprint Inline-Temp

In [20]:
#DO NOT RUN AGAIN. THIS WRITES THE MANUALLY INSERTED VALUES INTO A FILE
#Backup the dictFile with the new manually inserted values
with open("manuallyInserted.txt","w",encoding="utf-8") as file:
    for key, value in dictFile.items():
        file.write(key+"==="+str(value).replace("\n"," ")+"\n")

In [10]:
#Run after fresh start to get the manually fixed budget values
with open("manuallyInserted.txt","r",encoding="utf-8") as file:
    lines = file.readlines()
for line in lines:
    if line=="":
        continue
    split = line.split("===")
    dictFile[split[0]] = split[1].replace("\n","")

### Add the newfound values into the set

In [14]:
for key, value in dictFile.items():
    if type(value)==float:
        data[data["name"] == key.replace("_"," ")] = data[data["name"] == key.replace("_"," ")].replace(to_replace=0.0,value=value)

### Encoding labels

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

##strings to numbers
labelEncoder = LabelEncoder()
labelEncoder.fit(data['company'])
data['company'] = labelEncoder.transform(data['company'])

labelEncoder.fit(data['country'])
data['country'] = labelEncoder.transform(data['country'])

labelEncoder.fit(data['director'])
data['director'] = labelEncoder.transform(data['director'])

labelEncoder.fit(data['genre'])
data['genre'] = labelEncoder.transform(data['genre'])

labelEncoder.fit(data['name'])
data['name'] = labelEncoder.transform(data['name'])

labelEncoder.fit(data['rating'])
data['rating'] = labelEncoder.transform(data['rating'])

labelEncoder.fit(data['star'])
data['star'] = labelEncoder.transform(data['star'])

labelEncoder.fit(data['writer'])
data['writer'] = labelEncoder.transform(data['writer'])

labelEncoder.fit(data['released'])
data['released'] = labelEncoder.transform(data['released'])


data.head(20)

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,665,54,2192,1,52287414.0,4661,8,39,89,8.1,2454,299174,3716,1986
1,6000000.0,1680,54,1296,4,70136369.0,1821,7,27,103,7.8,1605,264740,1970,1986
2,15000000.0,1680,54,2645,0,179800601.0,6203,6,23,110,6.9,2345,236909,1852,1986
3,18500000.0,2062,54,1066,0,85160248.0,295,8,33,137,8.4,2192,540152,1629,1986
4,9000000.0,2122,54,2123,1,18564613.0,1872,6,36,90,6.9,1139,36636,2549,1986
5,6000000.0,1158,53,1948,6,138530565.0,3903,8,75,120,8.1,369,317585,2988,1986
6,25000000.0,1160,53,1207,1,12729917.0,2883,6,30,101,7.4,528,102879,977,1986
7,6000000.0,768,54,588,6,8551228.0,771,8,51,120,7.8,924,146768,897,1986
8,9000000.0,1680,54,1004,4,40471663.0,3964,7,9,96,6.8,1730,60565,1970,1986
9,15000000.0,1854,54,557,6,40456565.0,5306,8,38,96,7.5,1039,129698,1333,1986


### Drop all movies with no budget

In [16]:
data = data[data["budget"]!=0]

# Working with the data

In [17]:
labels = data["gross"]
movie_data = data.drop(["gross"],1)

X_train, X_test, y_train, y_test = train_test_split(movie_data, labels, test_size = 0.3, random_state = 0)

In [24]:
rf = RandomForestClassifier(criterion="gini",n_estimators=80, max_depth=160).fit(X_train, y_train)
predictions = rf.predict(X_test)
pList = predictions.tolist()

In [25]:
accuracyResults = pd.DataFrame(columns=["name","realGross","predictedGross","difference"])
for i in range(len(pList)):
    row = X_test.iloc[i]
    label = y_test.iloc[i]
    accuracyResults = accuracyResults.append({"name":row["name"],"realGross":label,"predictedGross":pList[i],"difference":abs(label-pList[i])},ignore_index=True)

with open("modelsUsed.txt","a",encoding="utf-8") as f:
    f.write("model==="+str(rf)+"\tmean==="+str(accuracyResults["difference"].mean())+"\n")
accuracyResults["difference"].describe()

count        1392.00
mean     33479617.27
std      52745095.20
min          1237.00
25%       5263446.75
50%      17734627.00
75%      40898209.75
max     863584125.00
Name: difference, dtype: float64

In [34]:
testDf = X_train.copy()
testDf.add(["test"])

ValueError: Unable to coerce to Series, length must be 14: given 1

In [24]:
predictedDF["gross"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
