In [4]:
import pandas as pd
import matplotlib.pyplot as plt
# Read the raw movie table from the working directory. The encoding is passed
# explicitly as latin-1 (presumably because the file is not valid UTF-8 -- TODO confirm).
data = pd.read_csv(filepath_or_buffer="movies.csv", encoding="latin-1")

# Preview the first rows as the cell output to sanity-check the load.
data.head()

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986


In [5]:
# Data understanding: print, for every column of `data`, how often each
# distinct value occurs (most frequent first).
for _, column_values in data.items():
    print(column_values.value_counts())

0.0            2182
20000000.0      207
30000000.0      191
25000000.0      177
15000000.0      170
               ... 
29000001.0        1
128000000.0       1
24500000.0        1
300000.0          1
5800000.0         1
Name: budget, Length: 351, dtype: int64
Universal Pictures                        302
Warner Bros.                              294
Paramount Pictures                        259
Twentieth Century Fox Film Corporation    205
New Line Cinema                           172
                                         ... 
Magnum Pictures Inc.                        1
Armory Films                                1
Budapest Stúdió Vállalat                    1
Buena Vista Pictures                        1
DJ Films                                    1
Name: company, Length: 2179, dtype: int64
USA                               4872
UK                                 698
France                             283
Canada                             150
Germany                            1

In [5]:
# Get the year and name of every movie whose budget is recorded as 0, and turn
# the name into the underscore form used in a page URL ("Iron Eagle" -> "Iron_Eagle").
# .loc selects rows and columns in one step and .copy() makes the result an
# independent frame, so the column assignment below does not write into a
# slice of `data` (which would raise SettingWithCopyWarning).
noBudgetMovies = data.loc[data["budget"] == 0, ["year", "name"]].copy()

# regex=False: the space is a literal character, not a pattern.
noBudgetMovies["name"] = noBudgetMovies["name"].str.replace(" ", "_", regex=False)
noBudgetMovies.head()

Unnamed: 0,year,name
25,1986,Short_Circuit
26,1986,The_Name_of_the_Rose
27,1986,Iron_Eagle
32,1986,Betty_Blue
35,1986,The_Karate_Kid_Part_II


In [4]:
from bs4 import BeautifulSoup
import requests
import time

def getPage(movieName):
    """Fetch the parsed Wikipedia page for ``movieName`` through the MediaWiki API.

    Parameters
    ----------
    movieName : str
        Page title to request (spaces already replaced by underscores by the caller).

    Returns
    -------
    dict
        The decoded JSON body of the ``action=parse`` response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the request exceeds the timeout.
    ValueError
        If the response body is not valid JSON.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "parse",
        "page": movieName,
        "format": "json"
    }
    # The context manager releases the session's pooled connections when the
    # call finishes; previously a new session was opened per movie and never closed.
    with requests.Session() as S:
        # Without a timeout a stalled connection blocks the whole scrape loop indefinitely.
        R = S.get(url=URL, params=PARAMS, timeout=30)
        DATA = R.json()
    return DATA

#Getting the budget value
def getBudget(DATA):
    """Extract the budget entry from a parsed Wikipedia API response.

    Parameters
    ----------
    DATA : dict
        JSON body returned by ``getPage``; the rendered page HTML is read from
        ``DATA["parse"]["text"]["*"]``.

    Returns
    -------
    object
        The first child node of the infobox cell that sits in the same row as
        the "Budget" header, or one of the sentinel strings:

        * ``"parseError"`` -- ``DATA`` has no ``parse -> text -> *`` entry
        * ``"redirect"``   -- the page has no ``infobox vevent`` table
        * ``"noBudget"``   -- the infobox has no (non-empty) "Budget" row
    """
    # Pull the HTML out first, so that only a malformed/missing payload is
    # reported as "parseError" and unrelated failures are not swallowed.
    try:
        html = DATA["parse"]["text"]["*"]
    except (KeyError, TypeError):
        return "parseError"

    soup = BeautifulSoup("<html>"+str(html)+"</html>", features="html.parser")
    table = soup.find("table", class_="infobox vevent")
    if table is None:
        return "redirect"

    # Read header and value from the same <tr>. Indexing a flat list of <td>
    # cells with the position of the <th> is only correct when every row has
    # exactly one of each, which rows holding only a <th> or only a <td> break.
    for row in table.find_all("tr"):
        header = row.find("th")
        if header is None or "Budget" not in header.get_text():
            continue
        cell = row.find("td")
        if cell is not None and cell.contents:
            return cell.contents[0]

    return "noBudget"


errors = []
budgets = []

# getBudget() reports a failure by returning one of these sentinel strings;
# each maps to the message prefix recorded in `errors`.
failureMessages = {
    "redirect": "Table was NoneType. probably wrong page. Movie: ",
    "noBudget": "No budget found. Movie: ",
    "parseError": "No page found. Movie: ",
}

for movieName in noBudgetMovies["name"]:
    pageData = getPage(movieName)
    budget = getBudget(pageData)

    # The sentinels are produced by getBudget(), not getPage(), so the result
    # of getBudget() is what has to be checked. Comparing the page dict to the
    # strings never matched, and the sentinels ended up in `budgets`.
    # isinstance() guards the dict lookup against non-string results.
    if isinstance(budget, str) and budget in failureMessages:
        errors.append(failureMessages[budget]+movieName)
        continue

    budgets.append(budget)

KeyboardInterrupt: 

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

## strings to numbers
# Replace every text column of `data` with integer codes, in place.
# One encoder is fitted and kept per column, so the code -> original value
# mapping stays available through `encoders[column].inverse_transform(...)`.
# Re-fitting a single shared encoder overwrote the mapping of every column but the last.
categoricalColumns = [
    'company', 'country', 'director', 'genre', 'name',
    'rating', 'star', 'writer', 'released',
]

encoders = {}
for column in categoricalColumns:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Backward compatibility: `labelEncoder` used to end up fitted on the last
# encoded column ('released'); keep that name bound to the same state.
labelEncoder = encoders['released']


data.head(20)

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,665,54,2192,1,52287414.0,4661,8,39,89,8.1,2454,299174,3716,1986
1,6000000.0,1680,54,1296,4,70136369.0,1821,7,27,103,7.8,1605,264740,1970,1986
2,15000000.0,1680,54,2645,0,179800601.0,6203,6,23,110,6.9,2345,236909,1852,1986
3,18500000.0,2062,54,1066,0,85160248.0,295,8,33,137,8.4,2192,540152,1629,1986
4,9000000.0,2122,54,2123,1,18564613.0,1872,6,36,90,6.9,1139,36636,2549,1986
5,6000000.0,1158,53,1948,6,138530565.0,3903,8,75,120,8.1,369,317585,2988,1986
6,25000000.0,1160,53,1207,1,12729917.0,2883,6,30,101,7.4,528,102879,977,1986
7,6000000.0,768,54,588,6,8551228.0,771,8,51,120,7.8,924,146768,897,1986
8,9000000.0,1680,54,1004,4,40471663.0,3964,7,9,96,6.8,1730,60565,1970,1986
9,15000000.0,1854,54,557,6,40456565.0,5306,8,38,96,7.5,1039,129698,1333,1986


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Target is the box-office gross: the following cell stores the predictions in
# a "gross" column, and "gross" is the column removed from the features.
# The previous target ("name") was also left inside the features, and a
# classifier cannot model a continuous amount -- hence a regressor.
# NOTE(review): target choice is inferred from the surrounding cells -- confirm.
labels = data["gross"]

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 no longer accepts.
movie_data = data.drop(columns=["gross"])

X_train, X_test, y_train, y_test = train_test_split(movie_data, labels, test_size = 0.3, random_state = 0)

#predicting
# random_state makes the forest reproducible across re-runs.
rf = RandomForestRegressor(n_estimators = 200, random_state = 0).fit(X_train, y_train)
predictions = rf.predict(X_test)

# Show a preview only; the full list is one value per test row.
predictions[:20].tolist()

In [None]:
# Attach the predictions to the test features as a "gross" column.
# assign() builds a new frame instead of writing into the frame handed back by
# train_test_split, which can raise SettingWithCopyWarning.
X_test = X_test.assign(gross=predictions)