In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Clean up the car in movies raw dataset for 2010

### Create list of vehicles from the car sales data, for filtering cars in movies

In [None]:
# read in car sales data to use as a filter for the Car column
sales = pd.read_csv('output_data/Car_Sales_2009-2020_edited.csv')

# list comprehension to apply lower method to string elements in the Vehicle column
lower_cars = sales.copy()
lower_cars.Vehicle = [car.lower() for car in lower_cars.Vehicle]

# create list of cars with sales data
vehicles = lower_cars.Vehicle.unique().tolist()

# create empty list to store vehicle strings to remove
remove_veh = []

# loop through list of vehicles to create separate strings for vehicles with two titles
for vehicle in vehicles:
    if '/' in vehicle:
        # find combined terms to distinguish, separate on / delimiter
        veh_split = vehicle.split('/')
        # only first index string has brand attached, join brand to second index string
        veh_second = " ".join([veh_split[0].split(' ')[0], veh_split[1]])
        # attach both vehicle strings to the vehicles list...
        vehicles.append(veh_split[0])
        vehicles.append(veh_second)
        # ...and append original string to a list for deletion later
        remove_veh.append(vehicle)

# loop through remove_vehicle list to delete strings with / delimiter
for vehicle in remove_veh:
    vehicles.remove(vehicle)

# edit vehicles list using conditionals...
for x in range(len(vehicles)):
    if 'bmw' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('-series', '')
    elif 'mercedes-benz' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('-class', '')
    elif 'land rover' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('land rover', 'land-rover')
    elif 'ford' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('series', '')
    else:
        pass
    
# append three exceptions that have been left from the code
exceptions = ['scion im', 'scion fr-s', 'scion ia']
vehicles.extend(exceptions)

# join the list of vehicles using a | delimiter, creating a RegEx pattern to pass through str.contains() next cell
vehicle_var = "|".join(vehicles)

print(vehicles)

### Create sales DataFrame for processing car data

### Create function that returns final DataFrame for visualizations

In [None]:
# read in film data
movies = pd.read_csv('output_data/Top100_Movies_2010-2021.csv')

In [None]:
# read in raw data for cars in movies
raw_2010 = pd.read_csv("output_data/raw/cars in movies/2021_03_06_2010Cars.csv")

# return a DataFrame without any missing ratings element
clean_2010 = raw_2010.loc[raw_2010.Stars != 'Nan']

# convert Stars ratings column to integers
clean_2010.Stars = clean_2010.Stars.astype(int)

# return a new DataFrame that removes irrelevant car elements
new = clean_2010.loc[(~clean_2010.Car.str.contains('Unknown'))\
                     & (~clean_2010.Car.str.contains('unknown')) & (~clean_2010.Car.str.contains('Movie'))]

# reset the index
new = new.reset_index(drop=True)

# remove whitespaces at both sides of the string elements
for index, row in new.iterrows():
    new.Car[index] = new.Car[index].strip()
    
# list comprehension to apply lower method to string elements in the Car column
lower_car_mov = new.copy()
lower_car_mov.Car = [car.lower() for car in lower_car_mov.Car]

# create DataFrame where only the cars from our car sales data are included
cars_filtered = lower_car_mov.loc[lower_car_mov.Car.str.contains(vehicle_var)]

# create list from the Car column
cars_list = cars_filtered.Car.tolist()

# create a dummy list to fill with 'vehicles' data
vehicles_list = np.arange(0, len(cars_filtered), 1).tolist()

# reconstruct dummy list to contain the reference string for each Car element
for vehicle in vehicles:
    for x in range(len(cars_list)):
        if vehicle in cars_list[x]:
            vehicles_list[x] = vehicle

# remove column url
cars_filtered = cars_filtered.drop(['url'], axis = 1)
cars_filtered['Vehicle'] = vehicles_list

# create a DataFrame of only films from the top 2010 grossing films
df10 = movies.loc[movies.Year == 2010]

# create a DataFrame that merges the car and movie data
car_mov = pd.merge(cars_filtered, df10, how="left", on="Release")

# create a clean DataFrame that only keeps relevant columns
clean_car_mov = car_mov[['Release', 'Car', 'Stars', 'Vehicle', 'Rank', 'Gross', 'Distributor', 'Year_y']]
# rename the column for Year_y
clean_car_mov.rename(columns = {'Year_y':'Year'}, inplace = True)

# create lists/columns for the various counts of Vehicle appearances
appear_high = [clean_car_mov.loc[(clean_car_mov.Stars >= 3) & (clean_car_mov.Vehicle == x)].Vehicle.count() for x in vehicles_list]
appear_low = [clean_car_mov.loc[(clean_car_mov.Stars < 3) & (clean_car_mov.Vehicle == x)].Vehicle.count() for x in vehicles_list]
appear_total = [clean_car_mov.loc[clean_car_mov.Vehicle == x].Vehicle.count() for x in vehicles_list]

clean_car_mov['Strong Appearance'] = appear_high
clean_car_mov['Weak Appearance'] = appear_low
clean_car_mov['Total Appearance'] = appear_total

# create list/column for the Releases where the Vehicle had the strongest appearance
best_ranks = []
highest_star = []
for vehicle in vehicles_list:
    best_star = clean_car_mov.loc[clean_car_mov.Vehicle == vehicle].Stars.max()
    highest_star.append(best_star)
    
    best_rank = clean_car_mov.loc[(clean_car_mov.Vehicle == vehicle) & (clean_car_mov.Stars == best_star)]['Rank'].tolist()
    best_rank.sort()
    best_ranks.append(best_rank[0])
    
clean_car_mov['Best Rank'] = best_ranks
clean_car_mov['Highest Star'] = highest_star

# create empty dataframe to append to
clean_car_columns = clean_car_mov.columns.tolist()
final_df = pd.DataFrame(columns = clean_car_columns)

# creat list of unique cars, sorted
unique_cars = clean_car_mov.Vehicle.unique()
unique_cars.sort()

# append each unique car's data to the empty DataFrame
for vehicle in unique_cars:
    unique_df = clean_car_mov.loc[(clean_car_mov.Vehicle == vehicle)\
                                  & (clean_car_mov.Rank == clean_car_mov['Best Rank'])\
                                  & (clean_car_mov.Stars == clean_car_mov['Highest Star'])]
    final_df = final_df.append(unique_df)

# dropping duplicate values
final_df.drop_duplicates(subset ="Vehicle", keep='first', inplace=True)

# reset the index
final_df = final_df.reset_index(drop=True)




# create DataFrame from copy of sales data
car_sales_2010 = lower_cars[['Vehicle', '2009', '2010']]

# dropping the rows having NaN values
car_sales_2010 = car_sales_2010.dropna()

# create a % Change column
car_sales_2010['% Change'] = (car_sales_2010['2010'] - car_sales_2010['2009']) / car_sales_2010['2009']

### Exploratory work for the code above

In [2]:
# read in raw data for cars in movies
raw_2010 = pd.read_csv("output_data/raw/cars in movies/2021_03_06_2010Cars.csv")
raw_2010

Unnamed: 0,Release,Year,Car,url,Stars
0,Toy Story 3,2010,1998 Chevrolet Corvette C5,vehicle_369776-Chevrolet-Corvette-1998.html,3
1,Toy Story 3,2010,Opel Meriva [A],vehicle_974788-Opel-Meriva.html,2
2,Toy Story 3,2010,2005 Opel Zafira [B],vehicle_974791-Opel-Zafira-2005.html,2
3,Toy Story 3,2010,1979 Toyota Truck,vehicle_349573-Toyota-Truck-RN30-1979.html,2
4,Iron Man 2,2010,2010 Acura ZDX,vehicle_340464-Acura-ZDX-YB1-2010.html,2
...,...,...,...,...,...
2048,Daybreakers,2010,Rhino Buggies Hammer,vehicle_317794-Rhino-Buggies-Hammer.html,3
2049,Daybreakers,2010,2004 smart Fortwo,vehicle_317791-smart-Fortwo-450-2004.html,1
2050,Daybreakers,2010,1993 Toyota 4Runner,vehicle_236774-Toyota-4Runner-N130-1993.html,2
2051,Daybreakers,2010,2006 Toyota Prius II,vehicle_317792-Toyota-Prius-NHW20-2006.html,1


In [3]:
# how many null elements are in each column
raw_2010.isnull().sum()

Release    0
Year       0
Car        0
url        0
Stars      0
dtype: int64

In [4]:
# how common are some cars at the most frequent versus least
raw_2010.Car.value_counts()

1999 Ford Crown Victoria               22
2001 Ford Crown Victoria               17
unknown                                15
2003 Ford Crown Victoria               13
1997 Ford Econoline                     9
                                       ..
2009 Audi A5 Sportback B8               1
1947 Chevrolet Advance-Design 6700      1
Yamaha unknown                          1
Armet Gurkha F5                         1
2004 Scion xB                           1
Name: Car, Length: 1472, dtype: int64

In [5]:
# top 20 most movie appearances
raw_2010.Car.value_counts().nlargest(20)

1999 Ford Crown Victoria                      22
2001 Ford Crown Victoria                      17
unknown                                       15
2003 Ford Crown Victoria                      13
1997 Ford Econoline                            9
2006 Toyota Prius II                           9
2006 Chevrolet Impala                          9
2005 Chrysler 300                              8
2007 Chevrolet Suburban                        8
2003 Lincoln Town Car                          8
2000 Chevrolet Impala                          8
2008 Chevrolet Malibu                          7
1998 Ford Crown Victoria                       7
2006 Ford Crown Victoria                       7
2007 Chevrolet Tahoe                           6
2007 Cadillac Escalade                         6
2007 Nissan Altima                             6
2006 Dodge Charger                             6
1996 Honda Accord                              6
2003 Lincoln Town Car Stretched Limousine      6
Name: Car, dtype: in

In [6]:
# check data types
raw_2010.dtypes

Release    object
Year        int64
Car        object
url        object
Stars      object
dtype: object

In [7]:
# how many cars are there for each ratings category
raw_2010.Stars.value_counts()

1      1372
2       443
3       185
Nan      32
4        21
Name: Stars, dtype: int64

In [8]:
# return a DataFrame without any missing ratings element
clean_2010 = raw_2010.loc[raw_2010.Stars != 'Nan']

In [9]:
# convert Stars ratings column to integers
clean_2010.Stars = clean_2010.Stars.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [10]:
# return a new DataFrame that removes irrelevant car elements
new = clean_2010.loc[(~clean_2010.Car.str.contains('Unknown'))\
                     & (~clean_2010.Car.str.contains('unknown')) & (~clean_2010.Car.str.contains('Movie'))]

In [11]:
# reset the index and display
new = new.reset_index(drop=True)
new

Unnamed: 0,Release,Year,Car,url,Stars
0,Toy Story 3,2010,1998 Chevrolet Corvette C5,vehicle_369776-Chevrolet-Corvette-1998.html,3
1,Toy Story 3,2010,Opel Meriva [A],vehicle_974788-Opel-Meriva.html,2
2,Toy Story 3,2010,2005 Opel Zafira [B],vehicle_974791-Opel-Zafira-2005.html,2
3,Toy Story 3,2010,1979 Toyota Truck,vehicle_349573-Toyota-Truck-RN30-1979.html,2
4,Iron Man 2,2010,2010 Acura ZDX,vehicle_340464-Acura-ZDX-YB1-2010.html,2
...,...,...,...,...,...
1982,Daybreakers,2010,Rhino Buggies Hammer,vehicle_236776-Rhino-Buggies-Hammer.html,2
1983,Daybreakers,2010,Rhino Buggies Hammer,vehicle_317794-Rhino-Buggies-Hammer.html,3
1984,Daybreakers,2010,2004 smart Fortwo,vehicle_317791-smart-Fortwo-450-2004.html,1
1985,Daybreakers,2010,1993 Toyota 4Runner,vehicle_236774-Toyota-4Runner-N130-1993.html,2


In [12]:
# how many unique vehicles are there
len(new.Car.unique())

1444

In [13]:
# in order to start working with the Car column, remove whitespaces at both sides of the string elements
for index, row in new.iterrows():
    new.Car[index] = new.Car[index].strip()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new.Car[index] = new.Car[index].strip()


### Use car sales data to further filter cars in movies

In [14]:
# read in car sales data to use as a filter for the Car column
sales = pd.read_csv('output_data/Car_Sales_2009-2020_edited.csv')
sales

Unnamed: 0,Vehicle,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Acura ILX,,,,12251.0,20430.0,17854.0,18531.0,14597.0,11757.0,11273.0,14685.0,13414.0
1,Acura MDX,31178.0,47210.0,43271.0,50854.0,53040.0,65603.0,58208.0,55495.0,54886.0,51512.0,52019.0,47816.0
2,Acura NSX,,,,,,,,269.0,581.0,170.0,238.0,128.0
3,Acura RDX,10153.0,14975.0,15196.0,29520.0,44750.0,44865.0,51026.0,52361.0,51295.0,63580.0,62876.0,52785.0
4,Acura RL,2043.0,2037.0,1096.0,379.0,39.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Volvo V70,1816.0,895.0,,,,,,,,,,
453,Volvo XC40,,,,,,,,,,12420.0,17647.0,23778.0
454,Volvo XC60,9262.0,12030.0,12932.0,19139.0,19766.0,19276.0,26134.0,20452.0,22516.0,32689.0,30578.0,32078.0
455,Volvo XC70,5825.0,6626.0,5716.0,5513.0,4882.0,5093.0,5118.0,5425.0,,,,


In [15]:
# list comprehension to apply lower method to string elements in the Car column...
# ...to better filter the data for vehicles in the car sales dataset
lower_cars = sales.copy()
lower_cars.Vehicle = [car.lower() for car in lower_cars.Vehicle]

In [16]:
# same operations for the car in movie dataset
lower_car_mov = new.copy()
lower_car_mov.Car = [car.lower() for car in lower_car_mov.Car]

In [17]:
# create list of cars with sales data
vehicles = lower_cars.Vehicle.unique().tolist()

In [18]:
# create empty list to store vehicle strings to remove
remove_veh = []

# loop through list of vehicles to create separate strings for vehicles with two titles
for vehicle in vehicles:
    if '/' in vehicle:
        # find combined terms to distinguish, separate on / delimiter
        veh_split = vehicle.split('/')
        # only first index string has brand attached, join brand to second index string
        veh_second = " ".join([veh_split[0].split(' ')[0], veh_split[1]])
        # attach both vehicle strings to the vehicles list...
        vehicles.append(veh_split[0])
        vehicles.append(veh_second)
        # ...and append original string to a list for deletion later
        remove_veh.append(vehicle)

# loop through remove_vehicle list to delete strings with / delimiter
for vehicle in remove_veh:
    vehicles.remove(vehicle)

In [19]:
# edit vehicles list using conditionals...
# ...allowing list to catch every car from the sales dataset
for x in range(len(vehicles)):
    if 'bmw' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('-series', '')
    elif 'mercedes-benz' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('-class', '')
    elif 'land rover' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('land rover', 'land-rover')
    elif 'ford' in vehicles[x]:
        vehicles[x] = vehicles[x].replace('series', '')
    else:
        pass

In [20]:
# append three exceptions that have been left from the code
exceptions = ['scion im', 'scion fr-s', 'scion ia']
vehicles.extend(exceptions)

In [21]:
print(vehicles)

['acura ilx', 'acura mdx', 'acura nsx', 'acura rdx', 'acura rl', 'acura rlx', 'acura tl', 'acura tlx', 'acura tsx', 'acura zdx', 'alfa romeo 4c', 'alfa romeo giulia', 'alfa romeo stelvio', 'audi a3', 'audi a3 e-tron', 'audi a4', 'audi a4 allroad', 'audi a4 avant', 'audi a4 sedan', 'audi a5', 'audi a6', 'audi a7', 'audi a8', 'audi q3', 'audi q5', 'audi q7', 'audi q8', 'audi r8', 'audi tt', 'audi etron', 'bmw 1', 'bmw 2', 'bmw 3', 'bmw 4', 'bmw 5', 'bmw 6', 'bmw 7', 'bmw 8', 'bmw x1', 'bmw x2', 'bmw x3', 'bmw x4', 'bmw x5', 'bmw x6', 'bmw x7', 'bmw z4', 'bmw i3', 'bmw i8', 'bentley bentayga', 'bentley continental gt', 'bentley flying spur', 'bentley mulsanne', 'buick cascada', 'buick enclave', 'buick encore', 'buick encore gx', 'buick envision', 'buick lacrosse', 'buick lucerne', 'buick regal', 'buick verano', 'cadillac ats', 'cadillac ct4', 'cadillac ct5', 'cadillac ct6', 'cadillac cts', 'cadillac dts', 'cadillac elr', 'cadillac escalade', 'cadillac srx', 'cadillac sts', 'cadillac xlr',

In [22]:
# join the list of vehicles using a | delimiter, creating a RegEx pattern to pass through str.contains() next cell
vehicle_var = "|".join(vehicles)

In [23]:
# create DataFrame where only the cars from our car sales data are included
cars_filtered = lower_car_mov.loc[lower_car_mov.Car.str.contains(vehicle_var)]
cars_filtered

Unnamed: 0,Release,Year,Car,url,Stars
0,Toy Story 3,2010,1998 chevrolet corvette c5,vehicle_369776-Chevrolet-Corvette-1998.html,3
4,Iron Man 2,2010,2010 acura zdx,vehicle_340464-Acura-ZDX-YB1-2010.html,2
6,Iron Man 2,2010,2008 audi a8 l d3,vehicle_299564-Audi-A8-L-Typ-4E-2008.html,3
7,Iron Man 2,2010,2010 audi r8 spyder,vehicle_239415-Audi-R8-Spyder-Typ-42-2010.html,3
8,Iron Man 2,2010,bmw 3 [e90],vehicle_339991-BMW-3-E90.html,1
...,...,...,...,...,...
1974,Daybreakers,2010,1986 dodge ram van,vehicle_236772-Dodge-Ram-Van-1986.html,2
1977,Daybreakers,2010,1966 ford mustang,vehicle_306723-Ford-Mustang-1966.html,1
1984,Daybreakers,2010,2004 smart fortwo,vehicle_317791-smart-Fortwo-450-2004.html,1
1985,Daybreakers,2010,1993 toyota 4runner,vehicle_236774-Toyota-4Runner-N130-1993.html,2


In [24]:
# export both datasets for quick comparisons in excel, given their manageable size
#cars_filtered.to_csv("cars_filtered.csv", index=False, header=True)
#cars_remainder.to_csv("cars_remainder.csv", index=False, header=True)

In [25]:
# create list from the Car column
cars_list = cars_filtered.Car.tolist()

In [26]:
# create a dummy list to fill with 'vehicles' data
vehicles_list = np.arange(0, len(cars_filtered), 1).tolist()

In [27]:
# reconstruct dummy list to contain the reference string for each Car element
for vehicle in vehicles:
    for x in range(len(cars_list)):
        if vehicle in cars_list[x]:
            vehicles_list[x] = vehicle

In [28]:
# remove column url
cars_filtered = cars_filtered.drop(['url'], axis = 1)

cars_filtered['Vehicle'] = vehicles_list
cars_filtered

Unnamed: 0,Release,Year,Car,Stars,Vehicle
0,Toy Story 3,2010,1998 chevrolet corvette c5,3,chevrolet corvette
4,Iron Man 2,2010,2010 acura zdx,2,acura zdx
6,Iron Man 2,2010,2008 audi a8 l d3,3,audi a8
7,Iron Man 2,2010,2010 audi r8 spyder,3,audi r8
8,Iron Man 2,2010,bmw 3 [e90],1,bmw 3
...,...,...,...,...,...
1974,Daybreakers,2010,1986 dodge ram van,2,dodge ram
1977,Daybreakers,2010,1966 ford mustang,1,ford mustang
1984,Daybreakers,2010,2004 smart fortwo,1,smart fortwo
1985,Daybreakers,2010,1993 toyota 4runner,2,toyota 4runner


In [29]:
# read in film data
movies = pd.read_csv('output_data/Top100_Movies_2010-2021.csv')

In [30]:
# create a DataFrame of only films from the top 2010 grossing films
df10 = movies.loc[movies.Year == 2010]
df10

Unnamed: 0,Rank,Release,Gross,Theaters,Total Gross,Release Date,Distributor,Year
0,1,Avatar,"$466,141,929",3461,"$749,766,139",Dec 18,Twentieth Century Fox,2010
1,2,Toy Story 3,"$415,004,880",4028,"$415,004,880",Jun 18,Walt Disney Studios Motion Pictures,2010
2,3,Alice in Wonderland,"$334,191,110",3739,"$334,191,110",Mar 5,Walt Disney Studios Motion Pictures,2010
3,4,Iron Man 2,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,2010
4,5,The Twilight Saga: Eclipse,"$300,531,751",4468,"$300,531,751",Jun 30,Summit Entertainment,2010
...,...,...,...,...,...,...,...,...
95,96,Scott Pilgrim vs. the World,"$31,524,275",2820,"$31,524,275",Aug 13,Universal Pictures,2010
96,97,Charlie St. Cloud,"$31,162,545",2725,"$31,162,545",Jul 30,Universal Pictures,2010
97,98,Love & Other Drugs,"$31,099,481",2458,"$32,367,005",Nov 24,Twentieth Century Fox,2010
98,99,Morning Glory,"$30,727,814",2544,"$31,011,732",Nov 10,Paramount Pictures,2010


In [31]:
# create a DataFrame that merges the car and movie data
car_mov = pd.merge(cars_filtered, df10, how="left", on="Release")
car_mov

Unnamed: 0,Release,Year_x,Car,Stars,Vehicle,Rank,Gross,Theaters,Total Gross,Release Date,Distributor,Year_y
0,Toy Story 3,2010,1998 chevrolet corvette c5,3,chevrolet corvette,2,"$415,004,880",4028,"$415,004,880",Jun 18,Walt Disney Studios Motion Pictures,2010
1,Iron Man 2,2010,2010 acura zdx,2,acura zdx,4,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,2010
2,Iron Man 2,2010,2008 audi a8 l d3,3,audi a8,4,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,2010
3,Iron Man 2,2010,2010 audi r8 spyder,3,audi r8,4,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,2010
4,Iron Man 2,2010,bmw 3 [e90],1,bmw 3,4,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
1150,Daybreakers,2010,1986 dodge ram van,2,dodge ram,100,"$30,101,577",2523,"$30,101,577",Jan 8,Lionsgate,2010
1151,Daybreakers,2010,1966 ford mustang,1,ford mustang,100,"$30,101,577",2523,"$30,101,577",Jan 8,Lionsgate,2010
1152,Daybreakers,2010,2004 smart fortwo,1,smart fortwo,100,"$30,101,577",2523,"$30,101,577",Jan 8,Lionsgate,2010
1153,Daybreakers,2010,1993 toyota 4runner,2,toyota 4runner,100,"$30,101,577",2523,"$30,101,577",Jan 8,Lionsgate,2010


In [32]:
# create a clean DataFrame that only keeps relevant columns
clean_car_mov = car_mov[['Release', 'Car', 'Stars', 'Vehicle', 'Rank', 'Gross', 'Distributor', 'Year_y']]
# rename the column for Year_y
clean_car_mov.rename(columns = {'Year_y':'Year'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [33]:
clean_car_mov

Unnamed: 0,Release,Car,Stars,Vehicle,Rank,Gross,Distributor,Year
0,Toy Story 3,1998 chevrolet corvette c5,3,chevrolet corvette,2,"$415,004,880",Walt Disney Studios Motion Pictures,2010
1,Iron Man 2,2010 acura zdx,2,acura zdx,4,"$312,433,331",Paramount Pictures,2010
2,Iron Man 2,2008 audi a8 l d3,3,audi a8,4,"$312,433,331",Paramount Pictures,2010
3,Iron Man 2,2010 audi r8 spyder,3,audi r8,4,"$312,433,331",Paramount Pictures,2010
4,Iron Man 2,bmw 3 [e90],1,bmw 3,4,"$312,433,331",Paramount Pictures,2010
...,...,...,...,...,...,...,...,...
1150,Daybreakers,1986 dodge ram van,2,dodge ram,100,"$30,101,577",Lionsgate,2010
1151,Daybreakers,1966 ford mustang,1,ford mustang,100,"$30,101,577",Lionsgate,2010
1152,Daybreakers,2004 smart fortwo,1,smart fortwo,100,"$30,101,577",Lionsgate,2010
1153,Daybreakers,1993 toyota 4runner,2,toyota 4runner,100,"$30,101,577",Lionsgate,2010


In [34]:
# create lists/columns for the various counts of Vehicle appearances
appear_high = [clean_car_mov.loc[(clean_car_mov.Stars >= 3) & (clean_car_mov.Vehicle == x)].Vehicle.count() for x in vehicles_list]
appear_low = [clean_car_mov.loc[(clean_car_mov.Stars < 3) & (clean_car_mov.Vehicle == x)].Vehicle.count() for x in vehicles_list]
appear_total = [clean_car_mov.loc[clean_car_mov.Vehicle == x].Vehicle.count() for x in vehicles_list]

clean_car_mov['Strong Appearance'] = appear_high
clean_car_mov['Weak Appearance'] = appear_low
clean_car_mov['Total Appearance'] = appear_total
clean_car_mov

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_car_mov['Strong Appearance'] = appear_high
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_car_mov['Weak Appearance'] = appear_low
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_car_mov['Total Appearance'] = appear_total


Unnamed: 0,Release,Car,Stars,Vehicle,Rank,Gross,Distributor,Year,Strong Appearance,Weak Appearance,Total Appearance
0,Toy Story 3,1998 chevrolet corvette c5,3,chevrolet corvette,2,"$415,004,880",Walt Disney Studios Motion Pictures,2010,2,1,3
1,Iron Man 2,2010 acura zdx,2,acura zdx,4,"$312,433,331",Paramount Pictures,2010,0,1,1
2,Iron Man 2,2008 audi a8 l d3,3,audi a8,4,"$312,433,331",Paramount Pictures,2010,1,1,2
3,Iron Man 2,2010 audi r8 spyder,3,audi r8,4,"$312,433,331",Paramount Pictures,2010,2,0,2
4,Iron Man 2,bmw 3 [e90],1,bmw 3,4,"$312,433,331",Paramount Pictures,2010,0,20,20
...,...,...,...,...,...,...,...,...,...,...,...
1150,Daybreakers,1986 dodge ram van,2,dodge ram,100,"$30,101,577",Lionsgate,2010,0,11,11
1151,Daybreakers,1966 ford mustang,1,ford mustang,100,"$30,101,577",Lionsgate,2010,1,14,15
1152,Daybreakers,2004 smart fortwo,1,smart fortwo,100,"$30,101,577",Lionsgate,2010,0,5,5
1153,Daybreakers,1993 toyota 4runner,2,toyota 4runner,100,"$30,101,577",Lionsgate,2010,0,4,4


In [35]:
# create list/column for the Releases where the Vehicle had the strongest appearance
best_ranks = []
highest_star = []
for vehicle in vehicles_list:
    best_star = clean_car_mov.loc[clean_car_mov.Vehicle == vehicle].Stars.max()
    highest_star.append(best_star)
    
    best_rank = clean_car_mov.loc[(clean_car_mov.Vehicle == vehicle) & (clean_car_mov.Stars == best_star)]['Rank'].tolist()
    best_rank.sort()
    best_ranks.append(best_rank[0])
    
clean_car_mov['Best Rank'] = best_ranks
clean_car_mov['Highest Star'] = highest_star

In [36]:
# create empty dataframe to append to
clean_car_columns = clean_car_mov.columns.tolist()
final_df = pd.DataFrame(columns = clean_car_columns)
final_df

Unnamed: 0,Release,Car,Stars,Vehicle,Rank,Gross,Distributor,Year,Strong Appearance,Weak Appearance,Total Appearance,Best Rank,Highest Star


In [37]:
len(clean_car_mov.Vehicle.unique())

214

In [38]:
# creat list of unique cars, sorted
unique_cars = clean_car_mov.Vehicle.unique()
unique_cars.sort()

# append each unique car's data to the empty DataFrame
for vehicle in unique_cars:
    unique_df = clean_car_mov.loc[(clean_car_mov.Vehicle == vehicle)\
                                  & (clean_car_mov.Rank == clean_car_mov['Best Rank'])\
                                  & (clean_car_mov.Stars == clean_car_mov['Highest Star'])]
    final_df = final_df.append(unique_df)

In [39]:
final_df

Unnamed: 0,Release,Car,Stars,Vehicle,Rank,Gross,Distributor,Year,Strong Appearance,Weak Appearance,Total Appearance,Best Rank,Highest Star
75,The Other Guys,2007 acura mdx,1,acura mdx,18,"$119,219,978",Sony Pictures Entertainment (SPE),2010,0,2,2,18,1
76,The Other Guys,1999 acura tl,1,acura tl,18,"$119,219,978",Sony Pictures Entertainment (SPE),2010,0,3,3,18,1
220,Jackass 3D,2009 acura tsx,1,acura tsx,21,"$117,052,883",Paramount Pictures,2010,0,1,1,21,1
1,Iron Man 2,2010 acura zdx,2,acura zdx,4,"$312,433,331",Paramount Pictures,2010,0,1,1,4,2
750,Letters to Juliet,2009 audi a3 cabriolet,2,audi a3,63,"$53,032,453",Summit Entertainment,2010,0,2,2,63,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,The Other Guys,1999 volvo s80,1,volvo s80,18,"$119,219,978",Sony Pictures Entertainment (SPE),2010,0,2,2,18,1
648,The Last Song,1997 volvo v70,3,volvo v70,51,"$62,950,384",Walt Disney Studios Motion Pictures,2010,1,7,8,51,3
20,The Twilight Saga: Eclipse,2010 volvo xc60,3,volvo xc60,5,"$300,531,751",Summit Entertainment,2010,1,1,2,5,3
846,Cop Out,volvo xc70,2,volvo xc70,73,"$44,875,481",Warner Bros.,2010,0,3,3,73,2


In [40]:
# dropping duplicate values
final_df.drop_duplicates(subset ="Vehicle", keep='first', inplace=True)

In [42]:
# reset the index
final_df = final_df.reset_index(drop=True)
final_df

Unnamed: 0,Release,Car,Stars,Vehicle,Rank,Gross,Distributor,Year,Strong Appearance,Weak Appearance,Total Appearance,Best Rank,Highest Star
0,The Other Guys,2007 acura mdx,1,acura mdx,18,"$119,219,978",Sony Pictures Entertainment (SPE),2010,0,2,2,18,1
1,The Other Guys,1999 acura tl,1,acura tl,18,"$119,219,978",Sony Pictures Entertainment (SPE),2010,0,3,3,18,1
2,Jackass 3D,2009 acura tsx,1,acura tsx,21,"$117,052,883",Paramount Pictures,2010,0,1,1,21,1
3,Iron Man 2,2010 acura zdx,2,acura zdx,4,"$312,433,331",Paramount Pictures,2010,0,1,1,4,2
4,Letters to Juliet,2009 audi a3 cabriolet,2,audi a3,63,"$53,032,453",Summit Entertainment,2010,0,2,2,63,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,The Other Guys,1999 volvo s80,1,volvo s80,18,"$119,219,978",Sony Pictures Entertainment (SPE),2010,0,2,2,18,1
210,The Last Song,1997 volvo v70,3,volvo v70,51,"$62,950,384",Walt Disney Studios Motion Pictures,2010,1,7,8,51,3
211,The Twilight Saga: Eclipse,2010 volvo xc60,3,volvo xc60,5,"$300,531,751",Summit Entertainment,2010,1,1,2,5,3
212,Cop Out,volvo xc70,2,volvo xc70,73,"$44,875,481",Warner Bros.,2010,0,3,3,73,2


In [None]:
# create DataFrame from copy of sales data
car_sales_2010 = lower_cars[['Vehicle', '2009', '2010']]
car_sales_2010

In [None]:
# dropping the rows having NaN values
car_sales_2010 = car_sales_2010.dropna()

In [None]:
# create a % Change column
car_sales_2010['% Change'] = (car_sales_2010['2010'] - car_sales_2010['2009']) / car_sales_2010['2009']

In [None]:
# create a column that indicates either a positive or negative annual change in sales
change = car_sales_2010['% Change'].tolist()

change_list = []

for n in range(len(change)):
    if change[n] > 0:
        change_list.append('Positive')
    else:
        change_list.append('Negative')

car_sales_2010['Change'] = change_list

In [None]:
# reset index for car_sales_2010
car_sales_2010 = car_sales_2010.reset_index(drop=True)

In [None]:
# create list of cars from the car in movies Vehicle column
veh_2010 = clean_car_mov.Vehicle.unique().tolist()

In [None]:
# create a dummy list to fill with 'vehicles' data, for car sales
carsales_list = [0] * len(car_sales_2010)

In [None]:
# reconstruct dummy list to contain the reference string for each Vehicle element
for vehicle in veh_2010:
    for x in range(len(car_sales_2010)):
        if vehicle in car_sales_2010.Vehicle[x]:
            carsales_list[x] = vehicle

In [None]:
# add column with vehicle reference strings to use for merging DataFrames
car_sales_2010['Movie Presence'] = carsales_list

In [None]:
# rename columns for merging
cars_2010_movie.rename(columns = {'Vehicle': 'Vehicle_og', 'Vehicle ref': 'Vehicle'}, inplace = True)

# create a DataFrame that merges the clean cars in movie data with the car sales calculations
car_mov_sales = pd.merge(clean_car_mov, cars_2010_movie, how="inner", on="Vehicle")
car_mov_sales

### Begin combining car sales, cars in movies, and movie sales data together

In [None]:
# read in film data
movies = pd.read_csv('output_data/Top100_Movies_2010-2021.csv')

In [None]:
# create a DataFrame of only films from the top 2010 grossing films
df10 = movies.loc[movies.Year == 2010]
df10

In [None]:
# create a DataFrame that merges the car and movie data
car_mov = pd.merge(cars_filtered, df10, how="left", on="Release")
car_mov

In [None]:
# create a clean DataFrame that only keeps relevant columns
clean_car_mov = car_mov[['Release', 'Car', 'Stars', 'Rank', 'Gross', 'Year_y']]
# rename the column for Year_y
clean_car_mov.rename(columns = {'Year_y':'Year'}, inplace = True)

In [None]:
# display data for cars made in 2009 or 2010 with a strong screen presence
clean_car_mov.loc[clean_car_mov.Car.str.contains('2009|2010') & (clean_car_mov.Stars >= 3)]

### Calculations and visualizations created

In [None]:
# create list from the Car column
cars_list = clean_car_mov.Car.tolist()

In [None]:
# create a dummy list to fill with 'vehicles' data
vehicles_list = np.arange(0, 1155, 1).tolist()

In [None]:
# reconstruct dummy list to contain the reference string for each Car element
for vehicle in vehicles:
    for x in range(len(cars_list)):
        if vehicle in cars_list[x]:
            vehicles_list[x] = vehicle

In [None]:
clean_car_mov['Vehicle'] = vehicles_list
clean_car_mov = clean_car_mov[['Release', 'Rank', 'Gross', 'Car', 'Vehicle', 'Stars', 'Year']]
clean_car_mov

In [None]:
# create DataFrame from copy of sales data
car_sales_2010 = lower_cars[['Vehicle', '2009', '2010']]
car_sales_2010

In [None]:
# dropping the rows having NaN values
car_sales_2010 = car_sales_2010.dropna()

In [None]:
# create a % Change column
car_sales_2010['% Change'] = (car_sales_2010['2010'] - car_sales_2010['2009']) / car_sales_2010['2009']

In [None]:
# create a column that indicates either a positive or negative annual change in sales
change = car_sales_2010['% Change'].tolist()

change_list = []

for n in range(len(change)):
    if change[n] > 0:
        change_list.append('Positive')
    else:
        change_list.append('Negative')

car_sales_2010['Change'] = change_list

In [None]:
car_sales_2010

In [None]:
# generate a pie plot showing the distribution of positive to negative change in sales volume
sales_count = car_sales_2010.Change.value_counts()
sales_list = sales_count.keys()
sales_plot = sales_count.plot(kind='pie', y=sales_list, autopct="%1.1f%%", figsize=(6,6))
plt.title('Positive vs Negative % Annual Change, for Total Cars 2010')
plt.show()

In [None]:
# reset index for car_sales_2010
car_sales_2010 = car_sales_2010.reset_index(drop=True)

In [None]:
# create list of cars from the car in movies Vehicle column
veh_2010 = clean_car_mov.Vehicle.unique().tolist()

In [None]:
# create a dummy list to fill with 'vehicles' data, for car sales
carsales_list = [0] * 271

In [None]:
# reconstruct dummy list to contain the reference string for each Vehicle element
for vehicle in veh_2010:
    for x in range(len(car_sales_2010)):
        if vehicle in car_sales_2010.Vehicle[x]:
            carsales_list[x] = vehicle

In [None]:
print(carsales_list)

In [None]:
# add column with vehicle reference strings to use for merging DataFrames
car_sales_2010['Vehicle ref'] = carsales_list

# create two DataFrames: one with the cars in films, and one with cars not in films (i.e. Vehicle ref = 0)
cars_2010_movie = car_sales_2010.loc[car_sales_2010['Vehicle ref'] != 0]
cars_2010_no_movie = car_sales_2010.loc[car_sales_2010['Vehicle ref'] == 0]

In [None]:
cars_2010_movie

In [None]:
cars_2010_no_movie

In [None]:
# generate a pie plot showing the distribution of positive to negative change in sales volume
sales_count = cars_2010_movie.Change.value_counts()
sales_list = sales_count.keys()
sales_plot = sales_count.plot(kind='pie', y=sales_list, autopct="%1.1f%%", figsize=(6,6))
plt.title('Positive vs Negative % Annual Change, for Total Cars in Films 2010')
plt.show()

In [None]:
# generate a pie plot showing the distribution of positive to negative change in sales volume
sales_count = cars_2010_no_movie.Change.value_counts()
sales_list = sales_count.keys()
sales_plot = sales_count.plot(kind='pie', y=sales_list, autopct="%1.1f%%", figsize=(6,6))
plt.title('Positive vs Negative % Annual Change, for Total Cars not in Films 2010')
plt.show()

In [None]:
# rename columns for merging
cars_2010_movie.rename(columns = {'Vehicle': 'Vehicle_og', 'Vehicle ref': 'Vehicle'}, inplace = True)

# create a DataFrame that merges the clean cars in movie data with the car sales calculations
car_mov_sales = pd.merge(clean_car_mov, cars_2010_movie, how="inner", on="Vehicle")
car_mov_sales