In [68]:
import pandas as pd
import requests
import re
import time
import random
import json

In [69]:
from IPython.display import Image, display

In [70]:
pd.options.display.max_colwidth = None

In [71]:
df = pd.read_csv('../06_app/app_files/final_results_df.csv')

In [72]:
brand_models_df = df[['brand','model']].value_counts(sort=True).reset_index()

In [73]:
brand_models_df

Unnamed: 0,brand,model,count
0,volkswagen,golf,1051
1,bmw,serie 3,708
2,seat,leon,666
3,seat,ibiza,563
4,audi,a3,530
...,...,...,...
1197,lotus,emira,1
1198,lotus,elise,1
1199,ferrari,599,1
1200,ferrari,612,1


In [74]:
# Function to recursively flatten dictionaries
def flatten_column(df, col):
    df_flat = pd.json_normalize(df[col])
    df_flat.columns = [f"{col}.{subcol}" for subcol in df_flat.columns]  # Prefix nested column names
    return pd.concat([df.drop(columns=[col]), df_flat], axis=1)

In [75]:
# Function to extract author name
def extract_author(snippet):
    author_pattern = re.search(r'Author\s([\w\s]+)|author name string: ([\w\s]+)', snippet)
    if author_pattern:
        return author_pattern.group(1) or author_pattern.group(2)
    return 'Unknown'

In [76]:
# Function to check for Creative Commons license
def check_license(snippet):
    if 'Creative Commons Attribution' in snippet or 'CC BY-SA' in snippet:
        return 'CC BY-SA 4.0'
    return 'Unknown License'

In [77]:
for brand, model in zip(brand_models_df['brand'][:10], brand_models_df['model'][:10]):
    print(brand, model)

volkswagen golf
bmw serie 3
seat leon
seat ibiza
audi a3
mercedes-benz clase a
ford focus
mercedes-benz clase c
bmw serie 1
renault megane


In [82]:
# Load the config.json file
with open('config.json') as config_file:
    config = json.load(config_file)

all_brand_models = []

# Loop over the brands and models
for brand, model in zip(brand_models_df['brand'], brand_models_df['model']):
  
    # Adjust the brand and model names for URL encoding
    brand_adjusted = brand.replace(' ', '%20')
    model_adjusted = model.replace(' ', '%20')
  
    try:
        # Use the URL template from the config and format it with adjusted brand and model
        url = config['url_template'].format(brand_adjusted=brand_adjusted, model_adjusted=model_adjusted)

        # Define headers and payload from the config
        headers = config['headers']
        payload = config['payload']

        # Send the request
        response = requests.request("GET", url, headers=headers, data=payload)

        # Check if the request was successful
        if response.status_code == 200:
            df = pd.DataFrame(response.json()['query']['pages']).transpose().reset_index(drop=True)

            # Recursively flattening 'details' column
            df_flat = flatten_column(df, 'imageinfo')
            df_flat = flatten_column(df_flat, 'imageinfo.0')  # Flattening nested 'specs'

            df_final = df_flat[['title', 'snippet', 'timestamp', 'index', 'fullurl', 'imageinfo.0.thumburl', 'imageinfo.0.url']].set_index('index').sort_index()

            # Apply the functions to create new columns
            df_final['author'] = df_final['snippet'].apply(extract_author)
            df_final['license'] = df_final['snippet'].apply(check_license)

            # Add brand and model columns
            df_final['brand'] = brand
            df_final['model'] = model

            df_ready = df_final
        else:
            df_ready = pd.DataFrame()

    except Exception as e:
        print(f"There was an error with {brand} {model}. This model is skipped. Error: {e}")
        df_ready = pd.DataFrame()  # Ensure df_ready is defined even in case of error

    # Check if df_ready is not empty
    if not df_ready.empty:
        all_brand_models.append(df_ready)

    # Introduce delay between requests
    delay = random.uniform(2, 3)
    print(f"Request for {brand} {model} completed. Waiting {delay:.2f} seconds before the next request.")
    time.sleep(delay)

print('All requests completed')

Request for nissan nv300 completed. Waiting 2.52 seconds before the next request.
Request for audi q8 sportback e-tron completed. Waiting 2.98 seconds before the next request.
Request for citroen ax completed. Waiting 2.47 seconds before the next request.
Request for peugeot e-traveller completed. Waiting 2.19 seconds before the next request.
Request for renault mascott completed. Waiting 2.86 seconds before the next request.
Request for renault vel satis completed. Waiting 2.75 seconds before the next request.
Request for isuzu trooper completed. Waiting 2.92 seconds before the next request.
Request for peugeot e-3008 completed. Waiting 2.09 seconds before the next request.
Request for citroen ami completed. Waiting 2.06 seconds before the next request.
Request for audi rs7 completed. Waiting 2.36 seconds before the next request.
Request for lexus gs450h completed. Waiting 2.68 seconds before the next request.
Request for infiniti qx30 completed. Waiting 2.47 seconds before the next r

In [83]:
df = pd.concat(all_brand_models)

In [84]:
df.to_csv('../06_app/app_files/car_pictures_table_all.csv', index=False)

In [85]:
df = pd.read_csv('../06_app/app_files/car_pictures_table_all.csv')

In [92]:
# df_all.to_csv('car_pictures_table_all.csv', index=False)

In [93]:
# df_all['author'].value_counts().head(50)

author
Unknown                                      21638
Dinkun Chen Wikimedia username                3219
Alexander                                     1793
Rudolf Stricker Permission                     823
Alexander Migl                                 687
Calreyn88                                      526
Thomas doerfer Other versions                  429
Dinkun                                         400
Tokumeigakarinoaoshima                         368
Benespit Wikimedia username                    277
Dinkun Chen Wikimedia                          273
Dinkun Chen                                    254
Chu Other versions                             253
M 93 Other versions                            246
order_242                                      210
Thomas doerfer                                 196
Tennen                                         192
Charles01                                      189
Calreyn88 Wikimedia username                   176
MercurySable99 Wikimedia

In [45]:
# selected_picture = df[df['author'] != 'Unknown'].reset_index(drop=True).iloc[[0],:]

In [46]:
# selected_picture

Unnamed: 0,title,snippet,timestamp,fullurl,imageinfo.0.thumburl,imageinfo.0.url,author,license,brand,model
0,File:Volkswagen Golf VIII R 1X7A7089.jpg,"Description<span class=""searchmatch"">Volkswagen</span> <span class=""searchmatch"">Golf</span> VIII R 1X7A7089.jpg <span class=""searchmatch"">Volkswagen</span> <span class=""searchmatch"">Golf</span> VIII R in Stuttgart-Vaihingen Date 7 April 2023 Source Own work Author Alexander-93",2023-04-07T14:58:56Z,https://commons.wikimedia.org/wiki/File:Volkswagen_Golf_VIII_R_1X7A7089.jpg,https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Volkswagen_Golf_VIII_R_1X7A7089.jpg/332px-Volkswagen_Golf_VIII_R_1X7A7089.jpg,https://upload.wikimedia.org/wikipedia/commons/4/4f/Volkswagen_Golf_VIII_R_1X7A7089.jpg,Alexander,Unknown License,volkswagen,golf


In [44]:
# selected_picture['imageinfo.0.thumburl'][0]

'https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Volkswagen_Golf_VIII_R_1X7A7089.jpg/332px-Volkswagen_Golf_VIII_R_1X7A7089.jpg'

In [47]:
# image_url = selected_picture['imageinfo.0.thumburl'][0]

In [48]:
# display(Image(url=image_url))

In [50]:
# df_all = pd.read_parquet('../04_EDA/coches_net_model.parquet')

In [56]:
# df_all[['brand','model', 'year']].value_counts().reset_index().head(50)

Unnamed: 0,brand,model,year,count
0,mercedes-benz,clase a,2023,456
1,volkswagen,golf,2019,455
2,volkswagen,golf,2021,376
3,volkswagen,golf,2020,360
4,mercedes-benz,clase a,2020,352
5,mercedes-benz,clase a,2019,344
6,fiat,500,2022,342
7,peugeot,3008,2019,332
8,hyundai,tucson,2022,327
9,seat,leon,2019,323
