In [86]:
from googleapiclient.discovery import build
import pandas as pd
pd.set_option('display.max_rows', 500)
api_key = 'ENTER API KEY'
youtube = build('youtube', 'v3', developerKey=api_key)

In [87]:
year_list = ['2022', '2023']
segmentdf = pd.read_excel('EV_VehicleSegments.xlsx')
segmentdf.drop('Unnamed: 0', axis = 1, inplace = True)

In [88]:
seg2 = segmentdf

In [89]:
seg2['Make'].unique()

array(['Audi', 'Bmw', 'Cadillac', 'Genesis', 'Jaguar', 'Lucid',
       'Mercedes-benz', 'Polestar', 'Porsche', 'Rivian', 'Tesla', 'Volvo',
       'Chevrolet', 'Ford', 'Gmc', 'Hyundai', 'Kia', 'Mazda', 'Mini',
       'Nissan', 'Subaru', 'Toyota', 'Volkswagen'], dtype=object)

In [227]:
#Youtube API per account cannot handle all models to be scraped at once. So they need to be segregated as shown below. 
#Safest way is to do it a few at a time and then switch api key to different accounts.
#Use below code to systematically generate files as per make and then merge them in the end. 
segloc = seg2.loc[(seg2['Make'] == "Nissan") | (seg2['Make'] == "Tesla")]

In [228]:
query_list = []
for i in year_list:
    for j in segloc["Model"]:
        query = segloc.loc[segloc["Model"] == j]["Make"].tolist()[0] + ' ' + j + ' ' + str(i)
        query_list.append(query)

In [229]:
query_list

['Tesla Model 3 2022',
 'Tesla Model s 2022',
 'Tesla Model x 2022',
 'Tesla Model y 2022',
 'Nissan Ariya 2022',
 'Nissan Leaf 2022',
 'Tesla Model 3 2023',
 'Tesla Model s 2023',
 'Tesla Model x 2023',
 'Tesla Model y 2023',
 'Nissan Ariya 2023',
 'Nissan Leaf 2023']

In [230]:
len(query_list)

12

In [233]:
#Can be used to get the list of URLs to be scraped - Choose ones with maximum comments to maintain efficiency of queries. 
#But this process is incredibly intensive on the quota and has to be used sparingly. 
#Hence a combination of this and manual process of searching and grabbing URLs directly from youtube were used. 
#Scaling this requires permission from Google which can be done by 
#the organization if required through increasing Quota limits substantially. 10000 points per day is inadequate. 
#Only 2022 and 2023 will be considered here due to query limits
urls = []
for i in query_list:
    search_response = youtube.search().list(q=i, part='id,snippet', maxResults=8).execute()
    video_ids = [search_result['id']['videoId'] for search_result in search_response.get('items', [])]
    video_response = youtube.videos().list(id=','.join(video_ids), part='id,snippet,statistics').execute()
    try:
        for video_result in video_response.get('items', []):
            title = video_result['snippet']['title']
            url = f'https://www.youtube.com/watch?v={video_result["id"]}'
            video_id = video_result["id"]
            view_count = video_result['statistics']['viewCount']
            comment_count = video_result['statistics']['commentCount']
            urls.append((i, title, url, video_id, view_count, comment_count))
    except:
        print(i)
        continue
yturl_df = pd.DataFrame(urls, columns = ['Query', 'Title', 'URL', 'VideoID', 'ViewCount', 'Comment_Count'])

Nissan Ariya 2022
Tesla Model s 2023


In [234]:
yturl_df['Comment_Count'] = yturl_df['Comment_Count'].astype(int)
#Taking only the top result by comments. Due to a large number of vehicles over 2 years, this needs to be done to stay within query limits
yturl_df_top = yturl_df.groupby('Query').apply(lambda x: x.nlargest(8, 'Comment_Count')).reset_index(drop=True)
#Removing duplicate results - Sometimes same results come up as top results for similar queries - Unpopular vehicles for both year queries for instance
#Needs to be done to avoid duplicating scraping. 
yturl_df_top = yturl_df_top.drop(yturl_df_top[yturl_df_top.drop('Query', axis = 1).duplicated()].index)

In [235]:
yturl_df_top

Unnamed: 0,Query,Title,URL,VideoID,ViewCount,Comment_Count
0,Nissan Ariya 2022,Nissan Ariya review - it changes everything!,https://www.youtube.com/watch?v=hbykcnRP3Rc,hbykcnRP3Rc,1582452,2265
1,Nissan Ariya 2022,The 2023 Nissan Ariya Is a Quirky New Electric...,https://www.youtube.com/watch?v=M4Jbvss-EOc,M4Jbvss-EOc,396810,1455
2,Nissan Ariya 2022,2023 Nissan ARIYA - POV First Impressions,https://www.youtube.com/watch?v=oIkETEaiHAo,oIkETEaiHAo,288725,391
4,Nissan Ariya 2023,Is the New 2023 Nissan Ariya a better SUV than...,https://www.youtube.com/watch?v=G4dtSpUpNec,G4dtSpUpNec,148369,393
5,Nissan Ariya 2023,The 2023 Nissan Ariya e-4ORCE Is A GTR-Inspire...,https://www.youtube.com/watch?v=ryI_c16-6pQ,ryI_c16-6pQ,87542,319
6,Nissan Ariya 2023,Unexpected Results! The 2023 Nissan Ariya Take...,https://www.youtube.com/watch?v=N9YRD6hWEyc,N9YRD6hWEyc,26286,251
7,Nissan Ariya 2023,2023 Nissan Ariya e-4orce AWD | MotorWeek Firs...,https://www.youtube.com/watch?v=N-geV50lzIQ,N-geV50lzIQ,17776,111
8,Nissan Ariya 2023,2023 Nissan Ariya Platinum+ e-4orce: Better Th...,https://www.youtube.com/watch?v=lWi0s5KLLVE,lWi0s5KLLVE,1400,28
9,Nissan Ariya 2023,2023 NISSAN ARIYA e-4ORCE- IS IT WORTH IT? FUL...,https://www.youtube.com/watch?v=LZXaoMz2ttI,LZXaoMz2ttI,1583,15
10,Nissan Ariya 2023,2023 Nissan Ariya Review: First Drive,https://www.youtube.com/watch?v=or15lXKoPWI,or15lXKoPWI,5541,4


In [236]:
replies = []
comments = []
#commcount = 0
for vid in yturl_df_top['VideoID']:
    video_response=youtube.commentThreads().list(part='snippet,replies',videoId=vid).execute()
    model = yturl_df_top.loc[yturl_df_top["VideoID"] == vid]["Query"].tolist()[0]
    #for q, w in video_ids_dict.items():
    #    if vid in [p for p in w]:
    #        model = q
    url = 'https://www.youtube.com/watch?v='+vid
    title = yturl_df_top.loc[yturl_df_top["VideoID"] == vid]["Title"].tolist()[0]
    #title_response = youtube.videos().list(part='snippet',id=vid).execute()
    #title = title_response['items'][0]['snippet']['title']
    while video_response:
        #extraction based on nested dictionary structure of the video_response object
        for item in video_response['items']:
            #Comment Extraction
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            #Replies of a comment
            replycount = item['snippet']['totalReplyCount']
            try:
                if replycount>0:
                    for reply in item['replies']['comments']:
                    # Extract reply
                        reply = reply['snippet']['textDisplay']
                        replies.append(reply)
            except:
                print('Reply Err')
 
            #Display Comments and Corresponding Replies
            if replies == list():
                replies = "None"
            #print(commcount)
            comments.append((model, comment, replies, title, url))
            #commcount = commcount + 1
            #print(comment)
            replies = []
 
            #"Pagination"/Show More 
        if 'nextPageToken' in video_response:
            video_response = youtube.commentThreads().list(part = 'snippet,replies',videoId = vid, 
                                                           pageToken = video_response['nextPageToken']).execute()
        else:
            break

In [237]:
df = pd.DataFrame(comments, columns = ['Model', 'Comments', 'Replies', 'Video Title', 'URL'])

In [238]:
df.to_csv('YoutubeComments_EV_Nissan_Tesla_Large.csv')

In [239]:
df

Unnamed: 0,Model,Comments,Replies,Video Title,URL
0,Nissan Ariya 2022,"<a href=""https://www.youtube.com/watch?v=hbykc...","[Actually, IMHO, all mentioned cars look bette...",Nissan Ariya review - it changes everything!,https://www.youtube.com/watch?v=hbykcnRP3Rc
1,Nissan Ariya 2022,More from this guy. Subscribed!,,Nissan Ariya review - it changes everything!,https://www.youtube.com/watch?v=hbykcnRP3Rc
2,Nissan Ariya 2022,Will this be available in south Africa,,Nissan Ariya review - it changes everything!,https://www.youtube.com/watch?v=hbykcnRP3Rc
3,Nissan Ariya 2022,And take off the big fluffy jacket while drivi...,,Nissan Ariya review - it changes everything!,https://www.youtube.com/watch?v=hbykcnRP3Rc
4,Nissan Ariya 2022,I&#39;m surprised at how you manage to make a ...,,Nissan Ariya review - it changes everything!,https://www.youtube.com/watch?v=hbykcnRP3Rc
...,...,...,...,...,...
13642,Tesla Model y 2023,shit man thas a huge pile of shit talk,,Tesla Model Y 2023 Update Is Here And SHOCKS E...,https://www.youtube.com/watch?v=m9rZ5WOZ9o0
13643,Tesla Model y 2023,Truck Electric?,,Tesla Model Y 2023 Update Is Here And SHOCKS E...,https://www.youtube.com/watch?v=m9rZ5WOZ9o0
13644,Tesla Model y 2023,I love it ❤❤🎉🎉🎉🎉🎉,,Tesla Model Y 2023 Update Is Here And SHOCKS E...,https://www.youtube.com/watch?v=m9rZ5WOZ9o0
13645,Tesla Model y 2023,We that live in northern Scandinavia for exemp...,,Tesla Model Y 2023 Update Is Here And SHOCKS E...,https://www.youtube.com/watch?v=m9rZ5WOZ9o0


In [244]:
#These files were generated one at a time. 
df1 = pd.read_csv('YoutubeComments_EV_First7.csv')
df2 = pd.read_csv('YoutubeComments_EV_8to14.csv')
df3 = pd.read_csv('YoutubeComments_EV_15to21.csv')
df4 = pd.read_csv('YoutubeComments_EV_Last2.csv')
#df5 = pd.read_csv('YoutubeComments_EV_Tesla.csv')
df6 = pd.read_csv('YoutubeComments_EV_Mini.csv')
df7 = pd.read_csv('YoutubeComments_EV_Porsche_Ford_Hyundai.csv')
df8 = pd.read_csv('YoutubeComments_EV_Nissan_Tesla_Large.csv')

df = pd.concat([df1, df2, df3, df4, df8, df6, df7]).reset_index()
df.drop(['index', 'Unnamed: 0'], axis = 1, inplace = True)
df = df.drop_duplicates()
df.to_csv('YoutubeComments_EV.csv')

In [245]:
df = pd.read_csv('YoutubeComments_EV.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)
df['Make'] = df['Model'].str.split().str.get(0)
#if len(df['Model'].str.split())
df['Year'] = df['Model'].str.split().str.get(-1)
#This is needed due to Models having 2 or more words in their name. The years are then removed.
df['Model'] = df['Model'].str.split().str[1:4].apply(lambda x: ' '.join(x))
df['Model'] = df['Model'].str.replace('2022', '')
df['Model'] = df['Model'].str.replace('2023', '')
df['Model'] = df['Model'].str.strip()
df2 = pd.read_excel('EV_VehicleSegments.xlsx')
df2.drop('Unnamed: 0', axis = 1, inplace = True)
df_full = pd.merge(df, df2, on = ["Make", "Model"])
df_full = df_full[['Segment', 'Make', 'Model', 'Year', 'Comments', 'Replies', 'Video Title', 'URL']]
df_full['Word Count'] = df_full['Comments'].str.split(' ').str.len()

In [247]:
df_full.to_csv('YoutubeCommentsFull_EV.csv')

In [246]:
df_full['Make'].value_counts()

Lucid            20664
Mercedes-benz    19296
Bmw              17372
Ford             14799
Rivian           13115
Tesla             9873
Hyundai           9070
Audi              9000
Gmc               8608
Kia               8376
Polestar          5913
Genesis           5362
Cadillac          5119
Mazda             4689
Nissan            4329
Toyota            3905
Chevrolet         3124
Subaru            2772
Volvo             2603
Jaguar             843
Volkswagen         626
Porsche            430
Mini               417
Name: Make, dtype: int64

In [116]:
dfe = pd.read_csv('Edmunds_CustomerReviews.csv', encoding = "utf-16")

In [126]:
l = dfe['Review'].str.split().str.len().tolist()
avg_edmunds_reviewlength = sum(l)/len(l)

In [248]:
df_final = df_full.loc[df_full["Word Count"] > 40].reset_index()
df_final.drop('index', axis = 1, inplace = True)
df_final = df_final.drop_duplicates()
df_final.to_csv('YoutubeComments_40_InfoRich_EV.csv')

In [251]:
df_final

Unnamed: 0,Segment,Make,Model,Year,Comments,Replies,Video Title,URL,Word Count
0,Luxury,Audi,E-tron,2022,Pro-Clip USA are garbage. They claim they don’...,,The 2022 Audi Q4 E-Tron Is Audi’s New Tesla-Fi...,https://www.youtube.com/watch?v=8XJeVK0_iM8,60
1,Luxury,Audi,E-tron,2022,"Drove the Q4, MB EBQ, VW ID4 &amp; LexusRZ al...",,The 2022 Audi Q4 E-Tron Is Audi’s New Tesla-Fi...,https://www.youtube.com/watch?v=8XJeVK0_iM8,57
2,Luxury,Audi,E-tron,2022,"Audi&#39;s EV&#39;s look great, but have falle...",,The 2022 Audi Q4 E-Tron Is Audi’s New Tesla-Fi...,https://www.youtube.com/watch?v=8XJeVK0_iM8,114
3,Luxury,Audi,E-tron,2022,38 minutes from 5% to 80% is slow. The Kia and...,,The 2022 Audi Q4 E-Tron Is Audi’s New Tesla-Fi...,https://www.youtube.com/watch?v=8XJeVK0_iM8,48
4,Luxury,Audi,E-tron,2022,I normally find that sponsor placement to be a...,,The 2022 Audi Q4 E-Tron Is Audi’s New Tesla-Fi...,https://www.youtube.com/watch?v=8XJeVK0_iM8,93
...,...,...,...,...,...,...,...,...,...
17716,Non-Luxury,Volkswagen,Id4,2023,Just want to point out that VWs have historica...,,2023 VW ID4 AWD Pro S: Is The ID.4 A Good EV?,https://www.youtube.com/watch?v=nAwUkLp7T9A,96
17717,Non-Luxury,Volkswagen,Id4,2023,Come on Autobytel. This model tested is not a ...,['This fawning review does not describe the 20...,2023 VW ID 4 Review,https://www.youtube.com/watch?v=mtejBVOYhFY,54
17718,Non-Luxury,Volkswagen,Id4,2023,The title of the video says 2023 but the vehic...,,2023 VW ID 4 Review,https://www.youtube.com/watch?v=mtejBVOYhFY,91
17719,Non-Luxury,Volkswagen,Id4,2023,From a personal perspective this is a horrible...,['Given that this is made in America and quali...,2023 VW ID 4 Review,https://www.youtube.com/watch?v=mtejBVOYhFY,46


In [250]:
df_final['Make'].value_counts()

Lucid            1951
Ford             1733
Rivian           1550
Tesla            1333
Bmw              1252
Mercedes-benz    1211
Hyundai          1013
Kia               944
Audi              757
Mazda             737
Polestar          735
Gmc               672
Genesis           629
Cadillac          629
Nissan            581
Toyota            467
Subaru            412
Chevrolet         401
Volvo             310
Jaguar            179
Volkswagen         84
Porsche            72
Mini               69
Name: Make, dtype: int64