## Setup dataset

In [None]:
# Mount Google Drive so the CSV files under /content/drive/MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q pycld2

In [None]:
import pandas as pd
import numpy as np
import pycld2
import re
import requests
from datetime import datetime

In [None]:
# Load the raw tweets for the split being processed.
# NOTE(review): the variable is called `train` but the file read here is test.csv;
# the name is left unchanged because later cells refer to it.
train = pd.read_csv('/content/drive/MyDrive/test.csv')
train

Unnamed: 0,id,date,likes,content,username,media,inferred company
0,5,2018-10-19 14:30:46,41,Congratulations to Pauletha Butts of <mention>...,BGISD,[Photo(previewUrl='https://pbs.twimg.com/media...,independent
1,12,2020-11-18 00:07:02,0,#NetZeroHomes: learn the best way to get to ze...,MitsubishiHVAC,[Photo(previewUrl='https://pbs.twimg.com/media...,trane
2,23,2018-09-11 14:23:02,120,"Tournaments come and go, but the pictures are ...",GettySport,[Photo(previewUrl='https://pbs.twimg.com/media...,getty images
3,31,2020-11-10 03:00:26,567,Monster Hunter Rise Concept Art: Great Izuchi....,MHinfo_en,[Photo(previewUrl='https://pbs.twimg.com/media...,monster
4,32,2018-11-29 22:11:56,1,"With our #SalesDevelopmentProgram, you'll lear...",UnitedRentals,[Photo(previewUrl='https://pbs.twimg.com/media...,united rentals
...,...,...,...,...,...,...,...
59326,299981,2019-11-30 12:40:07,0,Real Madrid Consider Huge Swap Deal For Arsena...,IndependentNGR,[Photo(previewUrl='https://pbs.twimg.com/media...,independent
59327,299982,2018-07-22 13:08:00,144,The founding of this <mention> club and the ro...,Rotary,[Video(thumbnailUrl='https://pbs.twimg.com/amp...,rotary international
59328,299986,2019-10-09 17:59:33,0,"NDDC Still Implementing 2018 Budget As FG, Oth...",IndependentNGR,[Photo(previewUrl='https://pbs.twimg.com/media...,independent
59329,299991,2020-12-17 19:45:01,174,I'm incredibly pumped for #XFL alum <mention>\...,PatMcAfeeShow,[Video(thumbnailUrl='https://pbs.twimg.com/amp...,mcafee


In [None]:
# Schema of the generated feature table, grouped by the kind of feature.
column = [
    # text
    'text_status', 'text', 'languages',
    # image
    'image_url_status', 'image_url',
    # video
    'video_thumbnail_status', 'video_thumbnail_url',
    'video_url_status', 'video_url',
    'video_bitrate', 'video_duration', 'video_views',
    # gif
    'gif_thumbnail_status', 'gif_thumbnail_url',
    'gif_url_status', 'gif_url', 'gif_bitrate',
    # posting time
    'year', 'month', 'day_of_year', 'day_of_week', 'date',
    # account, company and likes
    'username', 'inferred company', 'likes',
]
# Empty frame with that schema; the generation loop fills it.
eda = pd.DataFrame(columns=column)

## Setup functions

In [None]:
def detect_languages(text, min_percent=50):
  """Return the names of the languages pycld2 detects in ``text``.

  Args:
    text: The text to analyse (``str`` or UTF-8 ``bytes``).
    min_percent: Minimum value of the third field of a pycld2 detail tuple
      for the language to be reported. pycld2 documents that field as the
      percentage of the text detected as the language. Defaults to 50.

  Returns:
    A list of language names such as ``['ENGLISH']``. ``['Unknown']`` is
    returned when no language reaches ``min_percent``, when ``text`` is not
    text at all (e.g. a NaN cell), or when pycld2 rejects the input.
  """
  # A missing cell arrives as a float NaN; there is nothing to detect in it.
  if not isinstance(text, (str, bytes)):
    return ['Unknown']

  try:
    _, _, details = pycld2.detect(text)
  except pycld2.error:
    # pycld2 raises on input it considers invalid; one bad tweet must not
    # abort a run over tens of thousands of rows.
    return ['Unknown']

  langs = [detail[0] for detail in details if detail[2] >= min_percent]
  return langs if langs else ['Unknown']

In [None]:
def check_link(url, timeout=10):
  """Probe ``url`` with an HTTP HEAD request and describe the outcome.

  Args:
    url: The address to probe.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a single unresponsive host would stall the whole run.

  Returns:
    ``"Link exists and is accessible"`` for a status code in [200, 400),
    ``"Link exists but is not accessible"`` for any other status code, and
    ``"Error while accessing the link"`` when the request itself fails
    (connection error, timeout, malformed or missing URL, ...).
  """
  try:
    response = requests.head(url, timeout=timeout)
  except requests.RequestException:
    # RequestException is the base class of ConnectionError, Timeout,
    # MissingSchema, InvalidURL, etc., so no request failure escapes.
    return ("Error while accessing the link")

  if 200 <= response.status_code < 400:
    return ("Link exists and is accessible")
  return ("Link exists but is not accessible")

In [None]:
def time(datetime_str):
  """Break a ``YYYY-MM-DD HH:MM:SS`` timestamp into calendar features.

  Args:
    datetime_str: Timestamp string in the format ``%Y-%m-%d %H:%M:%S``.

  Returns:
    ``[year, month_name, day_of_year, weekday_name]``, for example
    ``[2018, 'October', 292, 'Friday']``.
  """
  parsed = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
  return [
    parsed.year,
    parsed.strftime("%B"),          # full month name
    parsed.timetuple().tm_yday,     # 1-based day of the year
    parsed.strftime('%A'),          # full weekday name
  ]

In [None]:
def photo_gif_param(input_string):
    """Pull photo and GIF entries out of a stringified media list.

    Args:
        input_string: Text containing ``Photo(...)`` and/or ``Gif(...)``
            entries.

    Returns:
        ``[photos_df, gifs_df]``. ``photos_df`` has the columns
        ``photo_preview_url`` and ``photo_full_url``; ``gifs_df`` has
        ``gif_thumbnail_url``, ``gif_url`` and ``gif_bitrate``. Each frame
        holds one row per match and is empty when nothing matched. All
        values are the captured strings.
    """
    extractors = (
        (
            r"Photo\(previewUrl='(.*?)', fullUrl='(.*?)'\)",
            ['photo_preview_url', 'photo_full_url'],
        ),
        (
            r"Gif\(thumbnailUrl='(.*?)', variants=\[VideoVariant\(contentType='video/mp4', url='(.*?)', bitrate=(\d+)\)\]\)",
            ['gif_thumbnail_url', 'gif_url', 'gif_bitrate'],
        ),
    )

    frames = []
    for pattern, names in extractors:
        matches = re.findall(pattern, input_string)
        frames.append(pd.DataFrame(matches, columns=names))
    return frames

def video_param(input_string):
    """Extract video details from a stringified media list.

    Args:
        input_string: Text containing a ``Video(...)`` entry.

    Returns:
        ``[thumbnail_url, min_bitrate_video_url, min_bitrate, duration, views]``:

        * ``thumbnail_url``: first ``thumbnailUrl='...'`` value, or ``""``
          when there is none.
        * ``min_bitrate_video_url``: URL of the variant with the smallest
          numeric bitrate, or ``None`` when no variant has one.
        * ``min_bitrate``: that bitrate as ``int``, or ``float('inf')``
          when no variant has a numeric bitrate.
        * ``duration``: ``float`` seconds, or ``"Does not exist"``.
        * ``views``: ``int``, or ``"Does not exist"``.
    """
    # Default to "" so the return below never hits an unbound local when the
    # input has no thumbnail.
    thumbnail_url_match = re.search(r"thumbnailUrl='(.*?)'", input_string)
    thumbnail_url = thumbnail_url_match.group(1) if thumbnail_url_match else ""

    # Track the variant with the smallest bitrate. Variants whose bitrate is
    # the literal None count as infinitely large and are never selected.
    min_bitrate = float('inf')
    min_bitrate_video_url = None
    video_urls_and_bitrates = re.findall(r"url='(.*?)', bitrate=(\d+|None)", input_string)
    for video_url, bitrate_str in video_urls_and_bitrates:
        bitrate = int(bitrate_str) if bitrate_str != "None" else float('inf')
        if bitrate < min_bitrate:
            min_bitrate = bitrate
            min_bitrate_video_url = video_url

    # Accept both "12.5" and "12"; the fractional part is optional.
    duration_match = re.search(r"duration=(\d+(?:\.\d+)?)", input_string)
    duration = float(duration_match.group(1)) if duration_match else "Does not exist"

    views_match = re.search(r"views=(\d+)", input_string)
    views = int(views_match.group(1)) if views_match else "Does not exist"

    return [thumbnail_url, min_bitrate_video_url, min_bitrate, duration, views]

## Analyse the dataset

In [None]:
# Count the rows whose `content` is missing. The 0 shown below means all tweets have some text.
train['content'].isna().sum()

0

In [None]:
# Start from rows saved by an earlier run; the generation loop appends new rows to this frame.
# NOTE(review): this reads val_full.csv, while the source rows come from test.csv and the
# results are written to test_full.csv. Confirm which checkpoint file is intended.
dataset1 = pd.read_csv('/content/drive/MyDrive/val_full.csv')
# Alternative for a fresh run with no earlier output: start from an empty frame.
# dataset1 = pd.DataFrame(columns = column)

In [None]:
# Show the rows loaded from the earlier run.
dataset1

Unnamed: 0,text_status,text,languages,image_url_status,image_url,video_thumbnail_status,video_thumbnail_url,video_url_status,video_url,video_bitrate,...,gif_url,gif_bitrate,year,month,day_of_year,day_of_week,date,username,inferred company,likes
0,Has text,Congratulations to Pauletha Butts of <mention>...,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/Dp4L0cSUcAAh9JG?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2018,October,292,Friday,2018-10-19 14:30:46,BGISD,independent,41
1,Has text,#NetZeroHomes: learn the best way to get to ze...,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/EnEIA1NXcAQIJw8?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2020,November,323,Wednesday,2020-11-18 00:07:02,MitsubishiHVAC,trane,0
2,Has text,"Tournaments come and go, but the pictures are ...",['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/Dm0dpR7X0AEHCvO?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2018,September,254,Tuesday,2018-09-11 14:23:02,GettySport,getty images,120
3,Has text,Monster Hunter Rise Concept Art: Great Izuchi....,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/Embi-1aUYAEK4hb?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2020,November,315,Tuesday,2020-11-10 03:00:26,MHinfo_en,monster,567
4,Has text,"With our #SalesDevelopmentProgram, you'll lear...",['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/DtM-hi2UwAAWH2i?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2018,November,333,Thursday,2018-11-29 22:11:56,UnitedRentals,united rentals,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59326,Has text,Real Madrid Consider Huge Swap Deal For Arsena...,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/EKnxk0iU4AEiUYC?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2019,November,334,Saturday,2019-11-30 12:40:07,IndependentNGR,independent,0
59327,Has text,The founding of this <mention> club and the ro...,['ENGLISH'],Link does not exist,Image does not exist,Link exists and is accessible,https://pbs.twimg.com/amplify_video_thumb/1009...,Link exists and is accessible,https://video.twimg.com/amplify_video/10092287...,320000,...,GIF does not exist,Does not exist,2018,July,203,Sunday,2018-07-22 13:08:00,Rotary,rotary international,144
59328,Has text,"NDDC Still Implementing 2018 Budget As FG, Oth...",['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/EGdIA8TVAAEPa5N?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2019,October,282,Wednesday,2019-10-09 17:59:33,IndependentNGR,independent,0
59329,Has text,I'm incredibly pumped for #XFL alum <mention>\...,['ENGLISH'],Link does not exist,Image does not exist,Link exists and is accessible,https://pbs.twimg.com/amplify_video_thumb/1339...,Link exists and is accessible,https://video.twimg.com/amplify_video/13396572...,288000,...,GIF does not exist,Does not exist,2020,December,352,Thursday,2020-12-17 19:45:01,PatMcAfeeShow,mcafee,174


In [None]:
l = len(train)
# Resume offset: only rows m .. l-1 of `train` are processed here. Rows before
# `m` are expected to be in `dataset1` already (loaded from an earlier run).
m = 59201

# The accumulated table is re-saved after every `checkpoint_every` source rows
# so an interrupted session loses little work.
checkpoint_every = 10
# Other output paths used for the other splits:
#   '/content/drive/MyDrive/train_full.csv', '/content/drive/MyDrive/val_full.csv'
checkpoint_path = '/content/drive/MyDrive/test_full.csv'


def _media_features(media):
    """Build the image / video / GIF columns for one tweet's media string.

    Every column starts as its "does not exist" placeholder. Only the columns
    of the media kind that is actually present are overwritten. Photos take
    precedence over GIFs, and video is looked for only when neither matched.
    Each URL that is found is probed with `check_link`, which performs an
    HTTP request.
    """
    features = {
        'image_url_status': "Link does not exist",
        'image_url': "Image does not exist",
        'video_thumbnail_status': "Link does not exist",
        'video_thumbnail_url': "Video thumbnail does not exist",
        'video_url_status': "Link does not exist",
        'video_url': "Video does not exist",
        'video_bitrate': "Does not exist",
        'video_duration': "Does not exist",
        'video_views': "Does not exist",
        'gif_thumbnail_status': "Link does not exist",
        'gif_thumbnail_url': "GIF thumbnail does not exist",
        'gif_url_status': "Link does not exist",
        'gif_url': "GIF does not exist",
        'gif_bitrate': "Does not exist",
    }

    # A missing media cell is read as NaN (a float); the regex helpers need text.
    if not isinstance(media, str):
        return features

    photos, gifs = photo_gif_param(media)

    if not photos.empty:
        preview_url = photos['photo_preview_url'].values[0]
        features['image_url'] = preview_url
        features['image_url_status'] = check_link(preview_url)

    elif not gifs.empty:
        gif_thumbnail_url = gifs['gif_thumbnail_url'].values[0]
        gif_url = gifs['gif_url'].values[0]
        features['gif_thumbnail_url'] = gif_thumbnail_url
        features['gif_url'] = gif_url
        features['gif_thumbnail_status'] = check_link(gif_thumbnail_url)
        features['gif_url_status'] = check_link(gif_url)
        # Store the scalar bitrate of the first GIF, not the whole Series.
        features['gif_bitrate'] = int(gifs['gif_bitrate'].values[0])

    else:
        thumbnail_url, video_url, bitrate, duration, views = video_param(media)
        # A missing URL may come back as "" or None; never probe those.
        if thumbnail_url:
            features['video_thumbnail_url'] = thumbnail_url
            features['video_thumbnail_status'] = check_link(thumbnail_url)
        if video_url:
            features['video_url'] = video_url
            features['video_url_status'] = check_link(video_url)
        # video_param reports "no numeric bitrate" as infinity.
        if bitrate not in (0, float('inf')):
            features['video_bitrate'] = bitrate
        if duration != 0.0:
            features['video_duration'] = duration
        if views != 0:
            features['video_views'] = views

    return features


pending_rows = []  # rows built since the last checkpoint

for i in range(m, l):
    dt = time(train.at[i, 'date'])
    row = {
        'text_status': "Has text",
        'text': train.at[i, 'content'],
        'languages': detect_languages(train.at[i, 'content']),
        'year': dt[0],
        'month': dt[1],
        'day_of_year': dt[2],
        'day_of_week': dt[3],
        'date': train.at[i, 'date'],
        'username': train.at[i, 'username'],
        'inferred company': train.at[i, 'inferred company'],
        'likes': train.at[i, 'likes'],
    }
    row.update(_media_features(train.at[i, 'media']))
    pending_rows.append(row)

    # Checkpoint on every multiple of `checkpoint_every` AND on the very last
    # row, so a trailing partial batch is not silently dropped.
    if i % checkpoint_every == 0 or i == l - 1:
        print((i), 'generated')
        eda = pd.DataFrame(pending_rows, columns=column)
        dataset1 = pd.concat([dataset1, eda], ignore_index=True)
        dataset1.to_csv(checkpoint_path, index=False)
        pending_rows = []

# Leave `eda` as an empty frame with the output schema, as after a flush.
eda = pd.DataFrame(columns=column)

59210 generated
59220 generated
59230 generated
59240 generated
59250 generated
59260 generated
59270 generated
59280 generated
59290 generated
59300 generated
59310 generated
59320 generated
59330 generated


In [None]:
# Show the table after the generation loop has run.
dataset1

Unnamed: 0,text_status,text,languages,image_url_status,image_url,video_thumbnail_status,video_thumbnail_url,video_url_status,video_url,video_bitrate,...,gif_url,gif_bitrate,year,month,day_of_year,day_of_week,date,username,inferred company,likes
0,Has text,Congratulations to Pauletha Butts of <mention>...,[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/Dp4L0cSUcAAh9JG?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2018,October,292,Friday,2018-10-19 14:30:46,BGISD,independent,41
1,Has text,#NetZeroHomes: learn the best way to get to ze...,[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/EnEIA1NXcAQIJw8?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2020,November,323,Wednesday,2020-11-18 00:07:02,MitsubishiHVAC,trane,0
2,Has text,"Tournaments come and go, but the pictures are ...",[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/Dm0dpR7X0AEHCvO?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2018,September,254,Tuesday,2018-09-11 14:23:02,GettySport,getty images,120
3,Has text,Monster Hunter Rise Concept Art: Great Izuchi....,[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/Embi-1aUYAEK4hb?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2020,November,315,Tuesday,2020-11-10 03:00:26,MHinfo_en,monster,567
4,Has text,"With our #SalesDevelopmentProgram, you'll lear...",[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/DtM-hi2UwAAWH2i?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2018,November,333,Thursday,2018-11-29 22:11:56,UnitedRentals,united rentals,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59326,Has text,Real Madrid Consider Huge Swap Deal For Arsena...,[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/EKnxk0iU4AEiUYC?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2019,November,334,Saturday,2019-11-30 12:40:07,IndependentNGR,independent,0
59327,Has text,The founding of this <mention> club and the ro...,[ENGLISH],Link does not exist,Image does not exist,Link exists and is accessible,https://pbs.twimg.com/amplify_video_thumb/1009...,Link exists and is accessible,https://video.twimg.com/amplify_video/10092287...,320000,...,GIF does not exist,Does not exist,2018,July,203,Sunday,2018-07-22 13:08:00,Rotary,rotary international,144
59328,Has text,"NDDC Still Implementing 2018 Budget As FG, Oth...",[ENGLISH],Link exists and is accessible,https://pbs.twimg.com/media/EGdIA8TVAAEPa5N?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,GIF does not exist,Does not exist,2019,October,282,Wednesday,2019-10-09 17:59:33,IndependentNGR,independent,0
59329,Has text,I'm incredibly pumped for #XFL alum <mention>\...,[ENGLISH],Link does not exist,Image does not exist,Link exists and is accessible,https://pbs.twimg.com/amplify_video_thumb/1339...,Link exists and is accessible,https://video.twimg.com/amplify_video/13396572...,288000,...,GIF does not exist,Does not exist,2020,December,352,Thursday,2020-12-17 19:45:01,PatMcAfeeShow,mcafee,174


In [None]:
# Show the last raw row of `train`.
train.iloc[-1]

id                                                             300000
date                                              2018-08-26 01:19:09
likes                                                             714
content             In 1967, he was shot down over Vietnam. By 200...
username                                                         cnni
media               [Video(thumbnailUrl='https://pbs.twimg.com/ext...
inferred company                                                  cnn
Name: 59330, dtype: object

In [None]:
# Repair `gif_bitrate` cells that hold the printed form of a one-element
# pandas Series (e.g. "0    0\nName: gif_bitrate, dtype: object") instead of
# the number itself. The bitrate is parsed back out of that text, so any
# value is recovered, not only 0.
_series_repr = re.compile(r"0\s+(\d+)\nName: gif_bitrate, dtype: \w+")

_recovered = {}
for _idx, _text in dataset1['gif_bitrate'].astype(str).items():
    _match = _series_repr.fullmatch(_text)
    if _match:
        _recovered[_idx] = int(_match.group(1))

j = pd.Index(list(_recovered))
for i in j:
  dataset1.at[i, 'gif_bitrate'] = _recovered[i]

In [None]:
# Save the final table to Drive without the index column.
# Alternative output path, kept commented out:
# dataset1.to_csv('/content/drive/MyDrive/train_full.csv', index=False)
dataset1.to_csv('/content/drive/MyDrive/test_full.csv', index=False)