## Setup

In [18]:
%pip install pycld2
!pip install colorthief
!pip install swifter



In [4]:
import pycld2
import numpy as np
import io
import urllib.request
from PIL import Image
from colorthief import ColorThief
import colorsys
from sklearn.cluster import KMeans
from io import BytesIO
import skimage.measure
import requests
import re
import pandas as pd
from datetime import datetime
import swifter
import regex as re
from textblob import TextBlob
from datetime import datetime

In [5]:
# df = pd.read_csv('train.csv')
# column = ['text_status', 'text', 'languages', 'image_url_status', 'image_url', 'video_thumbnail_status', 'video_thumbnail_url', 'video_url_status', 'video_url', 'video_bitrate', 'video_duration', 'video_views', 'gif_thumbnail_status', 'gif_thumbnail_url', 'gif_url_status', 'gif_url', 'gif_bitrate', 'year', 'month', 'day_of_year', 'day_of_week', 'date', 'username', 'inferred company', 'likes']
# eda = pd.DataFrame(columns = column)

# Pipeline

In [19]:
class EDA_Pipeline:
    """Derive text, media and image features for a DataFrame of tweets.

    The frame is expected to carry a ``content`` column (tweet text) and a
    ``media`` column holding the string repr of a media list, for example
    ``"[Photo(previewUrl='...', fullUrl='...')]"``.  ``process`` writes the
    feature columns into that same frame (in place) and returns it.
    """

    # Timeout, in seconds, applied to every network call made by the pipeline.
    REQUEST_TIMEOUT = 5

    # Regexes used by ``extract_media_data``; any unknown media type falls back to 'video'.
    _MEDIA_PATTERNS = {
        'photo': r"Photo\(previewUrl='(.*?)', fullUrl='(.*?)'\)",
        'gif': r"Gif\(thumbnailUrl='(.*?)', variants=\[VideoVariant\(contentType='video/mp4', url='(.*?)', bitrate=(\d+)\)\]\)",
        'video': r"Video\(thumbnailUrl='(.*?)'",
    }

    # One match per video variant: (url, bitrate) where bitrate is digits or the text "None".
    _VIDEO_VARIANT_PATTERN = r"VideoVariant\(contentType='[^']*', url='([^']*)', bitrate=(\d+|None)\)"

    # First preview/thumbnail URL inside a media repr, whatever the media type.
    _MEDIA_URL_PATTERN = r"(?:previewUrl|thumbnailUrl)='(.*?)'"

    # ``\p{Emoji}`` on its own also matches the ASCII digits, '#' and '*', so those are
    # only counted when they form a keycap sequence (e.g. "1" + U+FE0F + U+20E3).
    # Requires the third-party ``regex`` module (imported in this notebook as ``re``).
    _EMOJI_PATTERN = r"[0-9#*]\uFE0F?\u20E3|(?![0-9#*])\p{Emoji}"

    def __init__(self, data: pd.DataFrame):
        """Store the frame to enrich; it is not copied."""
        self.data = data

    # ------------------------------------------------------------------ text / url helpers

    @staticmethod
    def detect_languages(text):
        """Return the names of languages that make up at least 50% of ``text``.

        Returns ``['Unknown']`` when no language reaches the threshold or when
        pycld2 cannot process the text at all.
        """
        try:
            _, _, details = pycld2.detect(text)
        except pycld2.error:
            # pycld2 rejects some control / malformed characters; retry on the
            # printable part of the text instead of aborting the whole apply.
            printable = ''.join(ch for ch in text if ch.isprintable())
            try:
                _, _, details = pycld2.detect(printable)
            except pycld2.error:
                return ['Unknown']
        # Each detail is (language name, language code, percent, score).
        langs = [lang[0] for lang in details if lang[2] >= 50]
        return langs if langs else ['Unknown']

    @staticmethod
    def is_url_active(url):
        """Return True when a HEAD request to ``url`` answers with a 2xx/3xx status."""
        if not url:
            return False
        try:
            response = requests.head(url, timeout=EDA_Pipeline.REQUEST_TIMEOUT)
            return 200 <= response.status_code < 400
        except requests.RequestException:
            return False

    @staticmethod
    def _first_media_url(media_string):
        """Return the first preview/thumbnail URL found in a media repr, or None."""
        match = re.search(EDA_Pipeline._MEDIA_URL_PATTERN, media_string)
        return match.group(1) if match else None

    @staticmethod
    def _is_media_url_active(media_string):
        """Check reachability of the first URL embedded in a media repr string."""
        return EDA_Pipeline.is_url_active(EDA_Pipeline._first_media_url(media_string))

    @staticmethod
    def parse_datetime_str(datetime_str):
        """Split a ``YYYY-MM-DD HH:MM:SS`` string into
        ``[year, month name, day of year, weekday name]``."""
        dt_object = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
        return [dt_object.year, dt_object.strftime("%B"),
                dt_object.timetuple().tm_yday, dt_object.strftime('%A')]

    # ------------------------------------------------------------------ media parsing

    @staticmethod
    def extract_media_data(media_type, input_string):
        """Run the regex for ``media_type`` over ``input_string``.

        Returns ``re.findall`` output: ``(previewUrl, fullUrl)`` tuples for
        'photo', ``(thumbnailUrl, url, bitrate)`` tuples for 'gif', and plain
        thumbnail-URL strings for anything else (treated as 'video').
        """
        patterns = EDA_Pipeline._MEDIA_PATTERNS
        pattern = patterns.get(media_type, patterns['video'])
        return re.findall(pattern, input_string)

    @staticmethod
    def video_param(input_string):
        """Extract thumbnail, lowest-bitrate variant, duration and views from a video repr.

        Returns a dict with keys ``thumbnail_url``, ``min_bitrate_video_url``,
        ``min_bitrate``, ``duration`` and ``views``.  Fields that cannot be
        found are ``None``, except ``min_bitrate`` which stays ``float('inf')``
        when no variant carries a numeric bitrate.
        """
        thumbnail = re.search(r"thumbnailUrl='(.*?)'", input_string)

        min_bitrate = float('inf')
        min_bitrate_video_url = None
        # Variants are parsed with their own pattern: the 'video' pattern of
        # extract_media_data only captures the thumbnail URL.
        for video_url, bitrate_str in re.findall(EDA_Pipeline._VIDEO_VARIANT_PATTERN, input_string):
            if bitrate_str == "None":
                continue  # variants without a bitrate can never be the minimum
            bitrate = int(bitrate_str)
            if bitrate < min_bitrate:
                min_bitrate = bitrate
                min_bitrate_video_url = video_url

        # Accept both "duration=30" and "duration=30.5".
        duration = re.search(r"duration=(\d+(?:\.\d+)?)", input_string)
        views = re.search(r"views=(\d+)", input_string)

        return {
            "thumbnail_url": thumbnail.group(1) if thumbnail else None,
            "min_bitrate_video_url": min_bitrate_video_url,
            "min_bitrate": min_bitrate,
            "duration": float(duration.group(1)) if duration else None,
            "views": int(views.group(1)) if views else None
        }

    @staticmethod
    def get_media_type(media_string):
        """Classify a media repr as 'image', 'video', 'gif' or 'unknown'."""
        if 'Photo(' in media_string:
            return 'image'
        elif 'Video(' in media_string:
            return 'video'
        elif 'Gif(' in media_string:
            return 'gif'
        else:
            return 'unknown'

    # ------------------------------------------------------------------ text features

    @staticmethod
    def count_emojis(text):
        """Count emoji code points in ``text`` (bare digits, '#' and '*' are not emoji).

        Counts code points, so a multi-code-point emoji sequence contributes
        more than one.
        """
        return len(re.findall(EDA_Pipeline._EMOJI_PATTERN, text))

    @staticmethod
    def count_hashtags(x):
        """Count '#' characters in ``x``."""
        return x.count('#')

    @staticmethod
    def count_uppercase_words(x):
        """Count whitespace-separated tokens of ``x`` that are fully upper-case."""
        return sum(word.isupper() for word in x.split())

    @staticmethod
    def count_at_symbols(x):
        """Count '@' characters in ``x``."""
        return x.count('@')

    @staticmethod
    def get_sentiment_polarity(x):
        """Return TextBlob's sentiment polarity for ``x``."""
        return TextBlob(x).sentiment.polarity

    @staticmethod
    def get_subjectivity(x):
        """Return TextBlob's sentiment subjectivity for ``x``."""
        return TextBlob(x).sentiment.subjectivity

    @staticmethod
    def word_count(x):
        """Count whitespace-separated tokens in ``x``."""
        return len(x.split())

    # ------------------------------------------------------------------ image features

    @staticmethod
    def get_image(media_string):
        """Download the preview/thumbnail image referenced by a media repr.

        Returns a PIL image, or ``None`` when the input is not a string, no
        URL can be extracted, or the download/decoding fails.
        """
        if not isinstance(media_string, str):
            return None
        media_string = media_string[1:-1]  # drop the surrounding list brackets
        media_type = EDA_Pipeline.get_media_type(media_string)

        if media_type == 'image':
            matches = EDA_Pipeline.extract_media_data('photo', media_string)
            url = matches[0][0] if matches else None
        elif media_type == 'video':
            matches = EDA_Pipeline.extract_media_data('video', media_string)
            url = matches[0] if matches else None
        elif media_type == 'gif':
            matches = EDA_Pipeline.extract_media_data('gif', media_string)
            url = matches[0][0] if matches else None
        else:
            url = None
        if not url:
            return None

        try:
            with urllib.request.urlopen(url, timeout=EDA_Pipeline.REQUEST_TIMEOUT) as response:
                image_data = response.read()
            return Image.open(io.BytesIO(image_data))
        except Exception:
            # Deliberately best-effort: any network or decoding failure means "no image".
            return None

    @staticmethod
    def get_size(media_string):
        """Return the image's ``(width, height)`` tuple, or NaN when no image is available."""
        image = EDA_Pipeline.get_image(media_string)
        if image is None:
            return np.nan
        return image.size

    @staticmethod
    def get_aspect_ratio(media_string):
        """Return width / height of the image, or NaN when no image is available."""
        image = EDA_Pipeline.get_image(media_string)
        if image is None:
            return np.nan
        width, height = image.size
        return width / height

    @staticmethod
    def get_brightness(media_string):
        """Return the mean grayscale value (truncated to int), or NaN when no image is available."""
        image = EDA_Pipeline.get_image(media_string)
        if image is None:
            return np.nan
        image_gray = image.convert("L")
        # Single vectorised pass instead of walking the pixel data twice in Python.
        return int(np.asarray(image_gray, dtype=np.float64).mean())

    @staticmethod
    def get_saturation(media_string):
        """Return the mean HLS saturation over all pixels, or NaN when no image is available."""
        image = EDA_Pipeline.get_image(media_string)
        if image is None:
            return np.nan
        saturation_sum = 0
        pixel_count = 0
        for r, g, b in image.convert("RGB").getdata():
            _, _, s = colorsys.rgb_to_hls(r / 255.0, g / 255.0, b / 255.0)
            saturation_sum += s
            pixel_count += 1
        return saturation_sum / pixel_count if pixel_count else np.nan

    @staticmethod
    def get_dom_col_entropy(media_string):
        """Return the Shannon entropy of the image's pixel values, or NaN when no image is available.

        Despite the name, no dominant-colour clustering is performed.
        """
        image = EDA_Pipeline.get_image(media_string)
        if image is None:
            return np.nan
        return skimage.measure.shannon_entropy(np.asarray(image))

    # ------------------------------------------------------------------ driver

    def process(self):
        """Add every enabled feature column to ``self.data`` and return it.

        Missing ``content`` / ``media`` values are treated as empty strings so
        a single NaN row cannot abort the run.  The URL check and the image
        size each perform one network request per row.
        """
        content = self.data['content'].fillna('').astype(str)
        media = self.data['media'].fillna('').astype(str)

        self.data['languages'] = content.swifter.apply(self.detect_languages)
        # The media column holds a repr such as "[Photo(previewUrl='...')]", not a URL,
        # so the first embedded URL is extracted before the reachability check.
        self.data['is_url_active'] = media.swifter.apply(self._is_media_url_active)
        # Not wired in: parse_datetime_str (year / month / day columns) and video_param.

        self.data['media_type'] = media.swifter.apply(self.get_media_type)
        self.data['emoji_count'] = content.swifter.apply(self.count_emojis)
        # NOTE(review): 'hastag_count' is misspelled; kept because downstream readers may rely on it.
        self.data['hastag_count'] = content.swifter.apply(self.count_hashtags)
        self.data['uppercase_count'] = content.swifter.apply(self.count_uppercase_words)
        self.data['at_count'] = content.swifter.apply(self.count_at_symbols)
        self.data['sentiment_score'] = content.swifter.apply(self.get_sentiment_polarity)
        self.data['subjectivity_score'] = content.swifter.apply(self.get_subjectivity)
        self.data['word_count'] = content.swifter.apply(self.word_count)

        self.data['image_size'] = media.swifter.apply(self.get_size)
        # Disabled image features; each one downloads the image again when enabled:
        # self.data['aspect_ratio'] = media.swifter.apply(self.get_aspect_ratio)
        # self.data['brightness'] = media.swifter.apply(self.get_brightness)
        # self.data['saturation'] = media.swifter.apply(self.get_saturation)
        # self.data['entropy'] = media.swifter.apply(self.get_dom_col_entropy)

        return self.data

In [7]:
def number_of_reposts(df):
  """Add a ``number_of_reposts`` column: how many *other* rows share the same ``tweet text``.

  The frame is modified in place and also returned.  Rows whose text is
  missing match nothing (not even themselves) and therefore get -1, exactly
  as an equality filter on a missing value would.
  """
  tweet_text = df['tweet text']
  # One pass to count every distinct text, then a lookup per row, instead of
  # re-filtering the whole frame once per row.
  occurrences = tweet_text.map(tweet_text.value_counts())
  df["number_of_reposts"] = occurrences.fillna(0).astype(int) - 1
  return df

In [8]:
def frequency_of_post(df):
  """Add a ``frequency_of_posts`` column: each user's post count relative to the busiest user.

  The value is ``posts by this username / posts by the most active username``,
  rounded to 4 decimals, so the most active user gets 1.0.  The frame is
  modified in place and also returned.
  """
  # Counts come from the frame that was passed in, not from a notebook-level global.
  posts_per_user = df['username'].value_counts()
  relative = df['username'].map(posts_per_user).astype(float) / posts_per_user.max()
  df['frequency_of_posts'] = relative.round(4)
  return df

In [9]:
def total_past_posts(df):
  """Add a ``total_past_posts`` column: posts by the same username with a strictly earlier date.

  ``date`` must be formatted as ``YYYY-MM-DD HH:MM:SS``; a value in any other
  format raises ``ValueError``.  Posts sharing a timestamp do not count each
  other.  The frame is modified in place and also returned.
  """
  posted_at = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
  # Within each user, rank(method='min') is 1 + the number of strictly earlier
  # posts, which replaces re-filtering the whole frame once per row.
  earlier_posts = posted_at.groupby(df['username']).rank(method='min') - 1
  # Rows that belong to no group (missing username) have no rank; they count as 0.
  df['total_past_posts'] = earlier_posts.fillna(0).astype(int)
  return df

In [10]:
# Mount Google Drive at /content/drive (Google Colab only); the CSV paths used
# in the following cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Raw scrape: only its `media` column is used, attached to the feature table by row index.
new_train_df = pd.read_csv("/content/drive/MyDrive/test.csv")

# Feature table, with `media` added and `text` renamed to the `content` column
# that EDA_Pipeline reads.
train_df = (
    pd.read_csv("/content/drive/MyDrive/test_full.csv")
    .assign(media=new_train_df['media'])
    .rename(columns={'text': 'content'})
)
train_df.head()

Unnamed: 0,text_status,content,languages,image_url_status,image_url,video_thumbnail_status,video_thumbnail_url,video_url_status,video_url,video_bitrate,...,gif_bitrate,year,month,day_of_year,day_of_week,date,username,inferred company,likes,media
0,Has text,Congratulations to Pauletha Butts of <mention>...,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/Dp4L0cSUcAAh9JG?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,Does not exist,2018,October,292,Friday,2018-10-19 14:30:46,BGISD,independent,41,[Photo(previewUrl='https://pbs.twimg.com/media...
1,Has text,#NetZeroHomes: learn the best way to get to ze...,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/EnEIA1NXcAQIJw8?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,Does not exist,2020,November,323,Wednesday,2020-11-18 00:07:02,MitsubishiHVAC,trane,0,[Photo(previewUrl='https://pbs.twimg.com/media...
2,Has text,"Tournaments come and go, but the pictures are ...",['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/Dm0dpR7X0AEHCvO?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,Does not exist,2018,September,254,Tuesday,2018-09-11 14:23:02,GettySport,getty images,120,[Photo(previewUrl='https://pbs.twimg.com/media...
3,Has text,Monster Hunter Rise Concept Art: Great Izuchi....,['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/Embi-1aUYAEK4hb?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,Does not exist,2020,November,315,Tuesday,2020-11-10 03:00:26,MHinfo_en,monster,567,[Photo(previewUrl='https://pbs.twimg.com/media...
4,Has text,"With our #SalesDevelopmentProgram, you'll lear...",['ENGLISH'],Link exists and is accessible,https://pbs.twimg.com/media/DtM-hi2UwAAWH2i?fo...,Link does not exist,Video thumbnail does not exist,Link does not exist,Video does not exist,Does not exist,...,Does not exist,2018,November,333,Thursday,2018-11-29 22:11:56,UnitedRentals,united rentals,1,[Photo(previewUrl='https://pbs.twimg.com/media...


In [None]:
# Run the feature pipeline. process() writes its columns into train_df itself
# and returns that same frame, so `df` and `train_df` refer to one object.
eda_instance = EDA_Pipeline(train_df)
df = eda_instance.process()

In [None]:
# Persist the enriched frame to Drive.
# NOTE(review): the target path has no ".csv" suffix, and to_csv's default
# index=True writes the row index as an extra unnamed column — confirm both are intended.
df.to_csv('/content/drive/MyDrive/test_full_2_img_sizes')