In [None]:
import numpy as np
import pandas as pd
import re
import itertools

from wordcloud import WordCloud, STOPWORDS
from collections import Counter

import sklearn

import emoji

import matplotlib.pyplot as plt
import seaborn as sns
import squarify

## Load the dataset

In [None]:
# Read the LinkedIn updates/engagement-metrics CSV into df_posts and preview the first rows.
df_posts = pd.read_csv('/kaggle/input/rigadsclub/linkedin/riga-ds-club_updates_engagement_metrics.csv')
df_posts.head()

In [None]:
# Show the dtype pandas inferred for each df_posts column.
df_posts.dtypes

In [None]:
# Read the followers-by-time CSV into df_followers and preview the first rows.
df_followers = pd.read_csv('/kaggle/input/rigadsclub/linkedin/riga-ds-club_followers_by_time.csv')
df_followers.head()

In [None]:
# Show the dtype pandas inferred for each df_followers column.
df_followers.dtypes

## Which columns should we keep?

In [None]:
def print_unique(column_names):
    """Print the distinct values found in each listed column of ``df_posts``.

    Parameters
    ----------
    column_names : iterable of str
        Column labels to look up in the module-level ``df_posts`` frame.
        One line is printed per label, in the order given.
    """
    for name in column_names:
        distinct_values = df_posts[name].unique()
        print(f"Unique '{name}' values: {distinct_values}")

In [None]:
# Print the distinct values of these columns to judge whether they are worth keeping.
print_unique(['Update type', 'Audience', 'Follows', 'Posted by'])

In [None]:
# notna() flags posts that have a 'Video views' figure; summing the boolean mask counts them.
print('Number of video posts:', df_posts['Video views'].notna().sum())

In [None]:
# Columns excluded from the rest of the analysis.
columns_to_drop = [
    'Update type',
    'Update link',
    'Campaign name',
    'Campaign start date',
    'Campaign end date',
    'Audience',
    'Follows',
]
# errors='ignore' keeps this cell re-runnable: df_posts is reassigned here, so on a
# second execution the columns are already gone and a plain drop() raises KeyError.
# Trade-off: a misspelled column name is skipped silently rather than reported.
df_posts = df_posts.drop(columns=columns_to_drop, errors='ignore')
df_posts.head()

## General statistics

In [None]:
# Headline totals: row count first, then the column sums in a fixed order.
print('Total posts:', len(df_posts))
summed_metrics = (
    ('Total impressions:', 'Impressions'),
    ('Total likes:', 'Likes'),
    ('Total comments:', 'Comments'),
)
for label, column in summed_metrics:
    print(label, df_posts[column].sum())
# The video-view total is cast to int64 before printing.
video_views_total = df_posts['Video views'].sum().astype('int64')
print('Total video views:', video_views_total)

## Text cleaning

Let's start by exploring some posts:

In [None]:
def get_post(n):
    """Return the raw 'Update title' value of the ``df_posts`` row labelled ``n``."""
    row_label, column_label = n, 'Update title'
    return df_posts.at[row_label, column_label]

In [None]:
# Print the posts labelled 0-4, each followed by an 80-character rule on its own line.
separator = "=" * 80
for n in range(5):
    post_body = get_post(n)
    print(post_body, f'\n{separator}')

* Hashtags might be useful for topic modelling. We should extract them as a separate feature.
* Some posts contain emojis, which might have an impact.
* Some post bodies contain URLs, which we should remove.
* We will also create a lowercase copy of the post body with hashtags and URLs removed. (Note: `clean_text` below does not strip emojis or punctuation.)

In [None]:
def clean_text(text):
    """Return a lowercase copy of ``text`` with hashtags, links and markup removed.

    Stripped, in this order: hashtags, square-bracketed spans, URLs, HTML-like
    tags, and every token that contains a digit. Line breaks are turned into
    spaces. Punctuation and emojis are left in place.

    Parameters
    ----------
    text : object
        Post body. Non-string values are converted with ``str``; ``None`` and
        float NaN yield an empty string.

    Returns
    -------
    str
        The cleaned text. Removed spans are deleted without collapsing the
        surrounding whitespace, so consecutive spaces can remain.
    """
    # A missing value would otherwise be turned into the literal word "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # '*' (not '+') after the first letter so one-letter tags such as "#r" are removed too.
    text = re.sub(r'#[A-Za-z][A-Za-z0-9_-]*', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Drops whole lines that still start with a bare "http(s)://" prefix.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>+', '', text)
    # Replace (rather than delete) line breaks so words on adjacent lines are not
    # fused together, which would also let the digit filter below eat a real word.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Store the cleaned copy of every post body in a new 'text' column.
df_posts['text'] = df_posts['Update title'].apply(clean_text)

In [None]:
def get_text(n):
    """Return the 'text' column value of the ``df_posts`` row labelled ``n``."""
    row_label, column_label = n, 'text'
    return df_posts.at[row_label, column_label]

In [None]:
# Spot-check the cleaned text of the post labelled 0.
get_text(0)

## Extracting the Hashtags

In [None]:
def get_hashtags(text):
    """Return the hashtags found in ``text``, lowercased, in order of appearance.

    A hashtag is ``#`` followed by an ASCII letter and then any run of ASCII
    letters, digits, underscores or hyphens. Duplicates are kept.

    Parameters
    ----------
    text : object
        Post body. Anything that is not a ``str`` (e.g. NaN for a missing
        title) is treated as containing no hashtags.

    Returns
    -------
    list of str
        Each entry includes the leading ``#``.
    """
    if not isinstance(text, str):
        return []
    # '*' after the first letter so one-letter tags such as "#R" are captured too.
    return [tag.lower() for tag in re.findall(r"#[A-Za-z][A-Za-z0-9_-]*", text)]

In [None]:
# Spot-check hashtag extraction on the raw body of the post labelled 0.
get_hashtags(get_post(0))

In [None]:
# Store each post's hashtags as one space-separated string ('' when there are none).
df_posts['hashtags'] = df_posts['Update title'].map(
    lambda title: ' '.join(get_hashtags(title))
)
df_posts.head()

In [None]:
def get_unique_hashtags():
    """Return the distinct hashtags across all posts, in order of first appearance.

    Reads the space-separated ``'hashtags'`` column of the module-level
    ``df_posts`` frame.

    Returns
    -------
    array-like of str
        Result of ``Series.unique()``; empty when no post has a hashtag.
    """
    # split() gives one list per post; explode() yields one row per tag and NaN
    # for posts without any, which dropna() removes explicitly. This avoids
    # depending on the legacy DataFrame.stack() behaviour of silently dropping
    # NA padding, which newer pandas deprecates.
    tags = df_posts['hashtags'].str.split().explode().dropna()
    return tags.unique()

In [None]:
# Collect the distinct hashtags once, then print them and how many there are.
hashtags = get_unique_hashtags()
print('Hashtags:', hashtags)
print('Count', len(hashtags))

In [None]:
# Frequency of each hashtag across all posts. explode() gives one row per tag
# without going through the deprecated legacy DataFrame.stack() path;
# value_counts() ignores the NaN rows produced by posts that have no hashtags.
# The result's index is now labelled 'hashtags'.
df_posts['hashtags'].str.split().explode().value_counts()

## Extracting emojis

In [None]:
def get_emojis(text):
    """Return the emojis that occur in ``text`` as one space-separated string.

    The text is passed through ``emoji.demojize`` so every emoji becomes a
    ``:shortcode:``; the shortcodes are then located with a regex and turned
    back into emoji characters.

    Parameters
    ----------
    text : object
        Post body. Anything that is not a ``str`` (e.g. NaN for a missing
        title) is treated as containing no emojis.

    Returns
    -------
    str
        Emojis in order of appearance, duplicates kept; ``''`` if none.
    """
    if not isinstance(text, str):
        return ''
    candidates = re.findall(r':[!_\-\w]+:', emoji.demojize(text))
    found = []
    for code in candidates:
        rendered = emoji.emojize(code)
        # The regex also matches ordinary colon-delimited text (":30:" in
        # "10:30:45"). emojize() hands such unknown codes back unchanged, so
        # keep only the candidates that actually turned into an emoji.
        if rendered != code:
            found.append(rendered)
    return ' '.join(found)

In [None]:
# Store the emojis found in each post body as a space-separated string.
df_posts['emojis'] = df_posts['Update title'].apply(get_emojis)

In [None]:
# Preview the first rows, now including the 'emojis' column.
df_posts.head()

In [None]:
# Distinct emojis across all posts. explode() yields one row per emoji and NaN
# for posts without any; dropna() removes those explicitly instead of relying
# on the legacy DataFrame.stack() NA-dropping that newer pandas deprecates.
df_posts['emojis'].str.split().explode().dropna().unique()

## Word count

In [None]:
def get_word_count(sentence):
    """Count the whitespace-separated tokens in the string form of ``sentence``.

    ``sentence`` is converted with ``str`` first, so non-string values are
    counted by their textual representation.
    """
    tokens = str(sentence).split()
    return len(tokens)

In [None]:
# Word count is taken on the cleaned 'text' column, not on the raw post body.
df_posts['word_count'] = df_posts['text'].apply(get_word_count)
df_posts.head()

In [None]:
# Summary statistics for df_posts (describe() covers the numeric columns by default).
df_posts.describe()

In [None]:
# Distribution of the per-post engagement rate. Title and axis labels let the
# figure stand on its own; the trailing ';' hides the return value of set().
ax = df_posts['Engagement rate'].hist()
ax.set(title='Engagement rate per post', xlabel='Engagement rate', ylabel='Number of posts');

In [None]:
# Distribution of the cleaned-text word count. Title and axis labels let the
# figure stand on its own; the trailing ';' hides the return value of set().
ax = df_posts['word_count'].hist()
ax.set(title='Word count per post', xlabel='Word count', ylabel='Number of posts');

## Saving results

In [None]:
# Final look at the first rows before the frame is written to disk.
df_posts.head()

In [None]:
# Write the enriched frame to output.csv in the working directory.
# NOTE(review): to_csv writes the index as an extra first column by default —
# confirm whether downstream readers expect it.
df_posts.to_csv('output.csv')