In [298]:
# Libraries for data management
import os
import json
import pandas as pd
import numpy as np
import ast

#FOR PROCESSING
import nltk
import re
nltk.download("omw-1.4")

#FORR W2V
import gensim
import gensim.downloader as gensim_api

#FOR PLOTTING
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#FOR BERT MODEL -> TO STUDY BERT MODEL
import transformers


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jesustellez/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [299]:
""" TIME SERIES WITH #VIDEOS, LIKES, COMMENTS """

class EDA_Analysis():

    def __init__(self) -> None:
        def data_read_csv(path, title):
            video = path + "/" + title
            df_ = pd.read_csv(video)

            return df_

        path = "/Users/jesustellez/Desktop/aiDynamics/API Data extraction/Data"
        df_comments = data_read_csv(path, "comments_eeuu.csv")
        self.df_comments = df_comments
        df_videos = data_read_csv(path, "videos_eeuu.csv")
        df_videos["Date"] = df_videos["publishedAt"].str.split("T", expand=True)[0]
        self.df_videos = df_videos
        df_calendar = data_read_csv(path, "calendar.csv")
        df_calendar["Week-Year"] = df_calendar["Week"].astype(str) + "-" + df_calendar["Year"].astype(str)
        self.df_calendar = df_calendar

    def __returndata__(self):
        return self.df_videos, self.df_calendar, self.df_comments
    
    def EDA_timely(self, period):

        df_ts_analysis = self.df_videos.merge(
            self.df_calendar[["Date", "Day", "Week","Month", "Month-Year", "Week-Year"]], 
            on="Date", 
            how="left")
        
        df_ts_analysis_wy = df_ts_analysis.groupby(period, as_index=False).agg({"viewCount":"sum", "video_id":"count","commentCount":"sum"})
        return df_ts_analysis_wy

    def EDA_time_graph(self, period, fields, df_analysis):

        fig = go.Figure()
        for i in fields:
            fig.add_trace(go.Scatter(x=df_analysis[period], y=df_analysis[i]))

        fig.show()

df_videos, df_calendar, df_comments = EDA_Analysis().__returndata__()


In [300]:
class m_tags():
    def __init__(self,df_videos, df_tags_master, param, remove_values) -> None:
        self.df_videos = df_videos
        self.df_tags_master = df_tags_master
        self.param = param
        self.remove_values = remove_values

    def master_tags(self):
        df_videos = self.df_videos
        df_videos["tags"] = self.df_videos.tags.str.split(",")
        df_videos = df_videos.explode("tags")
        lst_stopwords = nltk.corpus.stopwords.words("english")

        df_videos["tags_clean"] = df_videos["tags"].apply(lambda x: self.clean_tags(
            x,
            flg_stemm=False,
            flg_lemm=True,
            lst_stopwords=lst_stopwords
        ))

        self.m_tags_agg = self.agg_tags(df_videos)
        m_tags_matched = self.tags_dataframe()
        m_tags_missing = self.tags_missing(m_tags_matched)

        return m_tags_matched, m_tags_missing

    def clean_tags(self, text, flg_stemm = False, flg_lemm=True, lst_stopwords = None):
        ## Clean (convert to lowercase and remove punctuation and characters and then strip)
        text = re.sub(r'[^\w\s]', '', str(text).lower())

        ## Tolenize (Convert from string to List)
        lst_text = text.split()

        ## Remove Stopwords
        if lst_stopwords is not None:
            lst_text = [word for word in lst_text if word not in lst_stopwords]

        ## Stemming (to remove -ly, -ing, etc.)
        if flg_stemm == True:
            ps = nltk.stem.porter.PorterStemmer()
            lst_text = [ps.stem(word) for word in lst_text]

        ## Lemmatisation (Convert the word into root word)
        if flg_lemm == True:
            lem = nltk.stem.wordnet.WordNetLemmatizer()
            lst_text = [lem.lemmatize(word) for word in lst_text]

        ## Back to string from list
        text = " ".join(lst_text)
        return text
    
    def agg_tags(self, df_videos):

        df_tags_analysis = df_videos.groupby('tags_clean', as_index=False).agg(
            view_sum = pd.NamedAgg(column='viewCount', aggfunc='sum'),
            like_sum = pd.NamedAgg(column='likeCount', aggfunc='sum'),
            comment_sum = pd.NamedAgg(column='commentCount', aggfunc='sum')
        )

        df_tags_analysis = df_tags_analysis.loc[df_tags_analysis["like_sum"]>self.param]

        df_tags_analysis = df_tags_analysis[~df_tags_analysis["tags_clean"].isin(self.remove_values)]

        return df_tags_analysis

    def tags_dataframe(self):
        tags_dataframe = pd.DataFrame()

        for i,j in self.df_tags_master.iterrows():
            link = self.m_tags_agg[self.m_tags_agg.tags_clean.str.contains(j.key_word)]
            link["key_word"] = j.key_word
            link["categorie"]= j.categorie

            tags_dataframe = tags_dataframe.append(link)

        return tags_dataframe

    def tags_missing(self, m_tags_matched):
        analysis = self.m_tags_agg.merge(m_tags_matched[["tags_clean", "categorie"]], on="tags_clean", how="left")
        analysis = analysis[analysis["categorie"].isnull()]
        analysis = analysis[["tags_clean", "categorie"]]

        return analysis


In [301]:
df_tags_master =  pd.read_csv("param_labels.csv")
param = 500
remove_values = ["breaking news", "live video", "toriginal", "real time coverage", "news", "washington post", "anational", "apolitics", "spolitics", "snational", 
"anational", "washington post video" ,"md va", "aworld", "sworld", "wapo video"]

aux = m_tags(df_videos, df_tags_master, param, remove_values)
tags_matched, tags_mising = aux.master_tags()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [303]:
#tags_matched
tags_mising

Unnamed: 0,tags_clean,categorie
0,,
1,10 downing street,
2,10 year anniverary trayvon martin,
3,11 de julio,
4,14th amendment,
...,...,...
3164,zinc,
3165,zoe lofgren,
3166,zoeann murphy,
3167,zoo,
