In [None]:
# Shell escape: install pysentimiento, which provides the `create_analyzer`
# factory imported further down in this notebook.
!pip install pysentimiento

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
def json_to_df(json_file):
  """Load a JSON dump of documents into a flat pandas DataFrame.

  The file is expected to hold a JSON array of objects. Each object becomes
  one row. Only "body" is mandatory; every other field falls back to a
  default when it is absent ('unknown' for categorical fields, 0 for counts).

  Args:
    json_file: Path of the JSON file to read.

  Returns:
    A DataFrame with one row per document and the columns text, source,
    gender, country, following, followers, like, share and reply.

  Raises:
    KeyError: If a document has no "body" field.
  """
  # JSON text is UTF-8; be explicit so the result does not depend on the
  # platform's default encoding.
  with open(json_file, encoding="utf-8") as f:
    data = json.load(f)

  columns = {name: [] for name in ('text', 'source', 'gender', 'country',
                                   'following', 'followers', 'like', 'share',
                                   'reply')}
  for document in data:
    # The country name is nested one level down; the parent may be missing
    # or not a mapping at all, hence the two exception types.
    try:
      country = document["geocoding_country"]["display_name"]
    except (KeyError, TypeError):
      country = 'unknown'

    record = {
      'text': document["body"],
      'source': document.get("source_device", 'unknown'),
      'gender': document.get("gender", 'unknown'),
      'country': country,
      # NOTE(review): this reads the same key as 'followers', so the two
      # columns are always identical. It looks like a copy-paste slip;
      # confirm the real "following" field name in the dump schema.
      'following': document.get("followers_count_num", 0),
      # A missing follower count now yields 0. Previously the fallback wrote
      # to the wrong variable, so the value was undefined (NameError) or
      # silently carried over from the previous document.
      'followers': document.get("followers_count_num", 0),
      'like': document.get("like_action_count", 0),
      'share': document.get("share_action_count", 0),
      'reply': document.get("reply_action_count", 0),
    }
    for name, value in record.items():
      columns[name].append(value)

  return pd.DataFrame(columns)

In [None]:
# Mount Google Drive inside the Colab VM so the JSON dumps below are readable.
from google.colab import drive
drive.mount('/content/drive')

# One JSON dump per query; the two queries are analysed separately below.
# NOTE(review): the dates in the file names look like the export window
# (2021-06-30 to 2022-06-30) - confirm.
FIGUERES_PATH = '/content/drive/My Drive/Naveler/03.TECH/Anàlisis de sentiment/Dumps marques/query_555_2021-06-30_2022-06-30.json'
CHAVES_PATH = '/content/drive/My Drive/Naveler/03.TECH/Anàlisis de sentiment/Dumps marques/query_609_2021-06-30_2022-06-30.json'

# Parse each dump into a flat DataFrame (one row per document).
figueres_df = json_to_df(FIGUERES_PATH)
chaves_df = json_to_df(CHAVES_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Figueres

In [None]:
# Preview the first rows of the freshly parsed frame to sanity-check the columns.
figueres_df.head()

Unnamed: 0,text,source,gender,country,following,followers,like,share,reply
0,Carta Abierta a la señora Vanessa Hernández Sá...,Twitter for Android,unknown,Colombia,876,876,30,15,0
1,@crhoycom Y de dónde pensará @figuerescr sacar...,Twitter for iPhone,male,unknown,111,111,14,2,0
2,@crhoycom Vaya mierda @figuerescr,Twitter for Android,unknown,unknown,265,265,4,1,0
3,"@FelipeGR2020 @crhoycom ""Sin embargo, agregó q...",Twitter for Android,male,unknown,93,93,0,0,0
4,@FelipeGR2020 @crhoycom @figuerescr Ahh claro!...,Twitter for iPhone,male,unknown,111,111,0,0,0


In [None]:
# Row count per posting client, most frequent first.
figueres_df["source"].value_counts()

Twitter for Android      31507
Twitter for iPhone       20690
Twitter Web App          11329
unknown                   1166
TweetDeck                  672
                         ...  
Zendesk                      1
INoticiasCL_APP_WRI_2        1
JustoGuerrero                1
Teleradio Americas           1
Diario Neuquino Auto         1
Name: source, Length: 100, dtype: int64

In [None]:
# Integer-code every distinct posting client. The lookup table is kept because
# the prediction cell needs it to encode a raw client name.
unique_sources = figueres_df["source"].unique()
coded_sources = {name: code for code, name in enumerate(unique_sources)}
figueres_df["source"] = figueres_df["source"].replace(coded_sources)

In [None]:
# Row count per gender label, most frequent first.
figueres_df["gender"].value_counts()

male       31617
unknown    28246
female      7468
Name: gender, dtype: int64

In [None]:
# Integer-code the gender labels; the lookup table is reused by the
# prediction cell to encode a raw label.
unique_genders = figueres_df["gender"].unique()
coded_genders = {label: code for code, label in enumerate(unique_genders)}
figueres_df["gender"] = figueres_df["gender"].replace(coded_genders)

In [None]:
# Row count per country value, most frequent first.
figueres_df["country"].value_counts()

Costa Rica                                                                             33068
unknown                                                                                30096
Mexico                                                                                   847
Colombia                                                                                 688
Spain                                                                                    636
Nicaragua                                                                                334
Chile                                                                                    289
Panama                                                                                   220
Cuba                                                                                     166
Calle América, Colectora, Santurce, San Juan, Puerto Rico, United States of America      163
Ecuador                                                               

In [None]:
# Binary feature: 1 when the document is geocoded to Costa Rica, else 0.
# Computed as a single vectorized comparison. It must run while `country`
# still holds the raw names, i.e. before the cell that integer-codes it.
figueres_df["Costa Rica"] = (figueres_df["country"] == 'Costa Rica').astype(int)

In [None]:
# Integer-code every distinct country value; the lookup table is reused by
# the prediction cell to encode a raw country name.
unique_countries = figueres_df["country"].unique()
coded_countries = {place: code for code, place in enumerate(unique_countries)}
figueres_df["country"] = figueres_df["country"].replace(coded_countries)

In [None]:
from pysentimiento import create_analyzer
# Spanish sentiment analyzer; shared by the Figueres and Chaves sections.
sentiment_analyzer = create_analyzer(task="sentiment", lang="es")

In [None]:
# Run the analyzer over every text once, then split the per-class
# probabilities (POS / NEU / NEG) into three parallel lists.
outputs = list(sentiment_analyzer.predict(figueres_df["text"].to_list()))
figueres_prob_pos = [result.probas["POS"] for result in outputs]
figueres_prob_neu = [result.probas["NEU"] for result in outputs]
figueres_prob_neg = [result.probas["NEG"] for result in outputs]



  0%|          | 0/2105 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 67331
  Batch size = 32


In [None]:
# Collapse the three class probabilities into one scalar target:
#   score = 1*prob_pos + 0.5*prob_neu + 0*prob_neg
positive_part = np.asarray(figueres_prob_pos)
neutral_part = np.asarray(figueres_prob_neu) * 0.5
figueres_scores = positive_part + neutral_part
figueres_df["score"] = figueres_scores

In [None]:
# Preview again: categorical columns are now integer codes, and the
# "Costa Rica" flag and "score" target have been added.
figueres_df.head()

Unnamed: 0,text,source,gender,country,following,followers,like,share,reply,Costa Rica,score
0,Carta Abierta a la señora Vanessa Hernández Sá...,0,0,0,876,876,30,15,0,0,0.42963
1,@crhoycom Y de dónde pensará @figuerescr sacar...,1,1,1,111,111,14,2,0,0,0.002254
2,@crhoycom Vaya mierda @figuerescr,0,0,1,265,265,4,1,0,0,0.001656
3,"@FelipeGR2020 @crhoycom ""Sin embargo, agregó q...",0,1,1,93,93,0,0,0,0,0.589855
4,@FelipeGR2020 @crhoycom @figuerescr Ahh claro!...,1,1,1,111,111,0,0,0,0,0.108665


In [None]:
# Hold out 20% of the rows for evaluation. `random_state` pins the shuffle so
# the split, and every metric derived from it, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    figueres_df[["source", "gender", "country", "following", "followers",
                 "like", "share", "reply", "Costa Rica"]],
    figueres_df["score"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seed the forest so bootstrap sampling and feature selection, and therefore
# the fitted model and its predictions, are reproducible across runs.
figueres_model = RandomForestRegressor(random_state=42)
figueres_model.fit(X_train, y_train)

RandomForestRegressor()

In [None]:
# RMSE = sqrt(mean(residual**2)). The residuals must be squared *before*
# averaging; squaring the mean instead yields |mean error|, which lets
# positive and negative errors cancel and looks far too optimistic.

# Evaluate on training data
train_residuals = figueres_model.predict(X_train) - y_train
train_RMSE = np.sqrt(np.mean(train_residuals ** 2))
print(f'Train RMSE:\t{train_RMSE:.4f}')

# Evaluate on test data
test_residuals = figueres_model.predict(X_test) - y_test
test_RMSE = np.sqrt(np.mean(test_residuals ** 2))
print(f'Test RMSE:\t{test_RMSE:.4f}')

Train RMSE:	0.000551
Test RMSE:	0.0067


In [None]:
# Hypothetical document to score with the Figueres model.
source = 'Twitter for Android'
gender = 'male'
country = 'Costa Rica'
following = 50
followers = 0
like = 25
share = 10
reply = 3
costa_rica = 1 if country == 'Costa Rica' else 0

# The model was fitted on a DataFrame, so give it a one-row DataFrame with the
# same column names (same order as the training columns). A bare ndarray makes
# scikit-learn warn that X has no valid feature names.
# Raw categorical values are encoded with the lookup tables built above.
sample = pd.DataFrame(
    [[coded_sources[source], coded_genders[gender], coded_countries[country],
      following, followers, like, share, reply, costa_rica]],
    columns=figueres_model.feature_names_in_,
)
figueres_model.predict(sample)

  "X does not have valid feature names, but"


array([0.3232339])

Chaves

In [None]:
# Row count per posting client, most frequent first.
chaves_df["source"].value_counts()

Twitter for Android    46985
Twitter for iPhone     31843
Twitter Web App        20549
TweetDeck               1721
Echobox                  923
                       ...  
Fogos.pt                   1
radiosantafe               1
Tweepsmap                  1
williamsv76                1
El Markovian               1
Name: source, Length: 199, dtype: int64

In [None]:
# Integer-code every distinct posting client of the Chaves frame.
# This rebinds `coded_sources`, so from here on it holds the Chaves mapping.
unique_sources = chaves_df["source"].unique()
coded_sources = {name: code for code, name in enumerate(unique_sources)}
chaves_df["source"] = chaves_df["source"].replace(coded_sources)

In [None]:
# Row count per gender label, most frequent first.
chaves_df["gender"].value_counts()

unknown    46073
male       44145
female     15405
Name: gender, dtype: int64

In [None]:
# Integer-code the gender labels of the Chaves frame.
# This rebinds `coded_genders`, so from here on it holds the Chaves mapping.
unique_genders = chaves_df["gender"].unique()
coded_genders = {label: code for code, label in enumerate(unique_genders)}
chaves_df["gender"] = chaves_df["gender"].replace(coded_genders)

In [None]:
# Row count per country value, most frequent first.
chaves_df["country"].value_counts()

Costa Rica                                                                                      50722
unknown                                                                                         44925
Colombia                                                                                         2181
Mexico                                                                                           1572
Spain                                                                                            1308
Nicaragua                                                                                         632
Panama                                                                                            597
Ecuador                                                                                           559
Cuba                                                                                              550
Dominican Republic                                                                

In [None]:
# Binary feature: 1 when the document is geocoded to Costa Rica, else 0.
# Computed as a single vectorized comparison. It must run while `country`
# still holds the raw names, i.e. before the cell that integer-codes it.
chaves_df["Costa Rica"] = (chaves_df["country"] == 'Costa Rica').astype(int)

In [None]:
# Integer-code every distinct country value of the Chaves frame.
# This rebinds `coded_countries`, so from here on it holds the Chaves mapping.
unique_countries = chaves_df["country"].unique()
coded_countries = {place: code for code, place in enumerate(unique_countries)}
chaves_df["country"] = chaves_df["country"].replace(coded_countries)

In [None]:
# Run the analyzer over every text once, then split the per-class
# probabilities (POS / NEU / NEG) into three parallel lists.
outputs = list(sentiment_analyzer.predict(chaves_df["text"].to_list()))
chaves_prob_pos = [result.probas["POS"] for result in outputs]
chaves_prob_neu = [result.probas["NEU"] for result in outputs]
chaves_prob_neg = [result.probas["NEG"] for result in outputs]

  0%|          | 0/3301 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 105623
  Batch size = 32


In [None]:
# Collapse the three class probabilities into one scalar target:
#   score = 1*prob_pos + 0.5*prob_neu + 0*prob_neg
positive_part = np.asarray(chaves_prob_pos)
neutral_part = np.asarray(chaves_prob_neu) * 0.5
chaves_scores = positive_part + neutral_part
chaves_df["score"] = chaves_scores

In [None]:
# Hold out 20% of the rows for evaluation. `random_state` pins the shuffle so
# the split, and every metric derived from it, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    chaves_df[["source", "gender", "country", "following", "followers",
               "like", "share", "reply", "Costa Rica"]],
    chaves_df["score"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seed the forest so bootstrap sampling and feature selection, and therefore
# the fitted model and its predictions, are reproducible across runs.
chaves_model = RandomForestRegressor(random_state=42)
chaves_model.fit(X_train, y_train)

RandomForestRegressor()

In [None]:
# RMSE = sqrt(mean(residual**2)). The residuals must be squared *before*
# averaging; squaring the mean instead yields |mean error|, which lets
# positive and negative errors cancel and looks far too optimistic.

# Evaluate on training data
train_residuals = chaves_model.predict(X_train) - y_train
train_RMSE = np.sqrt(np.mean(train_residuals ** 2))
print(f'Train RMSE:\t{train_RMSE:.4f}')

# Evaluate on test data
test_residuals = chaves_model.predict(X_test) - y_test
test_RMSE = np.sqrt(np.mean(test_residuals ** 2))
print(f'Test RMSE:\t{test_RMSE:.4f}')

Train RMSE:	0.001184
Test RMSE:	0.0077


In [None]:
# Hypothetical document to score with the Chaves model.
source = 'Twitter for Android'
gender = 'male'
country = 'Costa Rica'
following = 50
followers = 0
like = 25
share = 10
reply = 3
costa_rica = 1 if country == 'Costa Rica' else 0

# The model was fitted on a DataFrame, so give it a one-row DataFrame with the
# same column names (same order as the training columns). A bare ndarray makes
# scikit-learn warn that X has no valid feature names.
# Raw categorical values are encoded with the Chaves lookup tables built above.
sample = pd.DataFrame(
    [[coded_sources[source], coded_genders[gender], coded_countries[country],
      following, followers, like, share, reply, costa_rica]],
    columns=chaves_model.feature_names_in_,
)
chaves_model.predict(sample)

  "X does not have valid feature names, but"


array([0.49286677])