# Data Preparation

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import warnings
import powerlaw
import matplotlib.colors as mcolors
import ast
import time

warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.0f}'.format


## Import .csv files + (very) basic EDA and data preparation
In this section we:
* read the four csv files (one per semester)
* concat all the five csv into a single one
* drop useless features


In [12]:
df_primo = pd.read_csv("../data_collection/data_biden/primo_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_secondo = pd.read_csv("../data_collection/data_biden/secondo_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_terzo = pd.read_csv("../data_collection/data_biden/terzo_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_quarto = pd.read_csv("../data_collection/data_biden/quarto_semestre.csv", na_filter=True, na_values='[]', lineterminator='\n')
df_quinto = pd.read_csv("../data_collection/data_biden/quinto_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')

In [13]:
df_primo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144711 entries, 0 to 144710
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       144711 non-null  int64  
 1   id               144711 non-null  int64  
 2   conversation_id  144711 non-null  int64  
 3   created_at       144711 non-null  object 
 4   date             144711 non-null  object 
 5   time             144711 non-null  object 
 6   timezone         144711 non-null  int64  
 7   user_id          144711 non-null  int64  
 8   username         144711 non-null  object 
 9   name             144705 non-null  object 
 10  place            128 non-null     object 
 11  tweet            144711 non-null  object 
 12  language         144711 non-null  object 
 13  mentions         31200 non-null   object 
 14  urls             38819 non-null   object 
 15  photos           21480 non-null   object 
 16  replies_count    144711 non-null  int6

In [14]:
df_secondo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147576 entries, 0 to 147575
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       147576 non-null  int64  
 1   id               147576 non-null  int64  
 2   conversation_id  147576 non-null  int64  
 3   created_at       147576 non-null  object 
 4   date             147576 non-null  object 
 5   time             147576 non-null  object 
 6   timezone         147576 non-null  int64  
 7   user_id          147576 non-null  int64  
 8   username         147576 non-null  object 
 9   name             147564 non-null  object 
 10  place            150 non-null     object 
 11  tweet            147576 non-null  object 
 12  language         147576 non-null  object 
 13  mentions         28416 non-null   object 
 14  urls             43072 non-null   object 
 15  photos           23848 non-null   object 
 16  replies_count    147576 non-null  int6

In [15]:
df_terzo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125386 entries, 0 to 125385
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       125386 non-null  int64  
 1   id               125386 non-null  int64  
 2   conversation_id  125386 non-null  int64  
 3   created_at       125386 non-null  object 
 4   date             125386 non-null  object 
 5   time             125386 non-null  object 
 6   timezone         125386 non-null  int64  
 7   user_id          125386 non-null  int64  
 8   username         125386 non-null  object 
 9   name             125134 non-null  object 
 10  place            128 non-null     object 
 11  tweet            125386 non-null  object 
 12  language         125386 non-null  object 
 13  mentions         25808 non-null   object 
 14  urls             50035 non-null   object 
 15  photos           24690 non-null   object 
 16  replies_count    125386 non-null  int6

In [16]:
df_quarto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132830 entries, 0 to 132829
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       132830 non-null  int64  
 1   id               132830 non-null  int64  
 2   conversation_id  132830 non-null  int64  
 3   created_at       132830 non-null  object 
 4   date             132830 non-null  object 
 5   time             132830 non-null  object 
 6   timezone         132830 non-null  int64  
 7   user_id          132830 non-null  int64  
 8   username         132830 non-null  object 
 9   name             132541 non-null  object 
 10  place            86 non-null      object 
 11  tweet            132830 non-null  object 
 12  language         132830 non-null  object 
 13  mentions         25237 non-null   object 
 14  urls             46863 non-null   object 
 15  photos           21605 non-null   object 
 16  replies_count    132830 non-null  int6

In [17]:
df_quinto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123991 entries, 0 to 123990
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       123991 non-null  int64  
 1   id               123991 non-null  int64  
 2   conversation_id  123991 non-null  int64  
 3   created_at       123991 non-null  object 
 4   date             123991 non-null  object 
 5   time             123991 non-null  object 
 6   timezone         123991 non-null  int64  
 7   user_id          123991 non-null  int64  
 8   username         123991 non-null  object 
 9   name             123929 non-null  object 
 10  place            42 non-null      object 
 11  tweet            123991 non-null  object 
 12  language         123991 non-null  object 
 13  mentions         22315 non-null   object 
 14  urls             39464 non-null   object 
 15  photos           20054 non-null   object 
 16  replies_count    123991 non-null  int6

In [18]:
joinDF = [df_primo, df_secondo, df_terzo, df_quarto, df_quinto]

In [19]:
df = pd.concat(joinDF)


In [20]:
df

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,0,1218684471211245568,1218684471211245568,2020-01-18 23:59:54+00:00,2020-01-18,23:59:54,100,90497917,davidschor,David Schor (Biden=#BlueTrump) 💚,...,,,,,,,,,,
1,1,1218684453494632449,1218663554250727424,2020-01-18 23:59:50+00:00,2020-01-18,23:59:50,100,175711368,trayntp,Biden is the More Effective Evil,...,,,,,,"[{'screen_name': 'number1fan_2', 'name': 'Davi...",,,,
2,4,1218683884994473984,1218678939947864064,2020-01-18 23:57:34+00:00,2020-01-18,23:57:34,100,967521853,badphotography_,Robby,...,,,,,,"[{'screen_name': 'WoobieTuesday', 'name': 'Woo...",,,,
3,5,1218683765674987525,1218683765674987520,2020-01-18 23:57:06+00:00,2020-01-18,23:57:06,100,1199496918029742083,glendowek42,Camigo Lordotic⚙️🐛,...,,,,,,,,,,
4,6,1218683747417120768,1218683747417120768,2020-01-18 23:57:02+00:00,2020-01-18,23:57:02,100,543761210,mimitexasangel,Cannabis food is our Healthcare🍃🌿🌱🌳🍏,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123986,1007,1479472860208844802,1479472860208844800,2022-01-07 15:19:51+00:00,2022-01-07,15:19:51,100,1062465110,johngililland,John Gililland,...,,,,,,,,,,
123987,1008,1479472457828478976,1479472457828478976,2022-01-07 15:18:15+00:00,2022-01-07,15:18:15,100,942241440855343104,ps9714,The Great War & Modern Memory,...,,,,,,,,,,
123988,1009,1479472401763258374,1479439038423597056,2022-01-07 15:18:02+00:00,2022-01-07,15:18:02,100,1267982563,howserob,Rob Howse,...,,,,,,"[{'screen_name': 'lettywho1', 'name': 'Dr Nazr...",,,,
123989,1010,1479472233307328517,1479472233307328512,2022-01-07 15:17:22+00:00,2022-01-07,15:17:22,100,1414321685835177985,sheilachicago,Sheila,...,,,,,,,,,,


In [21]:
del df["Unnamed: 0"]

In [22]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 674494 entries, 0 to 123990
Data columns (total 36 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               674494 non-null  int64  
 1   conversation_id  674494 non-null  int64  
 2   created_at       674494 non-null  object 
 3   date             674494 non-null  object 
 4   time             674494 non-null  object 
 5   timezone         674494 non-null  int64  
 6   user_id          674494 non-null  int64  
 7   username         674494 non-null  object 
 8   name             673873 non-null  object 
 9   place            534 non-null     object 
 10  tweet            674494 non-null  object 
 11  language         674494 non-null  object 
 12  mentions         132976 non-null  object 
 13  urls             218253 non-null  object 
 14  photos           111677 non-null  object 
 15  replies_count    674494 non-null  int64  
 16  retweets_count   674494 non-null  int6

In [23]:
len(df["user_id"].unique())

162636

In [24]:
len(df["conversation_id"].value_counts(normalize=True))

636485

In [25]:
df_definitivo = df.drop(["retweet_date", "translate", "trans_src", "trans_dest", "near", "geo", "source", "user_rt_id","user_rt", "retweet_id", "place", "thumbnail", "video", "language", "photos" , "created_at", "timezone", "cashtags", "urls", "retweet", "quote_url"], axis=1)

In [26]:
df_definitivo.reset_index(inplace=True)

In [27]:
df_definitivo

Unnamed: 0,index,id,conversation_id,date,time,user_id,username,name,tweet,mentions,replies_count,retweets_count,likes_count,hashtags,link,reply_to
0,0,1218684471211245568,1218684471211245568,2020-01-18,23:59:54,90497917,davidschor,David Schor (Biden=#BlueTrump) 💚,#JoeBiden accusing the Sanders campaign of dis...,,0,1,2,"['joebiden', 'berniebreathesfire', 'thetruthbe...",https://twitter.com/davidschor/status/12186844...,
1,1,1218684453494632449,1218663554250727424,2020-01-18,23:59:50,175711368,trayntp,Biden is the More Effective Evil,@number1fan_2 @BMarchetich There are 2 other v...,,0,1,0,['joebiden'],https://twitter.com/trayNTP/status/12186844534...,"[{'screen_name': 'number1fan_2', 'name': 'Davi..."
2,2,1218683884994473984,1218678939947864064,2020-01-18,23:57:34,967521853,badphotography_,Robby,@WoobieTuesday @HeatherGautney @JoeBiden @Bern...,,0,8,14,"['joebiden', 'bidensocialsecuritycuts']",https://twitter.com/BadPhotography_/status/121...,"[{'screen_name': 'WoobieTuesday', 'name': 'Woo..."
3,3,1218683765674987525,1218683765674987520,2020-01-18,23:57:06,1199496918029742083,glendowek42,Camigo Lordotic⚙️🐛,"Democrats: ""#Trump is mentally unfit for being...",,0,0,0,"['trump', 'joebiden', 'hilarious', 'hypocrisy'...",https://twitter.com/GlenDowek42/status/1218683...,
4,4,1218683747417120768,1218683747417120768,2020-01-18,23:57:02,543761210,mimitexasangel,Cannabis food is our Healthcare🍃🌿🌱🌳🍏,@DNC needs to reign in #JoeBiden ~ He can not ...,,0,0,0,"['joebiden', 'lie', 'democratic', 'voters', 'v...",https://twitter.com/MimiTexasAngel/status/1218...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674489,123986,1479472860208844802,1479472860208844800,2022-01-07,15:19:51,1062465110,johngililland,John Gililland,This looks like an SNL set. My how the resol...,,0,0,0,"['joebiden', 'shameful']",https://twitter.com/JohnGililland/status/14794...,
674490,123987,1479472457828478976,1479472457828478976,2022-01-07,15:18:15,942241440855343104,ps9714,The Great War & Modern Memory,Did you miss President Biden's stunning speech...,,0,4,11,"['biden', 'democracy']",https://twitter.com/ps9714/status/147947245782...,
674491,123988,1479472401763258374,1479439038423597056,2022-01-07,15:18:02,1267982563,howserob,Rob Howse,@lettywho1 @POTUS @WhiteHouse @StateDept As yo...,"[{'screen_name': 'rutiteitel', 'name': 'ruti t...",0,1,0,['biden'],https://twitter.com/howserob/status/1479472401...,"[{'screen_name': 'lettywho1', 'name': 'Dr Nazr..."
674492,123989,1479472233307328517,1479472233307328512,2022-01-07,15:17:22,1414321685835177985,sheilachicago,Sheila,"People who critique #Biden perplex me. Like, w...",,0,0,0,['biden'],https://twitter.com/SheilaChicago/status/14794...,


In [28]:
del df_definitivo["index"]

In [29]:
df_definitivo = df_definitivo.dropna()

In [30]:
(df_definitivo["date"] <= "2020-05-25").value_counts()

False    14294
True      5364
Name: date, dtype: int64

In [31]:
df_definitivo["retweets_count"].describe()

count   19658
mean        1
std        10
min         0
25%         0
50%         0
75%         0
max       733
Name: retweets_count, dtype: float64

In [32]:
len(df_definitivo["user_id"].unique())

8581

In [33]:
df_definitivo["reply_to"]

13        [{'screen_name': 'peterschweizer', 'name': 'Pe...
21        [{'screen_name': 'notonboard', 'name': 'Maury ...
25        [{'screen_name': 'JoyceWhiteVance', 'name': 'J...
26        [{'screen_name': 'MonaSalama_', 'name': 'Mona ...
36        [{'screen_name': 'hollyotterbein', 'name': 'Ho...
                                ...                        
673914    [{'screen_name': 'CharlieCrist', 'name': 'Char...
673975    [{'screen_name': 'POTUS', 'name': 'President B...
674101    [{'screen_name': 'TomKattman', 'name': 'Tom Ka...
674200    [{'screen_name': 'MarketRebels', 'name': 'Mark...
674491    [{'screen_name': 'lettywho1', 'name': 'Dr Nazr...
Name: reply_to, Length: 19658, dtype: object

In [34]:
df_definitivo.reset_index(inplace=True)

In [35]:
del df_definitivo["index"]

In [85]:
df_definitivo

Unnamed: 0,id,conversation_id,date,time,user_id,username,name,tweet,mentions,replies_count,retweets_count,likes_count,hashtags,link,reply_to
0,1218680146376429574,1218643531058884608,2020-01-18,23:42:43,738486583725305857,pamnsc,🇺🇸 Pamnsc 🇮🇱,@peterschweizer Hey @scdp @WhipClyburn @harri...,"[{'screen_name': 'scdp', 'name': 'south caroli...",0,1,0,"['biden', 'charleston']",https://twitter.com/pamnsc/status/121868014637...,"[{'screen_name': 'peterschweizer', 'name': 'Pe..."
1,1218677328395538432,1218652137988730880,2020-01-18,23:31:31,175711368,trayntp,Biden is the More Effective Evil,@notonboard @GunnelsWarren @GetRealWBernie Not...,"[{'screen_name': 'politifact', 'name': 'politi...",2,0,1,['joebiden'],https://twitter.com/trayNTP/status/12186773283...,"[{'screen_name': 'notonboard', 'name': 'Maury ..."
2,1218675784912703488,1218252588371652608,2020-01-18,23:25:23,1184244724301008898,klima_jean,Christopher Tyler,@JoyceWhiteVance #Politico 1st laid out that #...,"[{'screen_name': 'democrats', 'name': 'democra...",0,0,0,"['politico', 'ukraine', 'trump', 'giuliani', '...",https://twitter.com/klima_jean/status/12186757...,"[{'screen_name': 'JoyceWhiteVance', 'name': 'J..."
3,1218675672844914688,1218673452690935808,2020-01-18,23:24:56,3418384394,edenfieldpl,CommunityCorner,@MonaSalama_ Looks like #Biden must be 1st Wit...,"[{'screen_name': 'whitehouse', 'name': 'the wh...",0,0,0,"['biden', 'china', 'iran', 'russia', 'ukraine']",https://twitter.com/EdenfieldPl/status/1218675...,"[{'screen_name': 'MonaSalama_', 'name': 'Mona ..."
4,1218674539934244864,1218665950532317184,2020-01-18,23:20:26,7334712,ctman1,Stephen BERNIE Herron,"@hollyotterbein Remember, the media hasn’t bee...","[{'screen_name': 'hollyotterbein', 'name': 'ho...",0,0,1,"['biden', 'bernie2020']",https://twitter.com/CtMan1/status/121867453993...,"[{'screen_name': 'hollyotterbein', 'name': 'Ho..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19653,1479552880310833153,1479524598991491072,2022-01-07,20:37:50,1365444820357967874,thekennykarma,💎 ✋ $inging Ape - Ξξ variant,@CharlieCrist @GovRonDeSantis So like a typica...,"[{'screen_name': 'potus', 'name': 'president b...",1,0,4,['biden'],https://twitter.com/thekennykarma/status/14795...,"[{'screen_name': 'CharlieCrist', 'name': 'Char..."
19654,1479539892648652800,1479256472630579200,2022-01-07,19:46:13,2825786045,runeaglesrun,Rick,@POTUS Coming from the person who authorized t...,"[{'screen_name': 'whcos', 'name': 'ronald klai...",0,0,0,"['biden', 'bidenspeech']",https://twitter.com/runeaglesrun/status/147953...,"[{'screen_name': 'POTUS', 'name': 'President B..."
19655,1479521180730146828,1479516741696995328,2022-01-07,18:31:52,531563448,katerina5k,🍊Katerina5k🍊MAGAGang🍊,"@TomKattman I may have stolen ""Polident #Biden...","[{'screen_name': 'greggutfeld', 'name': 'gregg...",1,0,1,['biden'],https://twitter.com/katerina5k/status/14795211...,"[{'screen_name': 'TomKattman', 'name': 'Tom Ka..."
19656,1479506373952032772,1479494789980561408,2022-01-07,17:33:02,75077990,chattahoocheejb,Jeff Barrett,@MarketRebels I know @MarketRebels only fancie...,"[{'screen_name': 'marketrebels', 'name': 'mark...",0,0,0,['joebiden'],https://twitter.com/ChattahoocheeJB/status/147...,"[{'screen_name': 'MarketRebels', 'name': 'Mark..."


## Network Preprocessing
* we preprocess the data in order to extract the relationships between users using the _reply_to_ feature
* then we create the edgelist and we assign to each interaction, a weight

In [36]:
df_definitivo['reply_to'] = df_definitivo['reply_to'].map(eval)

In [37]:
tupla = []

In [38]:
for index in range(0, len(df_definitivo)):
    for dizionario in df_definitivo['reply_to'][index]:
 #     print("Mittente ",df_definitivo['user_id'][index], "id ", dizionario['id'])
      tupla.append((df_definitivo['username'][index], df_definitivo["user_id"][index], dizionario['screen_name'], dizionario["id"]))
        


        
        
df_def = pd.DataFrame(tupla, columns = ["source", "id_source","target", "id_target"])

In [39]:
df_def

Unnamed: 0,source,id_source,target,id_target
0,pamnsc,738486583725305857,peterschweizer,62876780
1,trayntp,175711368,notonboard,69141933
2,trayntp,175711368,GunnelsWarren,2725669979
3,trayntp,175711368,GetRealWBernie,1187166484226891776
4,klima_jean,1184244724301008898,JoyceWhiteVance,548384458
...,...,...,...,...
38329,chattahoocheejb,75077990,MarketRebels,817007725666242561
38330,howserob,1267982563,lettywho1,1654442893
38331,howserob,1267982563,POTUS,1349149096909668363
38332,howserob,1267982563,WhiteHouse,1323730225067339784


In [40]:
df_def.to_csv("../data_collection/data/df_per_grafo.csv")

In [41]:
df_def = pd.read_csv("../data_collection/data/df_per_grafo.csv")


In [49]:
df_def

Unnamed: 0,source,id_source,target,id_target
0,pamnsc,738486583725305857,peterschweizer,62876780
1,trayntp,175711368,notonboard,69141933
2,trayntp,175711368,GunnelsWarren,2725669979
3,trayntp,175711368,GetRealWBernie,1187166484226891776
4,klima_jean,1184244724301008898,JoyceWhiteVance,548384458
...,...,...,...,...
38329,chattahoocheejb,75077990,MarketRebels,817007725666242561
38330,howserob,1267982563,lettywho1,1654442893
38331,howserob,1267982563,POTUS,1349149096909668363
38332,howserob,1267982563,WhiteHouse,1323730225067339784


In [50]:
del df_def["Unnamed: 0"]

KeyError: 'Unnamed: 0'

In [51]:
df_def_weighted = df_def.value_counts().to_frame("weights").reset_index() 
#il peso è dato da quante volte un nodo ha risposto a un altro nodo

In [52]:
df_def_weighted

Unnamed: 0,source,id_source,target,id_target,weights
0,scienceinvestme,135970150,JoeBiden,939091,60
1,elnurrik3,1172766864268832769,POTUS,1349149096909668363,40
2,dmaga101,1267802238874877953,JoeBiden,939091,40
3,slothsforme,3842652433,POTUS,1349149096909668363,30
4,natashaejs,1262462710568022017,ProjectLincoln,1205226529455632385,28
...,...,...,...,...,...
32434,goldbaron08,2661167788,CHIZMAGA,34756550,1
32435,goldbaron08,2661167788,1JohnEBravo,973674483151237121,1
32436,goinglikesixty,126478904,kaitlancollins,180107694,1
32437,goinglikesixty,126478904,carolstam7,326098537,1


* normalize the weight value

In [53]:
df_def_weighted["weights"] = (df_def_weighted["weights"] - (df_def_weighted["weights"]).min())/((df_def_weighted["weights"].max()-(df_def_weighted["weights"]).min() ))

In [142]:
df_def_weighted.to_csv("../data_collection/data_biden/df_per_grafo_pesato.csv")

## Data preprocessing for DCD

* we also prepared a dataset containing the quarter information


In [99]:
df_primo = pd.read_csv("../data_collection/data_biden/primo_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_secondo = pd.read_csv("../data_collection/data_biden/secondo_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_terzo = pd.read_csv("../data_collection/data_biden/terzo_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_quarto = pd.read_csv("../data_collection/data_biden/quarto_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')
df_quinto = pd.read_csv("../data_collection/data_biden/quinto_semestre.csv", na_filter=True, na_values='[]',lineterminator='\n')




In [100]:
del df_primo["Unnamed: 0"]
del df_secondo["Unnamed: 0"]
del df_terzo["Unnamed: 0"]
del df_quarto["Unnamed: 0"]
del df_quinto["Unnamed: 0"]

In [101]:
df_definitivo_1 = df_primo.drop(["retweet_date", "translate", "trans_src", "trans_dest", "near", "geo", "source", "user_rt_id","user_rt", "retweet_id", "place", "thumbnail", "video", "language", "photos" , "created_at", "timezone", "cashtags", "urls", "retweet", "quote_url"], axis=1)
df_definitivo_2 = df_secondo.drop(["retweet_date", "translate", "trans_src", "trans_dest", "near", "geo", "source", "user_rt_id","user_rt", "retweet_id", "place", "thumbnail", "video", "language", "photos" , "created_at", "timezone", "cashtags", "urls", "retweet", "quote_url"], axis=1)
df_definitivo_3 = df_terzo.drop(["retweet_date", "translate", "trans_src", "trans_dest", "near", "geo", "source", "user_rt_id","user_rt", "retweet_id", "place", "thumbnail", "video", "language", "photos" , "created_at", "timezone", "cashtags", "urls", "retweet", "quote_url"], axis=1)
df_definitivo_4 = df_quarto.drop(["retweet_date", "translate", "trans_src", "trans_dest", "near", "geo", "source", "user_rt_id","user_rt", "retweet_id", "place", "thumbnail", "video", "language", "photos" , "created_at", "timezone", "cashtags", "urls", "retweet", "quote_url"], axis=1)
df_definitivo_5 = df_quinto.drop(["retweet_date", "translate", "trans_src", "trans_dest", "near", "geo", "source", "user_rt_id","user_rt", "retweet_id", "place", "thumbnail", "video", "language", "photos" , "created_at", "timezone", "cashtags", "urls", "retweet", "quote_url"], axis=1)

In [102]:
df_definitivo_1.reset_index(inplace=True)
df_definitivo_2.reset_index(inplace=True)
df_definitivo_3.reset_index(inplace=True)
df_definitivo_4.reset_index(inplace=True)
df_definitivo_5.reset_index(inplace=True)

In [104]:
del df_definitivo_1["index"]
del df_definitivo_2["index"]
del df_definitivo_3["index"]
del df_definitivo_4["index"]
del df_definitivo_5["index"]

In [106]:
df_definitivo_1 = df_definitivo_1.dropna()
df_definitivo_2 = df_definitivo_2.dropna()
df_definitivo_3 = df_definitivo_3.dropna()
df_definitivo_4 = df_definitivo_4.dropna()
df_definitivo_5 = df_definitivo_5.dropna()

In [127]:
df_definitivo_1['reply_to'] = df_definitivo_1['reply_to'].map(eval)
df_definitivo_2['reply_to'] = df_definitivo_2['reply_to'].map(eval)
df_definitivo_3['reply_to'] = df_definitivo_3['reply_to'].map(eval)
df_definitivo_4['reply_to'] = df_definitivo_4['reply_to'].map(eval)
df_definitivo_5['reply_to'] = df_definitivo_5['reply_to'].map(eval)

In [128]:
df_definitivo_1.reset_index(inplace=True)
df_definitivo_2.reset_index(inplace=True)
df_definitivo_3.reset_index(inplace=True)
df_definitivo_4.reset_index(inplace=True)
df_definitivo_5.reset_index(inplace=True)

In [129]:
del df_definitivo_1["index"]
del df_definitivo_2["index"]
del df_definitivo_3["index"]
del df_definitivo_4["index"]
del df_definitivo_5["index"]

In [130]:
df_definitivo_1

Unnamed: 0,id,conversation_id,date,time,user_id,username,name,tweet,mentions,replies_count,retweets_count,likes_count,hashtags,link,reply_to
0,1218680146376429574,1218643531058884608,2020-01-18,23:42:43,738486583725305857,pamnsc,🇺🇸 Pamnsc 🇮🇱,@peterschweizer Hey @scdp @WhipClyburn @harri...,"[{'screen_name': 'scdp', 'name': 'south caroli...",0,1,0,"['biden', 'charleston']",https://twitter.com/pamnsc/status/121868014637...,"[{'screen_name': 'peterschweizer', 'name': 'Pe..."
1,1218677328395538432,1218652137988730880,2020-01-18,23:31:31,175711368,trayntp,Biden is the More Effective Evil,@notonboard @GunnelsWarren @GetRealWBernie Not...,"[{'screen_name': 'politifact', 'name': 'politi...",2,0,1,['joebiden'],https://twitter.com/trayNTP/status/12186773283...,"[{'screen_name': 'notonboard', 'name': 'Maury ..."
2,1218675784912703488,1218252588371652608,2020-01-18,23:25:23,1184244724301008898,klima_jean,Christopher Tyler,@JoyceWhiteVance #Politico 1st laid out that #...,"[{'screen_name': 'democrats', 'name': 'democra...",0,0,0,"['politico', 'ukraine', 'trump', 'giuliani', '...",https://twitter.com/klima_jean/status/12186757...,"[{'screen_name': 'JoyceWhiteVance', 'name': 'J..."
3,1218675672844914688,1218673452690935808,2020-01-18,23:24:56,3418384394,edenfieldpl,CommunityCorner,@MonaSalama_ Looks like #Biden must be 1st Wit...,"[{'screen_name': 'whitehouse', 'name': 'the wh...",0,0,0,"['biden', 'china', 'iran', 'russia', 'ukraine']",https://twitter.com/EdenfieldPl/status/1218675...,"[{'screen_name': 'MonaSalama_', 'name': 'Mona ..."
4,1218674539934244864,1218665950532317184,2020-01-18,23:20:26,7334712,ctman1,Stephen BERNIE Herron,"@hollyotterbein Remember, the media hasn’t bee...","[{'screen_name': 'hollyotterbein', 'name': 'ho...",0,0,1,"['biden', 'bernie2020']",https://twitter.com/CtMan1/status/121867453993...,"[{'screen_name': 'hollyotterbein', 'name': 'Ho..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6705,1264266633008361480,1263953059882905600,2020-05-23,19:47:08,2156028248,_not_relevant,Greg Carlsen,@davidnmartin @RexChapman That might be the BE...,"[{'screen_name': 'realdonaldtrump', 'name': 'd...",0,0,0,['biden'],https://twitter.com/_Not_Relevant/status/12642...,"[{'screen_name': 'davidnmartin', 'name': 'Davi..."
6706,1264266065523154946,1263997603693203456,2020-05-23,19:44:53,901935932122824705,daletanner18,1-65 trump'sCourtRecord,@BrunoAmato_1 Biden should be more deliberate ...,"[{'screen_name': 'joebiden', 'name': 'joe bide...",0,0,0,['joebiden'],https://twitter.com/DaleTanner18/status/126426...,"[{'screen_name': 'BrunoAmato_1', 'name': 'Brun..."
6707,1264265259361153025,1264208610671943680,2020-05-23,19:41:41,1257055044341555201,legs781724,legs412,@CJ4ShortPlz @PamEllaCasual @jeffsessions @jda...,"[{'screen_name': 'joebiden', 'name': 'joe bide...",1,0,0,"['dementiajoe', 'joebiden']",https://twitter.com/legs781724/status/12642652...,"[{'screen_name': 'PamEllaCasual', 'name': 'Pam..."
6708,1264264614516326402,1264256420951871488,2020-05-23,19:39:07,824378302693576704,1956again,Roxy Schwarz,@SadieTNResist Don't worry @realDonaldTrump . ...,"[{'screen_name': 'realdonaldtrump', 'name': 'd...",0,1,2,"['bluewave2020', 'votebluetosaveamerica', 'blu...",https://twitter.com/1956again/status/126426461...,"[{'screen_name': 'SadieTNResist', 'name': '👠💋 ..."


In [131]:
lista1 = []
lista2 = []
lista3 = []
lista4 = []
lista5 = []

In [136]:
for index in range(0, len(df_definitivo_1)):
    for dizionario in df_definitivo_1['reply_to'][index]:
        lista1.append((df_definitivo_1['username'][index], df_definitivo_1["user_id"][index], dizionario['screen_name'], dizionario["id"], 1))
        

        
df_primoSemestre = pd.DataFrame(lista1, columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "semestre"])



In [137]:
for index in range(0, len(df_definitivo_2)):
    for dizionario in df_definitivo_2['reply_to'][index]:
        lista2.append((df_definitivo_2['username'][index], df_definitivo_2["user_id"][index], dizionario['screen_name'], dizionario["id"], 2))
        


df_secondoSemestre = pd.DataFrame(lista2, columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "semestre"])

for index in range(0, len(df_definitivo_3)):
    for dizionario in df_definitivo_3['reply_to'][index]:
        lista3.append((df_definitivo_3['username'][index], df_definitivo_3["user_id"][index], dizionario['screen_name'], dizionario["id"], 3))
        

df_terzoSemestre = pd.DataFrame(lista3, columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "semestre"])

for index in range(0, len(df_definitivo_4)):
    for dizionario in df_definitivo_4['reply_to'][index]:
        lista4.append((df_definitivo_4['username'][index], df_definitivo_4["user_id"][index], dizionario['screen_name'], dizionario["id"], 4))
        
df_quartoSemestre = pd.DataFrame(lista4, columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "semestre"])


for index in range(0, len(df_definitivo_5)):
    for dizionario in df_definitivo_5['reply_to'][index]:
        lista5.append((df_definitivo_5['username'][index], df_definitivo_5["user_id"][index], dizionario['screen_name'], dizionario["id"], 5))
        
df_quintoSemestre = pd.DataFrame(lista5, columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "semestre"])



In [138]:
df_primoSemestre_w = df_primoSemestre.value_counts().to_frame("weights").reset_index()
df_secondoSemestre_w = df_secondoSemestre.value_counts().to_frame("weights").reset_index()
df_terzoSemestre_w = df_terzoSemestre.value_counts().to_frame("weights").reset_index()
df_quartoSemestre_w = df_quartoSemestre.value_counts().to_frame("weights").reset_index()
df_quintoSemestre_w = df_quintoSemestre.value_counts().to_frame("weights").reset_index()

In [139]:
df_primoSemestre_w["weights"] = (df_primoSemestre_w["weights"] - (df_primoSemestre_w["weights"]).min())/((df_primoSemestre_w["weights"].max()-(df_primoSemestre_w["weights"]).min() ))
df_secondoSemestre_w["weights"] = (df_secondoSemestre_w["weights"] - (df_secondoSemestre_w["weights"]).min())/((df_secondoSemestre_w["weights"].max()-(df_secondoSemestre_w["weights"]).min() ))
df_terzoSemestre_w["weights"] = (df_terzoSemestre_w["weights"] - (df_terzoSemestre_w["weights"]).min())/((df_terzoSemestre_w["weights"].max()-(df_terzoSemestre_w["weights"]).min() ))
df_quartoSemestre_w["weights"] = (df_quartoSemestre_w["weights"] - (df_quartoSemestre_w["weights"]).min())/((df_quartoSemestre_w["weights"].max()-(df_quartoSemestre_w["weights"]).min() ))
df_quintoSemestre_w["weights"] = (df_quintoSemestre_w["weights"] - (df_quintoSemestre_w["weights"]).min())/((df_quintoSemestre_w["weights"].max()-(df_quintoSemestre_w["weights"]).min() ))



In [141]:
df_primoSemestre_w.to_csv("../data_collection/data_biden/df_primoSemestre.csv")
df_secondoSemestre_w.to_csv("../data_collection/data_biden/df_secondoSemestre.csv")
df_terzoSemestre_w.to_csv("../data_collection/data_biden/df_terzoSemestre.csv")
df_quartoSemestre_w.to_csv("../data_collection/data_biden/df_quartoSemestre.csv")
df_quintoSemestre_w.to_csv("../data_collection/data_biden/df_quintoSemestre.csv")