## Obtaining three samples from edges list and extracting nodes relevant to each sample
#### Our full nodes and edges data had difficulty running in network visualizations, so we split the edges into samples to solve that issue

In [1]:
# Imports
import pandas as pd
import numpy as np

## 1.) Obtaining three edges samples

In [2]:
# Mapping used to rename the 'User Mentioned' column: its header contains a space,
# so it is replaced with 'User_Mentioned' to make the column easier to call
column_name_update_map = {'User Mentioned': 'User_Mentioned'}

# Read the original edges list and apply the rename in one step
Edges = pd.read_csv('Mentions_Edges_1.csv').rename(columns=column_name_update_map)
Edges

# Some edges are duplicates: this indicates that a user mentioned another user more than once in the dataset,
# not that the same edge was incorrectly counted twice

Unnamed: 0,User,User_Mentioned
0,102762759,36042554
1,102762759,65357102
2,102762759,65357102
3,1010056669156593664,932163222
4,2844670706,2844670706
...,...,...
24038,1149271028390146048,36042554
24039,937778257952432128,930741464
24040,12848262,23340760
24041,1015073514,10228272


In [3]:
# Shuffling the rows of the Edges df to randomize the split; random_state is fixed so that
# the shuffle (and therefore every sample) is the same each time the notebook is run
# The shuffled df is then split into three (near-)equal parts, which are assigned to three dataframes:
# Edges_S1, Edges_S2, and Edges_S3

shuffled = Edges.sample(frac=1, random_state=1)

# Split the row positions instead of the dataframe itself: calling np.array_split directly on a
# DataFrame goes through DataFrame.swapaxes, which is deprecated in recent pandas versions
# The part sizes are identical to what np.array_split(shuffled, 3) would produce
row_positions = np.array_split(np.arange(len(shuffled)), 3)
Edges_S1, Edges_S2, Edges_S3 = (shuffled.iloc[rows] for rows in row_positions)

In [4]:
# First edges sample
# The index shown is still the shuffled row label from Edges; it is reset in a later cell

Edges_S1

Unnamed: 0,User,User_Mentioned
6868,3008024567,10228272
3202,54192707,1004436543203930113
19756,201230538,36042554
5344,1123512298029510656,1579680481
20674,912680136725499904,36042554
...,...,...
2113,2363291286,36042554
8341,306610942,306610942
8018,961821847556800519,23742474
2595,773230604133564416,14224719


In [5]:
# Second edges sample
# The index shown is still the shuffled row label from Edges; it is reset in a later cell

Edges_S2

Unnamed: 0,User,User_Mentioned
21829,24842995,19489239
22020,68479592,18464266
14196,124257401,177583133
1178,760240772256002048,1245843601390817281
317,1032905456967200769,2187982011
...,...,...
6428,1286364022405767170,10228272
11486,805529227147628544,1833919200
2632,1475418086,44900997
15080,1538898078391947265,20536157


In [6]:
# Third edges sample
# The index shown is still the shuffled row label from Edges; it is reset in a later cell

Edges_S3

Unnamed: 0,User,User_Mentioned
1396,549443190,44384836
3802,1409077015680933889,125211600
15185,890895602,890895602
12860,1265294617852985344,36042554
11607,1430842471458738180,2812768561
...,...,...
10955,1527793969157087232,1369338106021896205
17289,1227815976441245696,10228272
5192,3342698680,3342698680
12172,1306180270383652865,36042554


In [7]:
# Replace the shuffled row labels of every edges sample with a fresh 0..n-1 index
# (drop=True discards the old labels instead of keeping them as a column)

Edges_S1, Edges_S2, Edges_S3 = (
    sample.reset_index(drop=True)
    for sample in (Edges_S1, Edges_S2, Edges_S3)
)

In [8]:
# Exporting three edges samples to CSV files

# Edges_S1.to_csv('Sample_1_Edges.csv',encoding='utf-8-sig',index=False)
# Edges_S2.to_csv('Sample_2_Edges.csv',encoding='utf-8-sig',index=False)
# Edges_S3.to_csv('Sample_3_Edges.csv',encoding='utf-8-sig',index=False)

## 2.) Obtaining nodes list for each of the three edges samples

In [9]:
# Turn the User column (first column) of each edges sample into a plain Python list

User_List_1, User_List_2, User_List_3 = (
    sample['User'].to_numpy().tolist()
    for sample in (Edges_S1, Edges_S2, Edges_S3)
)

In [10]:
# Turn the User_Mentioned column (second column) of each edges sample into a plain Python list

User_Mentioned_List_1, User_Mentioned_List_2, User_Mentioned_List_3 = (
    sample['User_Mentioned'].to_numpy().tolist()
    for sample in (Edges_S1, Edges_S2, Edges_S3)
)

In [11]:
# Load in the nodes list
# Original list Nodes_Details from cleaning was edited in Excel to include engagement classifications for the three metrics
# Formulas were used to classify each user's engagement level whether it be in replies, retweets, or likes

# Replies formula classified the following:
# 0 replies: "E"
# 1 - 100 replies: "D"
# 101 - 500 replies: "C"
# 501 - 1000 replies: "B"
# >1000 replies: "A"

# Retweets formula classified the following:
# 0 retweets: "E"
# 1 - 100 retweets: "D"
# 101 - 500 retweets: "C"
# 501 - 1000 retweets: "B"
# >1000 retweets: "A"

# Likes formula classified the following:
# 0 likes: "E"
# 1 - 100 likes: "D"
# 101 - 1000 likes: "C"
# 1001 - 10000 likes: "B"
# >10000 likes: "A"

# Alternative input file, kept commented out for reference:
# Nodes = pd.read_csv('Nodes_Details_Categorized_1.csv')

# Read the nodes list that includes the engagement classification columns
Nodes = pd.read_csv('Nodes_Details_Classifications.csv')
Nodes

Unnamed: 0,user_id,Screen Name,replies_count,retweets_count,likes_count,Replies_C,Retweets_C,Likes_C
0,1375691865324855302,st_matts_bells,0,1,4,E,D,D
1,1527887461271097344,mujangscare,0,0,1,E,E,D
2,1133815701448417280,history_kings,0,3,25,E,D,D
3,956670209695166464,kingsbench_ab,0,12,11,E,D,D
4,1549820637010706436,jonafirst_mays,0,0,0,E,E,E
...,...,...,...,...,...,...,...,...
153245,2358083856,teecaake,1,6,34,D,D,D
153246,2651962508,gentle84gentle,0,0,0,D,E,E
153247,993850951885869057,lauwrencegeysk1,0,0,0,D,E,E
153248,1551095715878866944,tceroth,0,0,1,D,E,D


In [12]:
# Keep only the nodes whose user_id appears in the User column (first column) of edges sample 1

User_Nodes_S1_1 = Nodes.loc[Nodes['user_id'].isin(User_List_1)]
User_Nodes_S1_1

Unnamed: 0,user_id,Screen Name,replies_count,retweets_count,likes_count,Replies_C,Retweets_C,Likes_C
1,1527887461271097344,mujangscare,0,0,1,E,E,D
2,1133815701448417280,history_kings,0,3,25,E,D,D
4,1549820637010706436,jonafirst_mays,0,0,0,E,E,E
6,880110959532138496,stbedes1957,0,2,20,E,D,D
49,42935483,_stephaniehaney,1,1,1,D,D,D
...,...,...,...,...,...,...,...,...
153039,54870643,alexcostantini,0,0,0,D,E,E
153078,3439161411,elmonumentalu,0,2,5,D,D,D
153132,1447189626460901384,candice_grasser,0,0,0,D,E,E
153174,1198622281448206337,yee_hawtie,0,1,6,D,D,D


In [13]:
# Same filter for the other two edges samples: keep the nodes whose user_id appears
# in the User column (first column) of each sample

User_Nodes_S2_1, User_Nodes_S3_1 = (
    Nodes.loc[Nodes['user_id'].isin(user_ids)]
    for user_ids in (User_List_2, User_List_3)
)

In [14]:
# Keep only the nodes whose user_id appears in the User_Mentioned column (second column) of edges sample 1

User_Nodes_S1_2 = Nodes.loc[Nodes['user_id'].isin(User_Mentioned_List_1)]
User_Nodes_S1_2

Unnamed: 0,user_id,Screen Name,replies_count,retweets_count,likes_count,Replies_C,Retweets_C,Likes_C
1,1527887461271097344,mujangscare,0,0,1,E,E,D
4,1549820637010706436,jonafirst_mays,0,0,0,E,E,E
5,725349593752424450,wesleyschapel,0,6,9,E,D,D
6,880110959532138496,stbedes1957,0,2,20,E,D,D
9,5830042,73,0,0,0,E,E,E
...,...,...,...,...,...,...,...,...
9644,244234474,ziyandangcobo,0,0,0,D,E,E
9645,327114680,znsbahamas,0,0,0,D,E,E
9646,306108476,zoetheball,0,0,0,D,E,E
9650,949722068446167040,zorosconnection,0,0,0,D,E,E


In [15]:
# Same filter for the other two edges samples: keep the nodes whose user_id appears
# in the User_Mentioned column (second column) of each sample

User_Nodes_S2_2, User_Nodes_S3_2 = (
    Nodes.loc[Nodes['user_id'].isin(mentioned_ids)]
    for mentioned_ids in (User_Mentioned_List_2, User_Mentioned_List_3)
)

In [16]:
# Combine the nodes found through the first and second edge columns of sample 1 into one dataframe
# Rows are stacked vertically (the default axis) and ignore_index=True renumbers them from 0

User_Nodes_S1 = pd.concat((User_Nodes_S1_1, User_Nodes_S1_2), ignore_index=True)
User_Nodes_S1

Unnamed: 0,user_id,Screen Name,replies_count,retweets_count,likes_count,Replies_C,Retweets_C,Likes_C
0,1527887461271097344,mujangscare,0,0,1,E,E,D
1,1133815701448417280,history_kings,0,3,25,E,D,D
2,1549820637010706436,jonafirst_mays,0,0,0,E,E,E
3,880110959532138496,stbedes1957,0,2,20,E,D,D
4,42935483,_stephaniehaney,1,1,1,D,D,D
...,...,...,...,...,...,...,...,...
9945,244234474,ziyandangcobo,0,0,0,D,E,E
9946,327114680,znsbahamas,0,0,0,D,E,E
9947,306108476,zoetheball,0,0,0,D,E,E
9948,949722068446167040,zorosconnection,0,0,0,D,E,E


In [17]:
# Combine the nodes found through the first and second edge columns for samples 2 and 3
# Rows are stacked vertically (the default axis) and ignore_index=True renumbers them from 0

User_Nodes_S2, User_Nodes_S3 = (
    pd.concat(node_parts, ignore_index=True)
    for node_parts in (
        (User_Nodes_S2_1, User_Nodes_S2_2),
        (User_Nodes_S3_1, User_Nodes_S3_2),
    )
)

In [18]:
# Resetting index for list of nodes for the first sample of edges
# NOTE: the pd.concat call that built User_Nodes_S1 already passed ignore_index=True,
# so the index is already 0..n-1 here and this reset leaves the dataframe unchanged

User_Nodes_S1 = User_Nodes_S1.reset_index(drop=True)
User_Nodes_S1

Unnamed: 0,user_id,Screen Name,replies_count,retweets_count,likes_count,Replies_C,Retweets_C,Likes_C
0,1527887461271097344,mujangscare,0,0,1,E,E,D
1,1133815701448417280,history_kings,0,3,25,E,D,D
2,1549820637010706436,jonafirst_mays,0,0,0,E,E,E
3,880110959532138496,stbedes1957,0,2,20,E,D,D
4,42935483,_stephaniehaney,1,1,1,D,D,D
...,...,...,...,...,...,...,...,...
9945,244234474,ziyandangcobo,0,0,0,D,E,E
9946,327114680,znsbahamas,0,0,0,D,E,E
9947,306108476,zoetheball,0,0,0,D,E,E
9948,949722068446167040,zorosconnection,0,0,0,D,E,E


In [19]:
# Give the nodes lists for the second and third edges samples a fresh 0..n-1 index
# (drop=True discards the old labels instead of keeping them as a column)

User_Nodes_S2, User_Nodes_S3 = (
    nodes.reset_index(drop=True)
    for nodes in (User_Nodes_S2, User_Nodes_S3)
)

In [20]:
# Dropping rows with duplicate user ids (a user can appear in both the first and second edge columns,
# whose matching nodes were concatenated)
# subset='user_id' makes the check depend on the user id only, so each user is kept exactly once
# (the first occurrence) even if another row for the same id differs in some other column
# The index is reset afterwards so that the removed rows do not leave gaps in the row numbering

User_Nodes_S1 = User_Nodes_S1.drop_duplicates(subset='user_id').reset_index(drop=True)
User_Nodes_S2 = User_Nodes_S2.drop_duplicates(subset='user_id').reset_index(drop=True)
User_Nodes_S3 = User_Nodes_S3.drop_duplicates(subset='user_id').reset_index(drop=True)

In [21]:
# Exporting the nodes corresponding to each edges sample to CSV files

#User_Nodes_S1.to_csv('Sample_1_Nodes.csv',encoding='utf-8-sig',index=False)
#User_Nodes_S2.to_csv('Sample_2_Nodes.csv',encoding='utf-8-sig',index=False)
#User_Nodes_S3.to_csv('Sample_3_Nodes.csv',encoding='utf-8-sig',index=False)