# 1. Install and import necessary libraries :  

In [71]:
!pip install nltk
!pip install networkx



In [72]:
import pandas as pd
import re
import string
import nltk
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import itertools
import networkx as nx

nltk.download('punkt')  # Required for wordpunct_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('wordnet')  # WordNet for lemmatization
nltk.download('omw-1.4')  # WordNet's dependencies
nltk.download('words')  # For the word corpus
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

words = set(nltk.corpus.words.words())
stop_words = nltk.corpus.stopwords.words("english")
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tweet_tokenizer = nltk.TweetTokenizer()

[nltk_data] Downloading package punkt to C:\Users\sina
[nltk_data]     tavakoli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sina tavakoli\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\sina
[nltk_data]     tavakoli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\sina
[nltk_data]     tavakoli\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to C:\Users\sina
[nltk_data]     tavakoli\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\sina
[nltk_data]     tavakoli\AppData\Roaming\nltk_data...
[nltk_data]   Pack

# 2. Importing data : 

In [73]:
tweets_df = pd.DataFrame(pd.read_csv('D:/other/job/students_project/network_science/TA project/analyzing-Ukraine-war/ukrainewar_full.csv'))
tweets_df.columns

Index(['Unnamed: 0', 'subreddit', 'selftext', 'author_fullname', 'title',
       'upvote_ratio', 'ups', 'created', 'created_utc', 'num_comments',
       'author', 'id'],
      dtype='object')

In [74]:
tweets_filtered = tweets_df.copy() #it's a good idea to work on the copy of original dataframe, so we can always go back to it if we mess something up
column_list = ["id","author","subreddit","title","selftext", "upvote_ratio","num_comments"]
tweets_filtered = tweets_filtered[column_list]

In [75]:
tweets_filtered.dropna(inplace=True)
tweets_filtered.reset_index(inplace=True,drop=True)
tweets_filtered

Unnamed: 0,id,author,subreddit,title,selftext,upvote_ratio,num_comments
0,12aw2q2,ModeratorsOfEurope,europe,War in Ukraine Megathread LIII,\nThis megathread is meant for discussion of t...,0.95,8232
1,10eps9y,ModeratorsOfEurope,europe,War in Ukraine Megathread L,This megathread is meant for discussion of the...,0.96,9524
2,119wltg,ModeratorsOfEurope,europe,War in Ukraine Megathread LII,"This is a special megathread. **One year ago, ...",0.97,8276
3,z3mb0m,BackgroundGold3503,Cursedgunimages,Wtf,What the hell is this russian creation !! #uk...,0.97,15
4,14oijq5,ModeratorsOfEurope,europe,War in Ukraine Megathread LV (55),\nThis megathread is meant for discussion of t...,0.94,4457
...,...,...,...,...,...,...,...
4132,1182qiw,mdkss12,caps,Lucky Guess - Game 59: vs DET,This team without Ovi is *rough*. we'll see ho...,0.75,32
4133,zy6szr,mdkss12,caps,Lucky Guess - Game 37: vs OTT,"Shutout! you love to see it, especially in a d...",0.67,42
4134,12hh318,mdkss12,caps,Lucky Guess - Game 80: vs NYI - Blunder for Be...,Keep on losing and try to move up the draft lo...,0.64,29
4135,11dlwx9,liberty_ukraine,u_liberty_ukraine,Liberty Ukraine in Action!,\n\nLiberty Ukraine in Action! Thank you for ...,1.00,0


# 3. Cleaning text : 

In [76]:
def cleaner(tweet):
    try:
        tweet = re.sub("@[A-Za-z0-9]+","",tweet) # remove mentions
        tweet = re.sub("#[A-Za-z0-9]+", "",tweet) # remove hashtags
        tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
        tweet = " ".join(tweet.split())
        tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) if w.lower() in words and not w.lower() in stop_words)  # remove stop words
        lemma_function = WordNetLemmatizer()
        tweet = " ".join(lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(tweet))) #lemmatize
        tweet = str.lower(tweet) #to lowercase
    except Exception as e:
        print(f"Error processing tweet: {tweet}, error: {e}")
        return ""
    return tweet


In [77]:
tweets_filtered["clean_text"] = tweets_filtered["selftext"].map(cleaner)

In [98]:
tweets_filtered
tweets_filtered.to_csv("D:/other/Mannheim university/Data Mining/final project/additional files/text_filtered.csv")

# 4. Extracting words : 

In [79]:
tweets_filtered.loc[tweets_filtered["clean_text"].isnull(),"clean_text"] = ""

In [80]:
unique_words = {}

for idx, row in tweets_filtered.iterrows():
  if row["clean_text"] != "":
    for word in tweet_tokenizer.tokenize(row["clean_text"]):
      unique_words.setdefault(word,0)
      unique_words[word] += 1

In [81]:
uw_df = pd.DataFrame.from_dict(unique_words, orient='index').reset_index()
uw_df.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
uw_df.sort_values(by=['Count'], ascending=False, inplace=True)
uw_df = uw_df.reset_index().drop(columns=["index"])

In [82]:
uw_df

Unnamed: 0,Word,Count
0,team,9675
1,go,7579
2,scorer,6072
3,game,5574
4,season,5530
...,...,...
1078,criticism,41
1079,praise,41
1080,issue,41
1081,united,41


In [83]:
uw_df.to_csv("D:/other/Mannheim university/Data Mining/final project/additional files/words.csv")

# 5. Extracting the edges : 

### Step 4: Building the network

We are going to use the networkx library, which is a Python library that enables network science analysis of the data.

We are going to use it to create our network and extract edgelist from it, since we can easily import it to Gephi (a software we are going to see in visualization labs).

However, it offers implemented algorithms for analysis (for example PageRank) that you can use out-of-box to analyze your network.

But first, we will loop through our dataframe and connect words and hashtags if they appear together in the same Tweet.

In [84]:
uw = unique_words.keys()  

In [85]:
network = {}
network_key = 0
for index, row in tweets_filtered.iterrows():
    combined_list = [word for word in str.split(row["clean_text"], " ") if word in uw]
    #itertool product creates Cartesian product of each element in the combined list
    for pair in itertools.product(combined_list, combined_list):
        #exclude self-loops and count each pair only once because our graph is undirected and we do not take self-loops into account
        if pair[0]!=pair[1] and not(pair[::-1] in network):
            network.setdefault(pair,0)
            network[pair] += 1 
    
network_df = pd.DataFrame.from_dict(network, orient="index")

In [93]:
network_df.reset_index(inplace=True,drop=True)
network_df.columns = ["pair","weight"]
network_df.sort_values(by="weight",inplace=True, ascending=False)
network_df

Unnamed: 0,pair,weight
0,"(also, link)",22344
1,"(go, team)",21487
2,"(war, also)",21246
3,"(team, scorer)",18810
4,"(also, r)",17724
...,...,...
77680,"(nice, update)",41
77679,"(nice, bonus)",41
77678,"(nice, world)",41
77677,"(nice, cup)",41


In [94]:

up_weighted = []
for edge in network:

    up_weighted.append((edge[0],edge[1],network[edge]))

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [95]:
print(len(G.nodes()))
print(len(G.edges()))

1083
77722


# 6.  Save edgelist : 

In [96]:
filename = "D:/other/Mannheim university/Data Mining/final project/additional files/edgelist.csv"

In [97]:
nx.write_weighted_edgelist(G, filename, delimiter=",")

In [91]:
network_df.to_csv("D:/other/Mannheim university/Data Mining/final project/additional files/edgelist.csv")

# 7. Create and save node list :


In [92]:
word_nodes = pd.DataFrame.from_dict(unique_words,orient="index")
word_nodes.reset_index(inplace=True)
word_nodes["Label"] = word_nodes["index"]
word_nodes.rename(columns={"index":"Id",0:"delete"},inplace=True)
word_nodes = word_nodes.drop(columns=['delete'])

word_nodes
filename = "D:/other/Mannheim university/Data Mining/final project/additional files/nodes.csv"
word_nodes.to_csv("D:/other/Mannheim university/Data Mining/final project/additional files/nodes.csv")