In [1]:
import pandas as pd
from itertools import chain
from collections import Counter
import networkx as nx
import numpy as np
import sklearn
import os, logging, re, gc, time
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from karateclub.community_detection.non_overlapping import GEMSEC, EdMot, SCD

In [120]:
# ---- read original file -------------
def process_raw_data(filesName, _path):
    data = pd.read_csv(_path+filesName+'.csv', index_col=False, dtype=str)
    # ====================
    # ----------- remain only english data & tweet+hashtag ------------
    data = data[data['lang'].apply(lambda x: x=='en')]
    logging.info("number of english data : ", str(len(data)))
    data_with_hashtag = data[data['hashtag'].apply(lambda x: type(x)==str and len(x)>2)]
    logging.info("number of eng+hashtag : ", str(len(data_with_hashtag)))
    del data
    # ====================
    # ---- remove unnecesery columns --------------
    data_with_hashtag.drop(['url', 'replyCount', 'lang','retweetCount', 'likeCount',
                            'quoteCount', 'conversationId','mentionedUsers', 'time'],axis=1, inplace=True)
    #=====================
    # ------------- preprocessing -----------------
    url_pattern = r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))';
    url_pattern2 = r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)'

    data_with_hashtag['cleaned'] = data_with_hashtag['content'].str.replace(r'@\w+', '')
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(url_pattern2, '')
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('#', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('()', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('(', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(')', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(':', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('\n', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('\r', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('!', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('&', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('*', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(r'\d+', ' ')
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('$', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('-', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(':', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(',', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(';', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace("'", ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(".", ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('"', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('  ', ' ', regex=False)
    data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace('  ', ' ', regex=False)
    # ===================
    data_with_hashtag.to_csv(_path+"cleaned_"+filesName+".csv", index=False)

In [3]:
def load_gensim_model():
    model = gensim.models.KeyedVectors.load_word2vec_format('/hdd/crawl-300d-2M.vec')
    return model

In [4]:
def cal_tweet_vec(main_df):
    tweet_vec = list()
    status_tweet = list()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    for i in range(len(main_df)):
        tweet_matrix = list()
        tweet = main_df.iloc[i]['cleaned']

        tweet_tokens = tokenizer.tokenize(tweet)

        for word in tweet_tokens:
            if word not in stopwords_english:
                try:
                    tweet_matrix.append(model[word].tolist())
                except:
                    #print(word)
                    pass
        if len(tweet_matrix):
            tweet_vec.append(np.array(tweet_matrix).mean(axis=0))
            status_tweet.append("good")
        else:
            tweet_vec.append(-1)
            status_tweet.append("bad")
    
    return tweet_vec, status_tweet

In [5]:
def cal_min_max_median(val):
    ll = list()
    for i in range(val):
        for j in range(i+1, val):
            ll.append(dist[i,j])
    print("max : ",max(ll))
    print("min : ",min(ll))
    _median = np.median(np.array(ll))
    print("median : ",_median)
    return _median

In [6]:
def get_id_unique_hashtag(df):
    hashtags = list()
    hashtags_tmp = df['hashtag'].values.tolist()
    for i in hashtags_tmp:
        hashtags.append(i.replace(" ", "").split(","))
    hashtags = list(chain(*hashtags))
    print("all hashtags :", len(hashtags))

    unique_hashtag = list(set(hashtags))
    print("unique hashtags :", len(unique_hashtag))
    
    ids = df['id'].values.tolist()
    print("all ids:", len(ids))
    
    return ids, unique_hashtag

In [22]:
def generate_graph(Graph, ids,  unique_hashtag, df, i_iloc, j_iloc):
    #----------- add ids nodes -------------
    for _id in ids:
        Graph.add_node(_id)
    print("1- nodes : ", Graph.number_of_nodes())
#     --------- add hashtag nodes -----------
    for _hashtag in unique_hashtag:
        Graph.add_node(_hashtag)
    print("2- nodes : ", Graph.number_of_nodes())
#     --------- add edges between hashtag and id -----------
    for _, i in df.iterrows():
        for _h in i['hashtag'].replace(" ","").split(","):
            Graph.add_edge(i['id'], _h, weight=1) # ---------------------- < -----  hsahtag_weight=1
    print("just id-hashtag edges : ", Graph.number_of_edges())
#     --------- add edges between id and id with weight -----------
    f = open(_path_original_files + 'edges', 'w')
    for ii, jj in zip(i_iloc, j_iloc):
        if (ii != jj and jj < ii):
            print(translator[ii], translator[jj], dist[ii, jj], file=f)
            w = 2 - dist[ii, jj] # --------------------------------------- < ------ max_idid_weight=2
            Graph.add_edge(df.iloc[ii]['id'],df.iloc[jj]['id'],weight=w) 
    f.close()
    print("all edges : ", Graph.number_of_edges())
    print("isoated nodes : ", list(nx.isolates(Graph)))

In [8]:
# Function to sort the list of tuples by its second item <---* Update
def Sort_Tuple(tup):
    lst = len(tup)  
    for i in range(0, lst):  
        for j in range(0, lst-i-1):  
            if (tup[j][1] > tup[j + 1][1]):  
                temp = tup[j]  
                tup[j]= tup[j + 1]  
                tup[j + 1]= temp  
    return tup  

In [9]:
_path_original_files = '/home/mahdi/Desktop/nc-final-project/files_twitter/'
fileNames = ["near_washington_1015_1016_tweets", "near_washington_1016_1017_tweets",
             "near_washington_1017_1018_tweets", "near_washington_1018_1019_tweets",
             "near_washington_1019_1022_tweets"]
print("Cleaning "+str(len(fileNames))+" files ... ", end=" ")
for f in fileNames[:1]:
    process_raw_data(f, _path_original_files)
print("done")
model = load_gensim_model()
nltk.download('stopwords')
stopwords_english = set(stopwords.words('english'))
print(stopwords_english)

Cleaning 5 files ...  

  data_with_hashtag['cleaned'] = data_with_hashtag['content'].str.replace(r'@\w+', '')
  data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(url_pattern2, '')
  data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(r'\d+', ' ')


done
{'hadn', 't', 'this', 'shan', 'your', 'through', 'how', 'these', 'or', 'same', 'under', 'out', 'own', 'for', "wasn't", 'am', 'at', 'so', 'yours', 'had', 'whom', 'ma', 'should', 'doesn', "you'll", 's', "weren't", 'wasn', 'don', 'being', "couldn't", 'both', 'an', 'been', 've', 'in', 'it', 'themselves', 'again', 'he', 'which', 'has', 'theirs', "isn't", "hadn't", 'itself', 'of', 'there', "needn't", 'more', 'here', 'from', 'is', 'once', 'all', 'after', "she's", 'herself', 'until', 'his', "doesn't", 'mightn', "you'd", 'than', 'no', 'further', 'each', 'why', 'other', 'against', 'such', 'its', 'when', 'can', 'd', 'a', 'only', 'not', 'nor', 'what', 'because', 'by', 'just', "mightn't", 'having', 'their', 're', 'few', "don't", 'were', 'ain', 'most', 'during', 'myself', 'about', "mustn't", 'her', 'above', "you've", 'ourselves', 'that', 'then', 'while', 'hers', 'do', 'yourself', 'but', 'as', 'into', "aren't", "shan't", 'won', 'couldn', 'didn', 'have', "it's", 'too', 'was', 'if', 'o', 'yourselv

[nltk_data] Downloading package stopwords to /home/mahdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [123]:
for f in fileNames[:1]:
    process_raw_data(f, _path_original_files)

  data_with_hashtag['cleaned'] = data_with_hashtag['content'].str.replace(r'@\w+', '')
  data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(url_pattern2, '')
  data_with_hashtag['cleaned'] = data_with_hashtag['cleaned'].str.replace(r'\d+', ' ')


In [125]:
for _,_, f in os.walk(_path_original_files):
    pass
f = [f'cleaned_{i}.csv' for i in fileNames]
f = ['cleaned_near_washington_1015_1016_tweets.csv']

In [11]:
#fileName = 'cleaned_near_washington_1018_1019_tweets.csv'
print(f)
fileName = f[0]
# for fileName in f:
print(10*"=.=")
print(fileName)
main_df = pd.read_csv(_path_original_files+fileName, index_col=False, dtype=str)

grp_maindf = main_df.groupby('date')
_, idate = list(grp_maindf)[0]
# for _, idate in grp_maindf:
t0 = time.time()
print(10*'-.-.')
df = idate
#df = main_df
df = df.reset_index(drop=True)
_date = df.iloc[0]['date']
print(df.shape[0])
all_tweet_vec, all_status_tweet = cal_tweet_vec(df)
_df_tmp = pd.DataFrame()
_df_tmp['vec'] = all_tweet_vec
_df_tmp['status'] = all_status_tweet
tweet_vec = _df_tmp[_df_tmp['status']=='good']['vec'].values.tolist()
dist = sklearn.metrics.pairwise_distances(tweet_vec, tweet_vec, n_jobs=8)
dist = np.array(dist)
_median = cal_min_max_median(round(dist.shape[0]/2))
#i_iloc, j_iloc = np.where(dist<_median/4)
i_iloc, j_iloc = np.where(dist<_median/3)
# i_iloc, j_iloc = np.where(dist<_median/2)
# i_iloc, j_iloc = np.where(dist<_median)
dfgood = df[_df_tmp['status']=='good']


['cleaned_near_washington_1015_1016_tweets.csv']
=.==.==.==.==.==.==.==.==.==.=
cleaned_near_washington_1015_1016_tweets.csv
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.
33074
max :  8.638947300732344
min :  0.0
median :  2.132292216232245


In [145]:
translator = {}
df_all = pd.read_csv(_path_original_files+fileName[len('cleaned_'):], index_col=False, dtype=str)
for i in range(len(dfgood)):
    translator[i] = int(dfgood.iloc[i]['Unnamed: 0'])
edges_original = {}
for ii, jj in zip(i_iloc, j_iloc):
    if ii == jj:
        continue
    a = translator[int(ii)]
    b = translator[int(jj)]
    if a > b:
        a, b = b, a
    edges_original[(a, b)] = dist[ii, jj]
            
edges_cpp = {(int(line.split()[0]), int(line.split()[1])): float(line.split()[2]) for line in open(_path_original_files + 'edges_c').readlines()}
edges_removed = set(edges_original) - set(edges_cpp)
edges_added = set(edges_cpp) - set(edges_original)
print(len(edges_removed), 'edges removed')
print(len(edges_added), 'edges added')
eps = 1e-3
weight_different = [k for k in (set(edges_cpp).intersection(edges_original)) if abs(edges_cpp[k] - edges_original[k]) > eps][1:]
print(len(weight_different), 'edges have different weight')
print(len(set(edges_cpp).intersection(edges_original)), 'common')


#del model, _df_tmp, tweet_vec, all_status_tweet, all_tweet_vec, df
#gc.collect()

89 edges removed
117 edges added
104 edges different weight
8329 common
